SIS
Symmetric Index Structures
/Users/dbr/ma/src/lmu/cis/sis/adapter/ScdawgAdapter.cpp
Go to the documentation of this file.
00001 #ifndef SCDAWGADAPTERADAPTER_CPP
00002 #define SCDAWGADAPTERADAPTER_CPP
00003 
00004 #include "ScdawgAdapter.hpp"
00005 
00006 namespace lmu { namespace cis { namespace sis {
00007 
00008 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00009 UINT SCDAWGAdapter::Add( const VoidSequence * sharpDocumentDollar ) {
00010     if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::Add( VoidSequence * sharpDocumentDollar )" << std::endl;
00011 
00012     assert (ManagedAutomatonStage_ < CLOSED);
00013 
00014     /* We exceptionally need to cast the const away */
00015     VoidSequence * nc_sharpDocumentDollar = const_cast<VoidSequence*>(sharpDocumentDollar);
00016 
00017     /* Shamelessly stolen from scdawg.c because we need the last
00018        created sinkstate per document in the right automaton
00019        but the original method doesn't provide it.
00020     */
00021     UINT i, numberOfStates, numberOfNewStates, sinkstate;
00022 
00024     numberOfStates = help()->left->aut->statesTransitions->seqStored;
00025     sinkstate = CDAWGAdd( help()->right, nc_sharpDocumentDollar );
00026 
00027     assert(sinkstate != NO);
00028 
00029     VoidSequenceReverse( nc_sharpDocumentDollar );
00030     CDAWGAdd( help()->left, nc_sharpDocumentDollar );
00031 
00033     numberOfNewStates = help()->left->aut->statesTransitions->seqStored;
00034     for( i = numberOfStates; i < numberOfNewStates; i++ ){
00035         UINTSequenceAdd( help()->leftToRight, NO );
00036     }
00037     help()->leftToRight->seq[help()->left->aut->initialState] = help()->right->aut->initialState;
00038     for( i = numberOfStates; i < numberOfNewStates; i++ ){
00039         if( help()->leftToRight->seq[i] == NO ){
00040             help()->leftToRight->seq[i] = FindRightState( i );
00041         }
00042     }
00043     return sinkstate;
00044 }
00045 
00046 
00047 UINT SCDAWGAdapter::FindRightState( UINT l ) {
00048     /* verbatim from scdawg.c (function is static there) */
00049     TransitionFromStartTo tfst;
00050     UINT q, tr;
00051 
00052     q = help()->left->statesSuffixLink->seq[l];
00053     if( help()->leftToRight->seq[q] == NO ){
00054         help()->leftToRight->seq[q] = FindRightState( q );
00055     }
00056     tfst.from = help()->leftToRight->seq[q];
00057     tfst.symbol = mVoidSequenceElement( mLeft()->data, mLeft()->statesEnd->seq[l] - help()->left->statesLength->seq[q] );
00058     tr = HashGetIndex( help()->right->hash, &tfst );
00059 
00060     return mRight()->transitionsTo->seq[tr];
00061 }
00062 
00063 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00064 SCDAWG * SCDAWGAdapter::Read( FILE * fp ) {
00065     if (DEBUG_LEVEL >= 5) std::cerr << "SCDAWG * SCDAWGAdapter::Read( FILE * fp )" << std::endl;
00066 
00067     C_SCDAWG = SCDAWGRead( fp );
00068 
00069     ManagedAutomatonStage_ = CLOSED;
00070 
00071     return C_SCDAWG;
00072 }
00073 
00074 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00075 SCDAWG * SCDAWGAdapter::Read( const S8* fn ) {
00076     if (DEBUG_LEVEL >= 5) std::cerr << "T * TAdapter::Read( FILE * fp )" << std::endl;
00077 
00078     FILE* fpIn = Fopen(fn, "rb");
00079     Read( fpIn );   //< will write into member SCDAWG* C_SCDAWG
00080     Fclose(fpIn);
00081 
00082     ManagedAutomatonStage_ = CLOSED;
00083 
00084     return C_SCDAWG;
00085 }
00086 
00087 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00088 void SCDAWGAdapter::Close() {
00089     if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::Close()" << std::endl;
00090 
00091     if ( ManagedAutomatonStage_ >= SORTED) {
00092         VoidSequenceFree( mLeft()->data );
00093         mLeft()->data = NULL;
00094         ManagedAutomatonStage_ = CLOSED;
00095         return;
00096     }
00097     if ( ManagedAutomatonStage_ < CLOSED ) {
00098         SCDAWGClose( C_SCDAWGBuildHelp );
00099         // C_SCDAWGBuildHelp = nullptr;
00100         ManagedAutomatonStage_ = CLOSED;
00101         return;
00102     }
00103 }
00104 
00105 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00106 void SCDAWGAdapter::Shrink() {
00107     if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::Shrink()" << std::endl;
00108 
00109     assert (ManagedAutomatonStage_ < CLOSED);
00110 
00111     SCDAWGShrink( C_SCDAWG );
00112 }
00113 
00114 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00115 void SCDAWGAdapter::Write( FILE * fp ) {
00116     if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::Write( FILE * fp )" << std::endl;
00117 
00118     assert (ManagedAutomatonStage_ == CLOSED);
00119 
00120     SCDAWGWrite( C_SCDAWG, fp );
00121 }
00122 
00123 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00124 void SCDAWGAdapter::Write( const S8* fn ) {
00125     if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::Write(  const S8* fn )" << std::endl;
00126 
00127     assert (ManagedAutomatonStage_ == CLOSED);
00128 
00129     FILE * fpOut = Fopen(fn, "wb");
00130     SCDAWGWrite( C_SCDAWG, fpOut );
00131     Fclose(fpOut);
00132 }
00133 
00134 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00135 void SCDAWGAdapter::DumpStat( FILE * fp ) const {
00136     if (DEBUG_LEVEL >= 5) std::cerr << "virtual void SCDAWGAdapter::DumpStat( FILE * fp ) const" << std::endl;
00137 
00138     assert (ManagedAutomatonStage_ >= SORTED);
00139 
00140     SCDAWGDumpStat( C_SCDAWG, fp );
00141 }
00142 
00143 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00144 void SCDAWGAdapter::DumpStat( const S8* fn ) const {
00145     if (DEBUG_LEVEL >= 5) std::cerr << "virtual void SCDAWGAdapter::DumpStat( const S8* fn ) const" << std::endl;
00146 
00147     assert (ManagedAutomatonStage_ >= SORTED);
00148 
00149     FILE * fpOut;
00150     try {
00151         fpOut = Fopen(fn, "wb");
00152     } catch (...) {
00153         std::cerr << "Could not open file " << fn << std::endl;
00154     }
00155     SCDAWGDumpStat( C_SCDAWG, fpOut );
00156     Fclose(fpOut);
00157 }
00158 
00159 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00160 void SCDAWGAdapter::DumpGV( const S8 * fn, UINT encoding ) const {
00161     if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::DumpGV( FILE * fp, void (*DumpLabel)(const SCDAWG *, FILE *, UINT, boolean, UINT), UINT encoding ) const" << std::endl;
00162 
00163     assert (ManagedAutomatonStage_ >= SORTED);
00164 
00165     FILE * fpOut;
00166     try {
00167         fpOut = Fopen(fn, "wb");
00168     } catch (...) {
00169         std::cerr << "Could not open file " << fn << std::endl;
00170     }
00171     SCDAWGDumpGV( C_SCDAWG, fpOut, DumpSCDAWGLabel, encoding );
00172     Fclose(fpOut);
00173 }
00174 
00175 void SCDAWGAdapter::DumpGVRight( const S8 * fn, UINT encoding) const {
00176     if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::DumpGV( FILE * fp, void (*DumpLabel)(const SCDAWG *, FILE *, UINT, boolean, UINT), UINT encoding ) const" << std::endl;
00177 
00178     assert (ManagedAutomatonStage_ >= SORTED);
00179 
00180     FILE * fpOut;
00181     try {
00182         fpOut = Fopen(fn, "wb");
00183     } catch (...) {
00184         std::cerr << "Could not open file " << fn << std::endl;
00185     }
00186     CompressedAutomatonDumpGV( C_SCDAWG->right, fpOut, DumpLabel, encoding );
00187     Fclose(fpOut);
00188 }
00189 
00190 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00191 void SCDAWGAdapter::AddTarjanTable() {
00192     if (DEBUG_LEVEL >= 5) std::cerr << "virtual void SCDAWGAdapter::AddTarjanTable()" << std::endl;
00193 
00194     assert (ManagedAutomatonStage_ < TARJANTABLE);
00195 
00196     SCDAWGAddTarjanTable( C_SCDAWG );
00197 }
00198 
00199 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00200 void SCDAWGAdapter::GenerateLanguage( const S8* fnOut, UINT encoding ) const {
00201     if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::GenerateLanguage( const S8* fn, UINT encoding = ENCODING_PLAIN ) const" << std::endl;
00202 
00203     assert (ManagedAutomatonStage_ >= SORTED);
00204 
00205     SCDAWG* aut = C_SCDAWG;
00206 
00207     // ->GenerateLanguage( fnOut, encoding );
00208     FILE * fpOutput = Fopen( fnOut, "wb" );
00209     CompressedAutomatonGenerateLanguage( C_SCDAWG->right, fpOutput, DumpSequenceOfLabels, encoding );
00210     Fclose( fpOutput );
00211 }
00212 
00213 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00214 void SCDAWGAdapter::SortTransitions() {
00215     if (DEBUG_LEVEL >= 5) std::cerr << "SCDAWGAdapter::SortTransitions()" << std::endl;
00216 
00217     assert( ManagedAutomatonStage_ > EMPTY && ManagedAutomatonStage_ < CLOSED );
00218     assert( C_SCDAWGBuildHelp != nullptr );
00219 
00220     if (ManagedAutomatonStage_ < SORTED) {
00221         if (DEBUG_LEVEL >= 3) std::cerr << "Sorting." << std::endl;
00222         UINT i, j, s;
00223         CompressedAutomatonSortTransitions( mRight(), help()->right->transitionsFrom );
00224 
00225         for( i = 0; i < mCompressedAutomatonGetTransitionsStored(mLeft()); i++ ){
00226             s = mLeft()->transitionsTo->seq[i];
00227             j = mLeft()->statesEnd->seq[s] - mLeft()->transitionsStart->seq[i];
00228             s = help()->leftToRight->seq[s];
00229             mLeft()->transitionsStart->seq[i] = (mRight()->statesEnd->seq[s] + 1) - help()->right->statesLength->seq[s] + j;
00230         }
00231         for( i = 0; i < mCompressedAutomatonGetTransitionsStored(mLeft()); i++ ){
00232             help()->left->transitionsFrom->seq[i] = help()->leftToRight->seq[help()->left->transitionsFrom->seq[i]];
00233             mLeft()->transitionsTo->seq[i] = help()->leftToRight->seq[mLeft()->transitionsTo->seq[i]];
00234         }
00235         for( i = 0; i < mCompressedAutomatonGetStatesStored( mLeft() ); i++ ){
00236             if( help()->right->statesLength->seq[i] != 0 ){
00237                 mLeft()->statesEnd->seq[i] = (mRight()->statesEnd->seq[i] + 1) - help()->right->statesLength->seq[i];
00238             }
00239         }
00240         // VoidSequenceFree( mLeft()->data ); mLeft()->data = mRight()->data;
00241         // CompressedAutomatonSortTransitions( mLeft(), help()->left->transitionsFrom );
00242         // mLeft()->data = NULL;
00243     } else {
00244         if (DEBUG_LEVEL >= 3) std::cerr << "Automaton sorted already." << std::endl;
00245     }
00246     ManagedAutomatonStage_ = SORTED;
00247 }
00248 
00249 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00250 UINT SCDAWGAdapter::suffixLink( const UINT& state ) const {
00251     if (DEBUG_LEVEL >= 5) std::cerr << "UINT SCDAWGAdapter::suffixLink( const UINT& state ) const" << std::endl;
00252     return buildhelp()->right->statesSuffixLink->seq[state];
00253 }
00254 
00255 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00256 CompressedAutomaton* SCDAWGAdapter::automaton() const {
00257     if (DEBUG_LEVEL >= 5) std::cerr << "CompressedAutomaton* SCDAWGAdapter::automaton() const" << std::endl;
00258     return C_SCDAWG->right;
00259 }
00260 
00261 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00262 CompressedAutomaton* SCDAWGAdapter::left_automaton() const {
00263     if (DEBUG_LEVEL >= 5) std::cerr << "SCDAWGAdapter::left_automaton() const" << std::endl;
00264     return C_SCDAWG->left;
00265 }
00266 
00267 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00268 PairState SCDAWGAdapter::GetInitialPairState( ) const {
00269     if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline PairState SCDAWGAdapter::GetInitialPairState( ) const" << std::endl;
00270 
00271     assert (ManagedAutomatonStage_ > EMPTY);
00272 
00273     return CompressedAutomatonGetInitialPairState( automaton() );
00274 }
00275 
00276 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00277 SCDAWGBuildHelp* SCDAWGAdapter::buildhelp() const {
00278     assert (ManagedAutomatonStage_ < CLOSED);
00279 
00280     return C_SCDAWGBuildHelp;
00281 }
00282 
00283 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00284 PairState SCDAWGAdapter::Delta( const PairState * state, const void * symbol ) const {
00285     assert (ManagedAutomatonStage_ >= SORTED);
00286 
00287     return CompressedAutomatonDelta( automaton(), state, symbol );
00288 }
00289 
00290 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00291 void SCDAWGAdapter::BuildSCDAWG( const S8 * fileNameInput,
00292                                  const S8 * fileNameOutput,
00293                                  UINT bitsPerSymbol,
00294                                  UINT encoding,
00295                                  boolean tarjanTable )
00296 {   //TODO?
00297     FILE * fpInput;
00298     FILE * fpOutput;
00299     VoidSequence * line = NULL;
00300     VoidSequence * sharpDocumentDollar = NULL;
00301 
00302     // #ifndef __cplusplus
00303     SCDAWG * aut;
00304     SCDAWGBuildHelp * help;
00305     // #endif
00306 
00307     UINT i, n, nLine, symbolSize;
00308     mSymbolAndVariables( zero )
00309     mSymbolAndVariables( one )
00310 
00311     mValidateBitsPerSymbol( bitsPerSymbol )
00312     symbolSize = bitsPerSymbol/8;
00313     mSymbolAssignValue( zero, 0, symbolSize )
00314     mSymbolAssignValue( one, 1, symbolSize )
00315 
00316     #ifndef __cplusplus
00317     aut = SCDAWGInit( symbolSize );
00318     help = SCDAWGBuildHelpInit( aut );
00319     #else
00320     aut = C_SCDAWG;
00321     help = C_SCDAWGBuildHelp;
00322     #endif
00323 
00324     fpInput = Fopen( fileNameInput, "rb" );
00325     nLine = 0;
00326     while( bTRUE ){
00327         if( line != NULL ){
00328             VoidSequenceFree( line );
00329         }
00330         line = VoidSequenceReadLine( fpInput, symbolSize, encoding );
00331         if( line == NULL ){
00332             break;
00333         }
00334         nLine++;
00335         for( i = 0; i < line->seqStored; i++ ){
00336             n = SymbolToUINT( mVoidSequenceElement(line, i), symbolSize );
00337             if( n == 0 || n == 1 ){
00338                 S8 msg[MAX_INPUT_STRING_SIZE];
00339 
00340                 sprintf( msg, "Cannot process line %llu - 0 and 1 are reserved as new symbols. (pos is %d, sign is %d)", (U64)(nLine), i, n );
00341                 Throw( msg );
00342             }
00343         }
00344         sharpDocumentDollar = VoidSequenceInit2( symbolSize, line->seqStored + 2, 1 );
00345         VoidSequenceAdd( sharpDocumentDollar, one );
00346         VoidSequenceAppend( sharpDocumentDollar, line );
00347         VoidSequenceAdd( sharpDocumentDollar, zero );
00348         SCDAWGAdd( help, sharpDocumentDollar );
00349         VoidSequenceFree( sharpDocumentDollar );
00350     }
00351     Fclose( fpInput );
00352     SCDAWGClose( help );
00353     SCDAWGShrink( aut );
00354     SCDAWGBuildHelpFree( help );
00355     if( tarjanTable ){
00356         SCDAWGAddTarjanTable( aut );
00357     }
00358     fpOutput = Fopen( fileNameOutput, "wb" );
00359     SCDAWGWrite( aut, fpOutput );
00360     Fclose( fpOutput );
00361     SCDAWGFree( aut );
00362 }
00363 
00364 }}} /* End of namespace lmu::cis::sis */
00365 
00366 #endif /* end of include guard: SCDAWGADAPTERADAPTER_CPP */