SIS
Symmetric Index Structures
|
00001 #ifndef SCDAWGADAPTERADAPTER_CPP 00002 #define SCDAWGADAPTERADAPTER_CPP 00003 00004 #include "ScdawgAdapter.hpp" 00005 00006 namespace lmu { namespace cis { namespace sis { 00007 00008 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00009 UINT SCDAWGAdapter::Add( const VoidSequence * sharpDocumentDollar ) { 00010 if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::Add( VoidSequence * sharpDocumentDollar )" << std::endl; 00011 00012 assert (ManagedAutomatonStage_ < CLOSED); 00013 00014 /* We exceptionally need to cast the const away */ 00015 VoidSequence * nc_sharpDocumentDollar = const_cast<VoidSequence*>(sharpDocumentDollar); 00016 00017 /* Shamelessly stolen from scdawg.c because we need the last 00018 created sinkstate per document in the right automaton 00019 but the original method doesn't provide it. 00020 */ 00021 UINT i, numberOfStates, numberOfNewStates, sinkstate; 00022 00024 numberOfStates = help()->left->aut->statesTransitions->seqStored; 00025 sinkstate = CDAWGAdd( help()->right, nc_sharpDocumentDollar ); 00026 00027 assert(sinkstate != NO); 00028 00029 VoidSequenceReverse( nc_sharpDocumentDollar ); 00030 CDAWGAdd( help()->left, nc_sharpDocumentDollar ); 00031 00033 numberOfNewStates = help()->left->aut->statesTransitions->seqStored; 00034 for( i = numberOfStates; i < numberOfNewStates; i++ ){ 00035 UINTSequenceAdd( help()->leftToRight, NO ); 00036 } 00037 help()->leftToRight->seq[help()->left->aut->initialState] = help()->right->aut->initialState; 00038 for( i = numberOfStates; i < numberOfNewStates; i++ ){ 00039 if( help()->leftToRight->seq[i] == NO ){ 00040 help()->leftToRight->seq[i] = FindRightState( i ); 00041 } 00042 } 00043 return sinkstate; 00044 } 00045 00046 00047 UINT SCDAWGAdapter::FindRightState( UINT l ) { 00048 /* verbatim from scdawg.c (function is static there) */ 00049 TransitionFromStartTo tfst; 00050 UINT q, tr; 00051 00052 q = help()->left->statesSuffixLink->seq[l]; 00053 if( help()->leftToRight->seq[q] == NO ){ 00054 help()->leftToRight->seq[q] = FindRightState( q ); 00055 } 00056 tfst.from = help()->leftToRight->seq[q]; 00057 tfst.symbol = mVoidSequenceElement( mLeft()->data, mLeft()->statesEnd->seq[l] - help()->left->statesLength->seq[q] ); 00058 tr = HashGetIndex( help()->right->hash, &tfst ); 00059 00060 return mRight()->transitionsTo->seq[tr]; 00061 } 00062 00063 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00064 SCDAWG * SCDAWGAdapter::Read( FILE * fp ) { 00065 if (DEBUG_LEVEL >= 5) std::cerr << "SCDAWG * SCDAWGAdapter::Read( FILE * fp )" << std::endl; 00066 00067 C_SCDAWG = SCDAWGRead( fp ); 00068 00069 ManagedAutomatonStage_ = CLOSED; 00070 00071 return C_SCDAWG; 00072 } 00073 00074 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00075 SCDAWG * SCDAWGAdapter::Read( const S8* fn ) { 00076 if (DEBUG_LEVEL >= 5) std::cerr << "T * TAdapter::Read( FILE * fp )" << std::endl; 00077 00078 FILE* fpIn = Fopen(fn, "rb"); 00079 Read( fpIn ); //< will write into member SCDAWG* C_SCDAWG 00080 Fclose(fpIn); 00081 00082 ManagedAutomatonStage_ = CLOSED; 00083 00084 return C_SCDAWG; 00085 } 00086 00087 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00088 void SCDAWGAdapter::Close() { 00089 if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::Close()" << std::endl; 00090 00091 if ( ManagedAutomatonStage_ >= SORTED) { 00092 VoidSequenceFree( mLeft()->data ); 00093 mLeft()->data = NULL; 00094 ManagedAutomatonStage_ = CLOSED; 00095 return; 00096 } 00097 if ( ManagedAutomatonStage_ < CLOSED ) { 00098 SCDAWGClose( C_SCDAWGBuildHelp ); 00099 // C_SCDAWGBuildHelp = nullptr; 00100 ManagedAutomatonStage_ = CLOSED; 00101 return; 00102 } 00103 } 00104 00105 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00106 void SCDAWGAdapter::Shrink() { 00107 if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::Shrink()" << std::endl; 00108 00109 assert (ManagedAutomatonStage_ < CLOSED); 00110 00111 SCDAWGShrink( C_SCDAWG ); 00112 } 00113 00114 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00115 void SCDAWGAdapter::Write( FILE * fp ) { 00116 if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::Write( FILE * fp )" << std::endl; 00117 00118 assert (ManagedAutomatonStage_ == CLOSED); 00119 00120 SCDAWGWrite( C_SCDAWG, fp ); 00121 } 00122 00123 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00124 void SCDAWGAdapter::Write( const S8* fn ) { 00125 if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::Write( const S8* fn )" << std::endl; 00126 00127 assert (ManagedAutomatonStage_ == CLOSED); 00128 00129 FILE * fpOut = Fopen(fn, "wb"); 00130 SCDAWGWrite( C_SCDAWG, fpOut ); 00131 Fclose(fpOut); 00132 } 00133 00134 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00135 void SCDAWGAdapter::DumpStat( FILE * fp ) const { 00136 if (DEBUG_LEVEL >= 5) std::cerr << "virtual void SCDAWGAdapter::DumpStat( FILE * fp ) const" << std::endl; 00137 00138 assert (ManagedAutomatonStage_ >= SORTED); 00139 00140 SCDAWGDumpStat( C_SCDAWG, fp ); 00141 } 00142 00143 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00144 void SCDAWGAdapter::DumpStat( const S8* fn ) const { 00145 if (DEBUG_LEVEL >= 5) std::cerr << "virtual void SCDAWGAdapter::DumpStat( const S8* fn ) const" << std::endl; 00146 00147 assert (ManagedAutomatonStage_ >= SORTED); 00148 00149 FILE * fpOut; 00150 try { 00151 fpOut = Fopen(fn, "wb"); 00152 } catch (...) { 00153 std::cerr << "Could not open file " << fn << std::endl; 00154 } 00155 SCDAWGDumpStat( C_SCDAWG, fpOut ); 00156 Fclose(fpOut); 00157 } 00158 00159 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00160 void SCDAWGAdapter::DumpGV( const S8 * fn, UINT encoding ) const { 00161 if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::DumpGV( FILE * fp, void (*DumpLabel)(const SCDAWG *, FILE *, UINT, boolean, UINT), UINT encoding ) const" << std::endl; 00162 00163 assert (ManagedAutomatonStage_ >= SORTED); 00164 00165 FILE * fpOut; 00166 try { 00167 fpOut = Fopen(fn, "wb"); 00168 } catch (...) { 00169 std::cerr << "Could not open file " << fn << std::endl; 00170 } 00171 SCDAWGDumpGV( C_SCDAWG, fpOut, DumpSCDAWGLabel, encoding ); 00172 Fclose(fpOut); 00173 } 00174 00175 void SCDAWGAdapter::DumpGVRight( const S8 * fn, UINT encoding) const { 00176 if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::DumpGV( FILE * fp, void (*DumpLabel)(const SCDAWG *, FILE *, UINT, boolean, UINT), UINT encoding ) const" << std::endl; 00177 00178 assert (ManagedAutomatonStage_ >= SORTED); 00179 00180 FILE * fpOut; 00181 try { 00182 fpOut = Fopen(fn, "wb"); 00183 } catch (...) { 00184 std::cerr << "Could not open file " << fn << std::endl; 00185 } 00186 CompressedAutomatonDumpGV( C_SCDAWG->right, fpOut, DumpLabel, encoding ); 00187 Fclose(fpOut); 00188 } 00189 00190 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00191 void SCDAWGAdapter::AddTarjanTable() { 00192 if (DEBUG_LEVEL >= 5) std::cerr << "virtual void SCDAWGAdapter::AddTarjanTable()" << std::endl; 00193 00194 assert (ManagedAutomatonStage_ < TARJANTABLE); 00195 00196 SCDAWGAddTarjanTable( C_SCDAWG ); 00197 } 00198 00199 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00200 void SCDAWGAdapter::GenerateLanguage( const S8* fnOut, UINT encoding ) const { 00201 if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline void SCDAWGAdapter::GenerateLanguage( const S8* fn, UINT encoding = ENCODING_PLAIN ) const" << std::endl; 00202 00203 assert (ManagedAutomatonStage_ >= SORTED); 00204 00205 SCDAWG* aut = C_SCDAWG; 00206 00207 // ->GenerateLanguage( fnOut, encoding ); 00208 FILE * fpOutput = Fopen( fnOut, "wb" ); 00209 CompressedAutomatonGenerateLanguage( C_SCDAWG->right, fpOutput, DumpSequenceOfLabels, encoding ); 00210 Fclose( fpOutput ); 00211 } 00212 00213 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00214 void SCDAWGAdapter::SortTransitions() { 00215 if (DEBUG_LEVEL >= 5) std::cerr << "SCDAWGAdapter::SortTransitions()" << std::endl; 00216 00217 assert( ManagedAutomatonStage_ > EMPTY && ManagedAutomatonStage_ < CLOSED ); 00218 assert( C_SCDAWGBuildHelp != nullptr ); 00219 00220 if (ManagedAutomatonStage_ < SORTED) { 00221 if (DEBUG_LEVEL >= 3) std::cerr << "Sorting." << std::endl; 00222 UINT i, j, s; 00223 CompressedAutomatonSortTransitions( mRight(), help()->right->transitionsFrom ); 00224 00225 for( i = 0; i < mCompressedAutomatonGetTransitionsStored(mLeft()); i++ ){ 00226 s = mLeft()->transitionsTo->seq[i]; 00227 j = mLeft()->statesEnd->seq[s] - mLeft()->transitionsStart->seq[i]; 00228 s = help()->leftToRight->seq[s]; 00229 mLeft()->transitionsStart->seq[i] = (mRight()->statesEnd->seq[s] + 1) - help()->right->statesLength->seq[s] + j; 00230 } 00231 for( i = 0; i < mCompressedAutomatonGetTransitionsStored(mLeft()); i++ ){ 00232 help()->left->transitionsFrom->seq[i] = help()->leftToRight->seq[help()->left->transitionsFrom->seq[i]]; 00233 mLeft()->transitionsTo->seq[i] = help()->leftToRight->seq[mLeft()->transitionsTo->seq[i]]; 00234 } 00235 for( i = 0; i < mCompressedAutomatonGetStatesStored( mLeft() ); i++ ){ 00236 if( help()->right->statesLength->seq[i] != 0 ){ 00237 mLeft()->statesEnd->seq[i] = (mRight()->statesEnd->seq[i] + 1) - help()->right->statesLength->seq[i]; 00238 } 00239 } 00240 // VoidSequenceFree( mLeft()->data ); mLeft()->data = mRight()->data; 00241 // CompressedAutomatonSortTransitions( mLeft(), help()->left->transitionsFrom ); 00242 // mLeft()->data = NULL; 00243 } else { 00244 if (DEBUG_LEVEL >= 3) std::cerr << "Automaton sorted already." << std::endl; 00245 } 00246 ManagedAutomatonStage_ = SORTED; 00247 } 00248 00249 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00250 UINT SCDAWGAdapter::suffixLink( const UINT& state ) const { 00251 if (DEBUG_LEVEL >= 5) std::cerr << "UINT SCDAWGAdapter::suffixLink( const UINT& state ) const" << std::endl; 00252 return buildhelp()->right->statesSuffixLink->seq[state]; 00253 } 00254 00255 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00256 CompressedAutomaton* SCDAWGAdapter::automaton() const { 00257 if (DEBUG_LEVEL >= 5) std::cerr << "CompressedAutomaton* SCDAWGAdapter::automaton() const" << std::endl; 00258 return C_SCDAWG->right; 00259 } 00260 00261 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00262 CompressedAutomaton* SCDAWGAdapter::left_automaton() const { 00263 if (DEBUG_LEVEL >= 5) std::cerr << "SCDAWGAdapter::left_automaton() const" << std::endl; 00264 return C_SCDAWG->left; 00265 } 00266 00267 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00268 PairState SCDAWGAdapter::GetInitialPairState( ) const { 00269 if (DEBUG_LEVEL >= 5) std::cerr << "virtual inline PairState SCDAWGAdapter::GetInitialPairState( ) const" << std::endl; 00270 00271 assert (ManagedAutomatonStage_ > EMPTY); 00272 00273 return CompressedAutomatonGetInitialPairState( automaton() ); 00274 } 00275 00276 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00277 SCDAWGBuildHelp* SCDAWGAdapter::buildhelp() const { 00278 assert (ManagedAutomatonStage_ < CLOSED); 00279 00280 return C_SCDAWGBuildHelp; 00281 } 00282 00283 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00284 PairState SCDAWGAdapter::Delta( const PairState * state, const void * symbol ) const { 00285 assert (ManagedAutomatonStage_ >= SORTED); 00286 00287 return CompressedAutomatonDelta( automaton(), state, symbol ); 00288 } 00289 00290 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00291 void SCDAWGAdapter::BuildSCDAWG( const S8 * fileNameInput, 00292 const S8 * fileNameOutput, 00293 UINT bitsPerSymbol, 00294 UINT encoding, 00295 boolean tarjanTable ) 00296 { //TODO? 00297 FILE * fpInput; 00298 FILE * fpOutput; 00299 VoidSequence * line = NULL; 00300 VoidSequence * sharpDocumentDollar = NULL; 00301 00302 // #ifndef __cplusplus 00303 SCDAWG * aut; 00304 SCDAWGBuildHelp * help; 00305 // #endif 00306 00307 UINT i, n, nLine, symbolSize; 00308 mSymbolAndVariables( zero ) 00309 mSymbolAndVariables( one ) 00310 00311 mValidateBitsPerSymbol( bitsPerSymbol ) 00312 symbolSize = bitsPerSymbol/8; 00313 mSymbolAssignValue( zero, 0, symbolSize ) 00314 mSymbolAssignValue( one, 1, symbolSize ) 00315 00316 #ifndef __cplusplus 00317 aut = SCDAWGInit( symbolSize ); 00318 help = SCDAWGBuildHelpInit( aut ); 00319 #else 00320 aut = C_SCDAWG; 00321 help = C_SCDAWGBuildHelp; 00322 #endif 00323 00324 fpInput = Fopen( fileNameInput, "rb" ); 00325 nLine = 0; 00326 while( bTRUE ){ 00327 if( line != NULL ){ 00328 VoidSequenceFree( line ); 00329 } 00330 line = VoidSequenceReadLine( fpInput, symbolSize, encoding ); 00331 if( line == NULL ){ 00332 break; 00333 } 00334 nLine++; 00335 for( i = 0; i < line->seqStored; i++ ){ 00336 n = SymbolToUINT( mVoidSequenceElement(line, i), symbolSize ); 00337 if( n == 0 || n == 1 ){ 00338 S8 msg[MAX_INPUT_STRING_SIZE]; 00339 00340 sprintf( msg, "Cannot process line %llu - 0 and 1 are reserved as new symbols. (pos is %d, sign is %d)", (U64)(nLine), i, n ); 00341 Throw( msg ); 00342 } 00343 } 00344 sharpDocumentDollar = VoidSequenceInit2( symbolSize, line->seqStored + 2, 1 ); 00345 VoidSequenceAdd( sharpDocumentDollar, one ); 00346 VoidSequenceAppend( sharpDocumentDollar, line ); 00347 VoidSequenceAdd( sharpDocumentDollar, zero ); 00348 SCDAWGAdd( help, sharpDocumentDollar ); 00349 VoidSequenceFree( sharpDocumentDollar ); 00350 } 00351 Fclose( fpInput ); 00352 SCDAWGClose( help ); 00353 SCDAWGShrink( aut ); 00354 SCDAWGBuildHelpFree( help ); 00355 if( tarjanTable ){ 00356 SCDAWGAddTarjanTable( aut ); 00357 } 00358 fpOutput = Fopen( fileNameOutput, "wb" ); 00359 SCDAWGWrite( aut, fpOutput ); 00360 Fclose( fpOutput ); 00361 SCDAWGFree( aut ); 00362 } 00363 00364 }}} /* End of namespace lmu::cis::sis */ 00365 00366 #endif /* end of include guard: SCDAWGADAPTERADAPTER_CPP */