SIS
Symmetric Index Structures
|
00001 #include "base.h" 00002 00003 CompressedAutomaton * CompressedAutomatonInit( UINT symbolSize ){ 00004 CompressedAutomaton * aut; 00005 00006 aut = Malloc( 1, sizeof(CompressedAutomaton) ); 00007 aut->initialState = NO; 00008 aut->statesTransitions = UINTSequenceInit2( 1024, 1024 ); 00009 aut->statesNumberOfTransitions = UINTSequenceInit2( 1024, 1024 ); 00010 aut->statesEnd = UINTSequenceInit2( 1024, 1024 ); 00011 aut->transitionsStart = UINTSequenceInit2( 1024, 1024 ); 00012 aut->transitionsTo = UINTSequenceInit2( 1024, 1024 ); 00013 aut->data = VoidSequenceInit2( symbolSize, 1024, 1024 ); 00014 aut->tt = NULL; 00015 return aut; 00016 } 00017 00018 void CompressedAutomatonAddState( CompressedAutomaton * aut, UINT state ){ 00019 UINT statesStored; 00020 00021 statesStored = mCompressedAutomatonGetStatesStored( aut ); 00022 if( statesStored <= state ){ 00023 UINT i; 00024 00025 for( i = statesStored; i <= state; i++ ){ 00026 UINTSequenceAdd( aut->statesTransitions, NO ); 00027 UINTSequenceAdd( aut->statesNumberOfTransitions, 0 ); 00028 UINTSequenceAdd( aut->statesEnd, NO ); 00029 } 00030 } 00031 } 00032 00033 void CompressedAutomatonAddTransition( CompressedAutomaton * aut, UINT stateFrom, UINT start, UINT stateTo ){ 00034 UINT transitionsStored; 00035 00036 transitionsStored = mCompressedAutomatonGetTransitionsStored( aut ); 00037 CompressedAutomatonAddState( aut, mMAX( stateFrom, stateTo ) ); 00038 UINTSequenceAdd( aut->transitionsStart, start ); 00039 UINTSequenceAdd( aut->transitionsTo, stateTo ); 00040 aut->statesNumberOfTransitions->seq[stateFrom]++; 00041 if( aut->statesTransitions->seq[stateFrom] == NO ){ 00042 aut->statesTransitions->seq[stateFrom] = transitionsStored; 00043 } 00044 } 00045 00046 void CompressedAutomatonFree( CompressedAutomaton * aut ){ 00047 UINTSequenceFree( aut->statesTransitions ); 00048 UINTSequenceFree( aut->statesNumberOfTransitions ); 00049 UINTSequenceFree( aut->statesEnd ); 00050 UINTSequenceFree( aut->transitionsStart ); 00051 UINTSequenceFree( aut->transitionsTo ); 00052 if( aut->data != NULL ){ 00053 VoidSequenceFree( aut->data ); 00054 } 00055 if( aut->tt != NULL ){ 00056 if( aut->tt->alph != NULL ){ 00057 AlphabetFree( aut->tt->alph ); 00058 } 00059 TarjanTableFree( aut->tt ); 00060 } 00061 Free( aut ); 00062 } 00063 00064 void CompressedAutomatonWrite( const CompressedAutomaton * aut, FILE * fp ){ 00065 U8 * element; 00066 U8 tt; 00067 00068 if( endiannessOfThisMachine != endiannessOnWrite ){ 00069 element = (U8 *)(&(aut->initialState)); mReverseBytes( element, sizeof(UINT) ) 00070 } 00071 mWriteEndianness( fp, endiannessOnWrite ) 00072 Fwrite( &(aut->initialState), sizeof(UINT), 1, fp ); 00073 if( endiannessOfThisMachine != endiannessOnWrite ){ 00074 element = (U8 *)(&(aut->initialState)); mReverseBytes( element, sizeof(UINT) ) 00075 } 00076 UINTSequenceWrite( aut->statesTransitions, fp ); 00077 UINTSequenceWrite( aut->statesNumberOfTransitions, fp ); 00078 UINTSequenceWrite( aut->statesEnd, fp ); 00079 UINTSequenceWrite( aut->transitionsStart, fp ); 00080 UINTSequenceWrite( aut->transitionsTo, fp ); 00081 if( aut->data == NULL ){ 00082 tt = 0; 00083 Fwrite( &tt, sizeof(U8), 1, fp ); 00084 } 00085 else{ 00086 UINT sizes[1]; 00087 00088 sizes[0] = aut->data->elementSize; 00089 tt = 1; 00090 Fwrite( &tt, sizeof(U8), 1, fp ); 00091 VoidSequenceWrite( aut->data, fp, sizes, 1 ); 00092 } 00093 if( aut->tt != NULL ){ 00094 tt = 1; 00095 Fwrite( &tt, sizeof(U8), 1, fp ); 00096 TarjanTableWrite( aut->tt, fp ); 00097 } 00098 else{ 00099 tt = 0; 00100 Fwrite( &tt, sizeof(U8), 1, fp ); 00101 } 00102 } 00103 00104 CompressedAutomaton * CompressedAutomatonRead( FILE * fp ){ 00105 CompressedAutomaton * aut; 00106 U8 * element; 00107 UINT endianness; 00108 U8 tt; 00109 00110 aut = (CompressedAutomaton *)Malloc( 1, sizeof(CompressedAutomaton) ); 00111 mReadEndianness( fp, endianness ) 00112 Fread( &(aut->initialState), sizeof(UINT), 1, fp ); 00113 if( endiannessOfThisMachine != endianness ){ 00114 element = (U8 *)(&(aut->initialState)); mReverseBytes( element, sizeof(UINT) ) 00115 } 00116 aut->statesTransitions = UINTSequenceRead( fp ); 00117 aut->statesNumberOfTransitions = UINTSequenceRead( fp ); 00118 aut->statesEnd = UINTSequenceRead( fp ); 00119 aut->transitionsStart = UINTSequenceRead( fp ); 00120 aut->transitionsTo = UINTSequenceRead( fp ); 00121 Fread( &tt, sizeof(U8), 1, fp ); 00122 if( tt == 1 ){ 00123 aut->data = VoidSequenceRead( fp ); 00124 } 00125 else{ 00126 aut->data = NULL; 00127 } 00128 Fread( &tt, sizeof(U8), 1, fp ); 00129 if( tt == 1 ){ 00130 aut->tt = TarjanTableRead( fp ); 00131 } 00132 else{ 00133 aut->tt = NULL; 00134 } 00135 return aut; 00136 } 00137 00138 void CompressedAutomatonDumpStat( const CompressedAutomaton * aut, FILE * fp ){ 00139 double size; 00140 00141 if( aut->data != NULL ){ 00142 fprintf( fp, "number of data symbols: %llu\n", (U64)(aut->data->seqStored) ); 00143 fprintf( fp, "symbol size: %llu\n", (U64)(aut->data->elementSize) ); 00144 size = aut->data->seqStored; 00145 size *= aut->data->elementSize; 00146 size /= 1024.0; 00147 size /= 1024.0; 00148 fprintf( fp, "data size: %.2f MB\n", size ); 00149 } 00150 fprintf( fp, "number of states: %llu\n", (U64)mCompressedAutomatonGetStatesStored(aut) ); 00151 fprintf( fp, "number of transitions: %llu\n", (U64)mCompressedAutomatonGetTransitionsStored(aut) ); 00152 } 00153 00154 void CompressedAutomatonShrink( CompressedAutomaton * aut ){ 00155 UINTSequenceShrink( aut->statesTransitions ); 00156 UINTSequenceShrink( aut->statesNumberOfTransitions ); 00157 UINTSequenceShrink( aut->statesEnd ); 00158 UINTSequenceShrink( aut->transitionsStart ); 00159 UINTSequenceShrink( aut->transitionsTo ); 00160 if( aut->data != NULL ){ 00161 VoidSequenceShrink( aut->data ); 00162 } 00163 } 00164 00165 void GenerateLanguage( const CompressedAutomaton * aut, UINT state, VoidSequence * line, FILE * fp, void (*DumpSequenceOfLabels)(FILE *, const VoidSequence *, UINT), UINT encoding ){ 00166 UINT i, j, tr, end, seqStored; 00167 00168 if( aut->statesNumberOfTransitions->seq[state] == 0 ){ 00169 DumpSequenceOfLabels( fp, line, encoding ); 00170 } 00171 tr = aut->statesTransitions->seq[state]; 00172 seqStored = line->seqStored; 00173 for( i = 0; i < aut->statesNumberOfTransitions->seq[state]; i++ ){ 00174 end = aut->statesEnd->seq[aut->transitionsTo->seq[tr+i]]; 00175 for( j = aut->transitionsStart->seq[tr+i]; j <= end; j++ ){ 00176 VoidSequenceAdd( line, mVoidSequenceElement(aut->data, j) ); 00177 } 00178 GenerateLanguage( aut, aut->transitionsTo->seq[tr+i], line, fp, DumpSequenceOfLabels, encoding ); 00179 line->seqStored = seqStored; 00180 } 00181 } 00182 00183 void CompressedAutomatonGenerateLanguage( const CompressedAutomaton * aut, FILE * fp, void (*DumpSequenceOfLabels)(FILE *, const VoidSequence *, UINT), UINT encoding ){ 00184 VoidSequence * line; 00185 00186 if( aut->initialState == NO ){ 00187 return; 00188 } 00189 line = VoidSequenceInit( aut->data->elementSize ); 00190 GenerateLanguage( aut, aut->initialState, line, fp, DumpSequenceOfLabels, encoding ); 00191 VoidSequenceFree( line ); 00192 } 00193 00194 static void GenerateLeftLanguage( const CompressedAutomaton * aut, 00195 UINT state, 00196 VoidSequence * line, 00197 FILE * fp, 00198 void (*DumpSequenceOfLabels)(FILE *, const VoidSequence *, UINT), 00199 UINT encoding ){ 00200 UINT i, j, tr, end, seqStored; 00201 00202 if( aut->statesNumberOfTransitions->seq[state] == 0 ){ 00203 DumpSequenceOfLabels( fp, line, encoding ); 00204 } 00205 tr = aut->statesTransitions->seq[state]; 00206 seqStored = line->seqStored; 00207 for( i = 0; i < aut->statesNumberOfTransitions->seq[state]; i++ ){ 00208 end = aut->statesEnd->seq[aut->transitionsTo->seq[tr+i]]; 00209 for( j = aut->transitionsStart->seq[tr+i]+1; j > end; j-- ){ 00210 VoidSequenceAdd( line, mVoidSequenceElement(aut->data, j-1) ); 00211 } 00212 GenerateLeftLanguage( aut, aut->transitionsTo->seq[tr+i], line, fp, DumpSequenceOfLabels, encoding ); 00213 line->seqStored = seqStored; 00214 } 00215 } 00216 00217 void CompressedLeftAutomatonGenerateLanguage( const CompressedAutomaton * aut, FILE * fp, 00218 void (*DumpSequenceOfLabels)(FILE *, const VoidSequence *, UINT), UINT encoding ){ 00219 VoidSequence * line; 00220 00221 if( aut->initialState == NO ){ 00222 return; 00223 } 00224 line = VoidSequenceInit( aut->data->elementSize ); 00225 GenerateLeftLanguage( aut, aut->initialState, line, fp, DumpSequenceOfLabels, encoding ); 00226 VoidSequenceFree( line ); 00227 } 00228 00229 typedef struct tCompressedAutomatonTransitionsFrom{ 00230 CompressedAutomaton * aut; 00231 UINTSequence * transitionsFrom; 00232 } CompressedAutomatonTransitionsFrom; 00233 00234 static SINT CmpTransitions( void * ptr, UINT t1, UINT t2 ){ 00235 CompressedAutomatonTransitionsFrom * catf; 00236 SINT labelCmp; 00237 00238 catf = (CompressedAutomatonTransitionsFrom *)(ptr); 00239 if( catf->transitionsFrom->seq[t1] < catf->transitionsFrom->seq[t2] ){ 00240 return -1; 00241 } 00242 if( catf->transitionsFrom->seq[t1] > catf->transitionsFrom->seq[t2] ){ 00243 return 1; 00244 } 00245 labelCmp = CmpSymbols( mVoidSequenceElement(catf->aut->data, catf->aut->transitionsStart->seq[t1]), 00246 mVoidSequenceElement(catf->aut->data, catf->aut->transitionsStart->seq[t2]), 00247 catf->aut->data->elementSize ); 00248 if( labelCmp != 0 ){ 00249 return labelCmp; 00250 } 00251 if( catf->aut->transitionsTo->seq[t1] < catf->aut->transitionsTo->seq[t2] ){ 00252 return -1; 00253 } 00254 if( catf->aut->transitionsTo->seq[t1] > catf->aut->transitionsTo->seq[t2] ){ 00255 return 1; 00256 } 00257 return 0; 00258 } 00259 00260 static void SwapTransitions( void * ptr, UINT t1, UINT t2 ){ 00261 CompressedAutomatonTransitionsFrom * catf; 00262 UINT tmp; 00263 00264 catf = (CompressedAutomatonTransitionsFrom *)(ptr); 00265 mSwapVariables( catf->transitionsFrom->seq[t1], catf->transitionsFrom->seq[t2], tmp ); 00266 mSwapVariables( catf->aut->transitionsStart->seq[t1], catf->aut->transitionsStart->seq[t2], tmp ); 00267 mSwapVariables( catf->aut->transitionsTo->seq[t1], catf->aut->transitionsTo->seq[t2], tmp ); 00268 } 00269 00270 void CompressedAutomatonSortTransitions( CompressedAutomaton * aut, UINTSequence * transitionsFrom ){ 00271 CompressedAutomatonTransitionsFrom catf; 00272 UINT i, j, q, transitionsStored; 00273 00274 for( i = 0; i < mCompressedAutomatonGetStatesStored(aut); i++ ){ 00275 aut->statesTransitions->seq[i] = NO; 00276 aut->statesNumberOfTransitions->seq[i] = 0; 00277 } 00278 transitionsStored = mCompressedAutomatonGetTransitionsStored( aut ); 00279 if( transitionsStored == 0 ){ 00280 return; 00281 } 00282 catf.aut = aut; 00283 catf.transitionsFrom = transitionsFrom; 00284 Sort( &catf, transitionsStored, CmpTransitions, SwapTransitions ); 00285 j = 1; 00286 q = transitionsFrom->seq[0]; 00287 aut->statesTransitions->seq[q] = 0; 00288 aut->statesNumberOfTransitions->seq[q] = 1; 00289 for( i = 1; i < transitionsStored; i++ ){ 00290 if( CmpTransitions( &catf, j-1, i ) != 0 ){ 00291 if( j != i ){ 00292 transitionsFrom->seq[j] = transitionsFrom->seq[i]; 00293 aut->transitionsStart->seq[j] = aut->transitionsStart->seq[i]; 00294 aut->transitionsTo->seq[j] = aut->transitionsTo->seq[i]; 00295 } 00296 if( transitionsFrom->seq[j] == q ){ 00297 aut->statesNumberOfTransitions->seq[q]++; 00298 } 00299 else{ 00300 q = transitionsFrom->seq[j]; 00301 aut->statesTransitions->seq[q] = j; 00302 aut->statesNumberOfTransitions->seq[q] = 1; 00303 } 00304 j++; 00305 } 00306 } 00307 if( j < transitionsStored ){ 00308 transitionsFrom->seqStored = j; 00309 aut->transitionsStart->seqStored = j; 00310 aut->transitionsTo->seqStored = j; 00311 UINTSequenceShrink( transitionsFrom ); 00312 UINTSequenceShrink( aut->transitionsStart ); 00313 UINTSequenceShrink( aut->transitionsTo ); 00314 } 00315 } 00316 00317 static SINT Cmp( const void * set, UINT tr, const void * object ){ 00318 const CompressedAutomaton * a; 00319 00320 a = (const CompressedAutomaton *)(set); 00321 return CmpSymbols( mVoidSequenceElement(a->data, a->transitionsStart->seq[tr]), object, a->data->elementSize ); 00322 } 00323 00324 PairState CompressedAutomatonDelta( const CompressedAutomaton * a, const PairState * state, const void * symbol ){ 00325 PairState nextState; 00326 00327 nextState.transition = NO; 00328 nextState.index = NO; 00329 if( state->transition != NO ){ 00330 if( CmpSymbols( mVoidSequenceElement(a->data, state->index), symbol, a->data->elementSize ) != 0 ){ 00331 return nextState; 00332 } 00333 nextState.index = state->index + 1; 00334 nextState.transition = state->transition; 00335 if( nextState.index > a->statesEnd->seq[a->transitionsTo->seq[state->transition]] ){ 00336 nextState.transition = NO; 00337 nextState.index = a->transitionsTo->seq[state->transition]; 00338 } 00339 } 00340 else{ 00341 UINT tr; 00342 00343 if( a->tt != NULL ){ 00344 UINT code; 00345 00346 code = SymbolToUINT( symbol, a->data->elementSize ); 00347 code = mAlphabetGetCode( a->tt->alph, code ); 00348 if( code == NO_CODE ){ 00349 return nextState; 00350 } 00351 tr = a->tt->table->seq[a->tt->statesTransitions->seq[state->index] + code]; 00352 if( tr == NO || a->statesNumberOfTransitions->seq[state->index] == 0 || 00353 tr < a->statesTransitions->seq[state->index] || a->statesTransitions->seq[state->index] + a->statesNumberOfTransitions->seq[state->index] <= tr ){ 00354 return nextState; 00355 } 00356 } 00357 else{ 00358 tr = BinarySearch( a, a->statesTransitions->seq[state->index], a->statesNumberOfTransitions->seq[state->index], symbol, Cmp ); 00359 if( tr == NO ){ 00360 return nextState; 00361 } 00362 } 00363 nextState.index = a->transitionsStart->seq[tr] + 1; 00364 nextState.transition = tr; 00365 if( nextState.index > a->statesEnd->seq[a->transitionsTo->seq[tr]] ){ 00366 nextState.transition = NO; 00367 nextState.index = a->transitionsTo->seq[tr]; 00368 } 00369 } 00370 return nextState; 00371 } 00372 00373 void CompressedAutomatonAddTarjanTable( CompressedAutomaton * aut ){ 00374 Alphabet * alph; 00375 Automaton a; 00376 UINT i; 00377 00378 a.type = AUTOMATON_TYPE_SYMBOL; 00379 a.statesTransitions = aut->statesTransitions; 00380 a.statesNumberOfTransitions = aut->statesNumberOfTransitions; 00381 a.transitionsTo = aut->transitionsTo; 00382 a.transitionsLabel = VoidSequenceInit2( aut->data->elementSize, mCompressedAutomatonGetTransitionsStored(aut), 1 ); 00383 for( i = 0; i < mCompressedAutomatonGetTransitionsStored(aut); i++ ){ 00384 VoidSequenceAdd( a.transitionsLabel, mVoidSequenceElement(aut->data, aut->transitionsStart->seq[i]) ); 00385 } 00386 alph = AlphabetInit(); 00387 AlphabetAdd( alph, &a ); 00388 aut->tt = TarjanTableInit( alph, &a ); 00389 VoidSequenceFree( a.transitionsLabel ); 00390 } 00391 00392 PairState CompressedAutomatonGetInitialPairState( const CompressedAutomaton * aut ){ 00393 PairState state; 00394 00395 state.transition = NO; 00396 state.index = aut->initialState; 00397 return state; 00398 } 00399 00400 void CompressedAutomatonDumpGV( CompressedAutomaton * aut, FILE * fp, void (*DumpLabel)(const CompressedAutomaton *, FILE *, UINT, UINT), UINT encoding ){ 00401 S8 tmp[MAX_INPUT_STRING_SIZE]; 00402 UINT i, s, tr, statesStored, symbolSize; 00403 00404 symbolSize = aut->data->elementSize; 00405 PrintString( fp, "Digraph A{ \n", symbolSize, encoding ); 00406 PrintString( fp, "rankdir=LR;\n", symbolSize, encoding ); 00407 statesStored = mCompressedAutomatonGetStatesStored( aut ); 00408 PrintString( fp, "node[shape = point, color = white]; \"\";\n", symbolSize, encoding ); 00409 PrintString( fp, "node [shape = circle, color = black];\n", symbolSize, encoding ); 00410 sprintf( tmp, "\"\" -> %llu;\n", (U64)(aut->initialState) ); 00411 PrintString( fp, tmp, symbolSize, encoding ); 00412 for( s = 0; s < statesStored; s++ ){ 00413 for( i = 0; i < aut->statesNumberOfTransitions->seq[s]; i++ ){ 00414 tr = aut->statesTransitions->seq[s] + i; 00415 sprintf( tmp, "%llu -> %llu [ label = \"", (U64)(s), (U64)(aut->transitionsTo->seq[tr]) ); 00416 PrintString( fp, tmp, symbolSize, encoding ); 00417 DumpLabel( aut, fp, tr, encoding ); 00418 PrintString( fp, "\" ];\n", symbolSize, encoding ); 00419 } 00420 } 00421 PrintString( fp, "}\n", symbolSize, encoding ); 00422 }