SIS
Symmetric Index Structures
|
00001 #include "base.h" 00002 00003 static UINT GetIndexOfTransition( void * ptr, const void * object ){ 00004 CDAWGBuildHelp * help; 00005 TransitionFromStartTo * tr; 00006 UINT index; 00007 00008 help = (CDAWGBuildHelp *)(ptr); 00009 index = mCompressedAutomatonGetTransitionsStored( help->aut ); 00010 tr = (TransitionFromStartTo *)(object); 00011 UINTSequenceAdd( help->transitionsFrom, tr->from ); 00012 CompressedAutomatonAddTransition( help->aut, tr->from, tr->start, tr->to ); 00013 if( help->aut->statesTransitions->seq[tr->from] == index ){ 00014 UINTSequenceAdd( help->transitionsNext, NO ); 00015 } 00016 else{ 00017 UINTSequenceAdd( help->transitionsNext, help->aut->statesTransitions->seq[tr->from] ); 00018 help->aut->statesTransitions->seq[tr->from] = index; 00019 } 00020 return index; 00021 } 00022 00023 static UINT GetFirstIndex( void * ptr ){ 00024 CDAWGBuildHelp * help; 00025 00026 help = (CDAWGBuildHelp *)(ptr); 00027 if( mCompressedAutomatonGetTransitionsStored( help->aut ) == 0 ){ 00028 return NO; 00029 } 00030 return 0; 00031 } 00032 00033 static UINT GetNextIndex( void * ptr, UINT index ){ 00034 CDAWGBuildHelp * help; 00035 00036 help = (CDAWGBuildHelp *)(ptr); 00037 index++; 00038 if( index < mCompressedAutomatonGetTransitionsStored( help->aut ) ){ 00039 return index; 00040 } 00041 return NO; 00042 } 00043 00044 static boolean Equals( void * ptr, const void * object, UINT index ){ 00045 CDAWGBuildHelp * help; 00046 TransitionFromStartTo * tr; 00047 00048 help = (CDAWGBuildHelp *)(ptr); 00049 tr = (TransitionFromStartTo *)(object); 00050 return( help->transitionsFrom->seq[index] == tr->from && 00051 CmpSymbols( mVoidSequenceElement(help->aut->data, help->aut->transitionsStart->seq[index]), tr->symbol, help->aut->data->elementSize ) == 0 ); 00052 } 00053 00054 static UINT HashCode( void * ptr, const void * object, UINT hashSize ){ 00055 CDAWGBuildHelp * help; 00056 TransitionFromStartTo * tr; 00057 00058 help = (CDAWGBuildHelp *)(ptr); 00059 tr = (TransitionFromStartTo *)(object); 00060 return CodeBytes( CodeUINT( 0, tr->from, hashSize ), tr->symbol, help->aut->data->elementSize, hashSize ); 00061 } 00062 00063 static UINT HashCode2( void * ptr, UINT index, UINT hashSize ){ 00064 CDAWGBuildHelp * help; 00065 00066 help = (CDAWGBuildHelp *)(ptr); 00067 return CodeBytes( CodeUINT( 0, help->transitionsFrom->seq[index], hashSize ), 00068 mVoidSequenceElement( help->aut->data, help->aut->transitionsStart->seq[index] ), 00069 help->aut->data->elementSize, 00070 hashSize ); 00071 } 00072 00073 CDAWGBuildHelp * CDAWGBuildHelpInit( CompressedAutomaton * aut ){ 00074 CDAWGBuildHelp * help; 00075 00076 help = Malloc( 1, sizeof(CDAWGBuildHelp) ); 00077 help->aut = aut; 00078 help->statesSuffixLink = UINTSequenceInit2( aut->statesTransitions->seqAlloced, 1024 ); 00079 help->statesLength = UINTSequenceInit2( aut->statesTransitions->seqAlloced, 1024 ); 00080 help->transitionsFrom = UINTSequenceInit2( aut->transitionsStart->seqAlloced, 1024 ); 00081 help->transitionsNext = UINTSequenceInit2( aut->transitionsStart->seqAlloced, 1024 ); 00082 help->hash = HashInit( help, GetIndexOfTransition, GetFirstIndex, GetNextIndex, Equals, HashCode, HashCode2 ); 00083 help->symbol = Malloc( 1, aut->data->elementSize ); 00084 return help; 00085 } 00086 00087 void CDAWGBuildHelpFree( CDAWGBuildHelp * help ){ 00088 UINTSequenceFree( help->statesSuffixLink ); 00089 UINTSequenceFree( help->statesLength ); 00090 UINTSequenceFree( help->transitionsFrom ); 00091 UINTSequenceFree( help->transitionsNext ); 00092 HashFree( help->hash ); 00093 Free( help->symbol ); 00094 Free( help ); 00095 } 00096 00097 PairState Delta( CDAWGBuildHelp * help, const PairState * state, const void * symbol, UINT * tr ){ 00098 PairState nextState; 00099 CompressedAutomaton * a; 00100 00101 a = help->aut; 00102 nextState.transition = NO; 00103 nextState.index = NO; 00104 if( state->transition != NO ){ 00105 (*tr) = state->transition; 00106 if( CmpSymbols( mVoidSequenceElement(a->data, state->index), symbol, a->data->elementSize ) != 0 ){ 00107 return nextState; 00108 } 00109 nextState.index = state->index + 1; 00110 nextState.transition = state->transition; 00111 if( nextState.index > a->statesEnd->seq[a->transitionsTo->seq[state->transition]] ){ 00112 nextState.transition = NO; 00113 nextState.index = a->transitionsTo->seq[state->transition]; 00114 } 00115 } 00116 else{ 00117 TransitionFromStartTo tfst; 00118 00119 tfst.from = state->index; 00120 tfst.symbol = symbol; 00121 (*tr) = HashGetIndex( help->hash, &tfst ); 00122 if( (*tr) == NO ){ 00123 return nextState; 00124 } 00125 nextState.index = a->transitionsStart->seq[(*tr)] + 1; 00126 nextState.transition = (*tr); 00127 if( nextState.index > a->statesEnd->seq[a->transitionsTo->seq[(*tr)]] ){ 00128 nextState.transition = NO; 00129 nextState.index = a->transitionsTo->seq[(*tr)]; 00130 } 00131 } 00132 return nextState; 00133 } 00134 00135 static UINT NewState( CDAWGBuildHelp * help ){ 00136 UINT newState; 00137 00138 newState = mCompressedAutomatonGetStatesStored( help->aut ); 00139 CompressedAutomatonAddState( help->aut, newState ); 00140 UINTSequenceAdd( help->statesSuffixLink, NO ); 00141 UINTSequenceAdd( help->statesLength, NO ); 00142 return newState; 00143 } 00144 00145 static UINT Split( CDAWGBuildHelp * help, const PairState * state ){ 00146 UINT newState, from, to; 00147 TransitionFromStartTo tfst; 00148 00149 newState = NewState( help ); 00150 help->aut->statesEnd->seq[newState] = state->index - 1; 00151 from = help->transitionsFrom->seq[state->transition]; 00152 to = help->aut->transitionsTo->seq[state->transition]; 00153 help->statesLength->seq[newState] = help->statesLength->seq[from] + (state->index - help->aut->transitionsStart->seq[state->transition]); 00154 help->aut->transitionsTo->seq[state->transition] = newState; 00155 tfst.from = newState; 00156 tfst.start = state->index; 00157 tfst.to = to; 00158 memcpy( help->symbol, mVoidSequenceElement(help->aut->data, state->index), help->aut->data->elementSize ); 00159 tfst.symbol = help->symbol; 00160 HashAdd( help->hash, &tfst ); 00161 return newState; 00162 } 00163 00164 static PairState SuffixLink( CDAWGBuildHelp * help, const PairState * state ){ 00165 PairState fState; 00166 TransitionFromStartTo tfst; 00167 UINT s, start, end, tr, startTr, endTr; 00168 00169 if( state->transition == NO ){ 00170 fState.transition = NO; 00171 fState.index = help->statesSuffixLink->seq[state->index]; 00172 return fState; 00173 } 00174 s = help->transitionsFrom->seq[state->transition]; 00175 start = help->aut->transitionsStart->seq[state->transition]; 00176 end = state->index - 1; 00177 if( s == help->aut->initialState ){ 00178 start++; 00179 if( start > end ){ 00180 return CompressedAutomatonGetInitialPairState( help->aut ); 00181 } 00182 } 00183 else{ 00184 s = help->statesSuffixLink->seq[s]; 00185 } 00186 while( bTRUE ){ 00187 tfst.from = s; 00188 tfst.symbol = mVoidSequenceElement( help->aut->data, start ); 00189 tr = HashGetIndex( help->hash, &tfst ); 00190 startTr = help->aut->transitionsStart->seq[tr]; 00191 endTr = help->aut->statesEnd->seq[help->aut->transitionsTo->seq[tr]]; 00192 if( end - start < endTr - startTr ){ 00193 fState.index = startTr + end - start + 1; 00194 fState.transition = tr; 00195 break; 00196 } 00197 if( end - start == endTr - startTr ){ 00198 fState.index = help->aut->transitionsTo->seq[tr]; 00199 fState.transition = NO; 00200 break; 00201 } 00202 s = help->aut->transitionsTo->seq[tr]; 00203 start += (endTr - startTr + 1); 00204 } 00205 return fState; 00206 } 00207 00209 static PairState Separate( CDAWGBuildHelp * help, const PairState * state, UINT i ){ 00210 TransitionFromStartTo tfst; 00211 PairState nextState, st, q; 00212 UINT length, newState, tr, t; 00213 00214 nextState = Delta( help, state, mVoidSequenceElement(help->aut->data, i), &t ); 00215 if( nextState.transition != NO ){ 00216 return nextState; 00217 } 00218 if( state->transition != NO ){ 00219 UINT from, to; 00220 00221 from = help->transitionsFrom->seq[state->transition]; 00222 to = help->aut->transitionsTo->seq[state->transition]; 00223 length = help->statesLength->seq[from] + help->aut->statesEnd->seq[to] - help->aut->transitionsStart->seq[state->transition] + 1; 00224 } 00225 else{ 00226 length = help->statesLength->seq[state->index] + 1; 00227 } 00228 if( help->statesLength->seq[nextState.index] == length ){ 00229 return nextState; 00230 } 00231 newState = NewState( help ); 00232 help->statesLength->seq[newState] = length; 00233 help->aut->statesEnd->seq[newState] = help->aut->statesEnd->seq[nextState.index]; 00234 help->statesSuffixLink->seq[newState] = help->statesSuffixLink->seq[nextState.index]; 00235 help->statesSuffixLink->seq[nextState.index] = newState; 00236 tfst.from = newState; 00237 for( tr = help->aut->statesTransitions->seq[nextState.index]; tr != NO; tr = help->transitionsNext->seq[tr] ){ 00238 tfst.start = help->aut->transitionsStart->seq[tr]; 00239 tfst.to = help->aut->transitionsTo->seq[tr]; 00240 memcpy( help->symbol, mVoidSequenceElement(help->aut->data, tfst.start), help->aut->data->elementSize ); 00241 tfst.symbol = help->symbol; 00242 HashAdd( help->hash, &tfst ); 00243 } 00244 st = (*state); 00245 while( bTRUE ){ 00246 help->aut->transitionsTo->seq[t] = newState; 00247 if( st.transition == NO && st.index == help->aut->initialState ){ 00248 break; 00249 } 00250 st = SuffixLink( help, &st ); 00251 q = Delta( help, &st, mVoidSequenceElement(help->aut->data, i), &t ); 00252 if( q.transition != nextState.transition || q.index != nextState.index ){ 00253 break; 00254 } 00255 } 00256 nextState.transition = NO; 00257 nextState.index = newState; 00258 return nextState; 00259 } 00260 00261 00262 static PairState Update( CDAWGBuildHelp * help, const PairState * state, UINT i, UINT * sink, UINT infinity, boolean * slink ){ 00263 /* 00264 function update(s, (k, p)): returns pair of integers; 00265 // (s, (k, p − 1)) is the canonical reference pair for the active point. 00266 00267 c := w[p]; 00268 oldr := nil; 00269 00270 while not check_end_point(s,(k,p−1),c) do 00271 if k ≤ p−1 then 00272 r := split_edge(s,(k,p−1)); 00273 else 00274 r := s; // explicit case. 00275 00276 create node r′; 00277 create edge (r, (p, ∞), r′); 00278 00279 if oldr != nil then 00280 suf(oldr) := r; 00281 00282 oldr := r; 00283 00284 (s, k) := canonize(suf (s), (k, p − 1)); 00285 00286 if oldr != nil then 00287 suf(oldr) := s; 00288 00289 return canonize(s, (k, p)); 00290 */ 00291 PairState state_with_i; 00292 UINT r, prevr, succ, t; 00293 TransitionFromStartTo tfst; 00294 00295 r = succ = NO; 00296 state_with_i = (*state); 00297 prevr = help->aut->initialState; 00298 while( bTRUE ){ 00299 (*slink) = ( Delta( help, &state_with_i, mVoidSequenceElement(help->aut->data, i), &t ).index != NO ); 00300 if( *slink ){ 00301 break; 00302 } 00303 if( state_with_i.transition != NO ){ 00304 if( succ == help->aut->transitionsTo->seq[state_with_i.transition] ){ 00305 help->aut->transitionsTo->seq[state_with_i.transition] = r; 00306 state_with_i = SuffixLink( help, &state_with_i ); 00307 continue; 00308 } 00309 else{ 00310 succ = help->aut->transitionsTo->seq[state_with_i.transition]; 00311 r = Split( help, &state_with_i ); 00312 } 00313 } 00314 else{ 00315 r = state_with_i.index; 00316 } 00317 tfst.from = r; 00318 tfst.start = i; 00319 if( (*sink) == NO ){ 00320 (*sink) = NewState( help ); 00321 help->statesLength->seq[(*sink)] = infinity; 00322 help->aut->statesEnd->seq[(*sink)] = help->aut->data->seqStored - 1; 00323 } 00324 tfst.to = (*sink); 00325 memcpy( help->symbol, mVoidSequenceElement(help->aut->data, i), help->aut->data->elementSize ); 00326 tfst.symbol = help->symbol; 00327 HashAdd( help->hash, &tfst ); 00328 if( prevr != help->aut->initialState ){ 00329 help->statesSuffixLink->seq[prevr] = r; 00330 } 00331 if( state_with_i.transition == NO && state_with_i.index == help->aut->initialState ){ 00332 return state_with_i; 00333 } 00334 prevr = r; 00335 state_with_i = SuffixLink( help, &state_with_i ); 00336 } 00337 if( prevr != help->aut->initialState ){ 00338 help->statesSuffixLink->seq[prevr] = state_with_i.index; 00339 } 00340 return Separate( help, &state_with_i, i ); 00341 } 00342 00344 UINT CDAWGAdd( CDAWGBuildHelp * help, const VoidSequence * documentDollar ){ 00345 PairState state; 00346 UINT i, sink; 00347 boolean slink; 00348 00349 i = help->aut->data->seqStored; // WTF? 00350 00351 // make sure there is a dollar at the end 00352 VoidSequenceAppend( help->aut->data, documentDollar ); 00353 00354 00355 // create_nodes root and ⊥; 00356 if( help->aut->initialState == NO ){ 00357 UINT newState; 00358 00359 newState = NewState( help ); 00360 help->statesLength->seq[newState] = 0; 00361 help->aut->initialState = newState; 00362 } 00363 state = CompressedAutomatonGetInitialPairState( help->aut ); 00364 00365 // Not doing this step? 00366 // for j := 1 to m do 00367 //create_edge (⊥, (−j, −j), root); 00368 00369 00370 sink = NO; 00371 for( ; i < help->aut->data->seqStored; i++ ){ 00372 state = Update( help, &state, i, &sink, documentDollar->seqStored, &slink ); 00373 } 00374 if( sink != NO ){ 00375 if( slink ){ 00376 help->statesSuffixLink->seq[sink] = state.index; 00377 } 00378 else{ 00379 help->statesSuffixLink->seq[sink] = help->aut->initialState; 00380 } 00381 } 00382 return sink; 00383 }