SIS
Symmetric Index Structures
|
00001 #include "../base.h" 00002 00003 void CmdBuildSuffixTree( const S8 * fileNameInput, const S8 * fileNameOutput, UINT bitsPerSymbol, UINT encoding, boolean tarjanTable ){ 00004 FILE * fpInput; 00005 FILE * fpOutput; 00006 VoidSequence * line = NULL; 00007 CompressedAutomaton * aut; 00008 SuffixTreeBuildHelp * help; 00009 UINT i, nLine, symbolSize; 00010 mSymbolAndVariables( zero ) 00011 00012 mValidateBitsPerSymbol( bitsPerSymbol ) 00013 symbolSize = bitsPerSymbol/8; 00014 mSymbolAssignValue( zero, 0, symbolSize ) 00015 aut = CompressedAutomatonInit( symbolSize ); 00016 help = SuffixTreeBuildHelpInit( aut ); 00017 fpInput = Fopen( fileNameInput, "rb" ); 00018 nLine = 0; 00019 while( bTRUE ){ 00020 if( line != NULL ){ 00021 VoidSequenceFree( line ); 00022 } 00023 line = VoidSequenceReadLine( fpInput, symbolSize, encoding ); 00024 if( line == NULL ){ 00025 break; 00026 } 00027 nLine++; 00028 for( i = 0; i < line->seqStored; i++ ){ 00029 if( SymbolToUINT( mVoidSequenceElement(line, i), symbolSize ) == 0 ){ 00030 S8 msg[MAX_INPUT_STRING_SIZE]; 00031 00032 sprintf( msg, "Cannot process line %llu - 0 is reserved as a new symbol.", (U64)(nLine) ); 00033 Throw( msg ); 00034 } 00035 } 00036 VoidSequenceAdd( line, zero ); 00037 SuffixTreeAdd( help, line ); 00038 } 00039 Fclose( fpInput ); 00040 CompressedAutomatonSortTransitions( aut, help->transitionsFrom ); 00041 CompressedAutomatonShrink( aut ); 00042 SuffixTreeBuildHelpFree( help ); 00043 if( tarjanTable ){ 00044 CompressedAutomatonAddTarjanTable( aut ); 00045 } 00046 fpOutput = Fopen( fileNameOutput, "wb" ); 00047 CompressedAutomatonWrite( aut, fpOutput ); 00048 Fclose( fpOutput ); 00049 CompressedAutomatonFree( aut ); 00050 } 00051 00052 void CmdBuildCDAWG( const S8 * fileNameInput, const S8 * fileNameOutput, UINT bitsPerSymbol, UINT encoding, boolean tarjanTable ){ 00053 FILE * fpInput; 00054 FILE * fpOutput; 00055 VoidSequence * line = NULL; 00056 CompressedAutomaton * aut; 00057 CDAWGBuildHelp * help; 00058 UINT i, nLine, symbolSize; 00059 mSymbolAndVariables( zero ) 00060 00061 mValidateBitsPerSymbol( bitsPerSymbol ) 00062 symbolSize = bitsPerSymbol/8; 00063 mSymbolAssignValue( zero, 0, symbolSize ) 00064 aut = CompressedAutomatonInit( symbolSize ); 00065 help = CDAWGBuildHelpInit( aut ); 00066 fpInput = Fopen( fileNameInput, "rb" ); 00067 nLine = 0; 00068 while( bTRUE ){ 00069 if( line != NULL ){ 00070 VoidSequenceFree( line ); 00071 } 00072 line = VoidSequenceReadLine( fpInput, symbolSize, encoding ); 00073 if( line == NULL ){ 00074 break; 00075 } 00076 nLine++; 00077 for( i = 0; i < line->seqStored; i++ ){ 00078 if( SymbolToUINT( mVoidSequenceElement(line, i), symbolSize ) == 0 ){ 00079 S8 msg[MAX_INPUT_STRING_SIZE]; 00080 00081 sprintf( msg, "Cannot process line %llu - 0 is reserved as a new symbol.", (U64)(nLine) ); 00082 Throw( msg ); 00083 } 00084 } 00085 VoidSequenceAdd( line, zero ); 00086 CDAWGAdd( help, line ); 00087 } 00088 Fclose( fpInput ); 00089 CompressedAutomatonSortTransitions( aut, help->transitionsFrom ); 00090 CompressedAutomatonShrink( aut ); 00091 CDAWGBuildHelpFree( help ); 00092 if( tarjanTable ){ 00093 CompressedAutomatonAddTarjanTable( aut ); 00094 } 00095 fpOutput = Fopen( fileNameOutput, "wb" ); 00096 CompressedAutomatonWrite( aut, fpOutput ); 00097 Fclose( fpOutput ); 00098 CompressedAutomatonFree( aut ); 00099 } 00100 00101 void CmdBuildSCDAWG( const S8 * fileNameInput, const S8 * fileNameOutput, UINT bitsPerSymbol, UINT encoding, boolean tarjanTable ){ 00102 FILE * fpInput; 00103 FILE * fpOutput; 00104 VoidSequence * line = NULL; 00105 VoidSequence * sharpDocumentDollar = NULL; 00106 SCDAWG * aut; 00107 SCDAWGBuildHelp * help; 00108 UINT i, n, nLine, symbolSize; 00109 mSymbolAndVariables( zero ) 00110 mSymbolAndVariables( one ) 00111 00112 mValidateBitsPerSymbol( bitsPerSymbol ) 00113 symbolSize = bitsPerSymbol/8; 00114 mSymbolAssignValue( zero, 0, symbolSize ) 00115 mSymbolAssignValue( one, 1, symbolSize ) 00116 aut = SCDAWGInit( symbolSize ); 00117 help = SCDAWGBuildHelpInit( aut ); 00118 fpInput = Fopen( fileNameInput, "rb" ); 00119 nLine = 0; 00120 while( bTRUE ){ 00121 if( line != NULL ){ 00122 VoidSequenceFree( line ); 00123 } 00124 line = VoidSequenceReadLine( fpInput, symbolSize, encoding ); 00125 if( line == NULL ){ 00126 break; 00127 } 00128 nLine++; 00129 for( i = 0; i < line->seqStored; i++ ){ 00130 n = SymbolToUINT( mVoidSequenceElement(line, i), symbolSize ); 00131 if( n == 0 || n == 1 ){ 00132 S8 msg[MAX_INPUT_STRING_SIZE]; 00133 00134 sprintf( msg, "Cannot process line %llu - 0 and 1 are reserved as new symbols. (pos is %d, sign is %d)", (U64)(nLine), i, n ); 00135 Throw( msg ); 00136 } 00137 } 00138 sharpDocumentDollar = VoidSequenceInit2( symbolSize, line->seqStored + 2, 1 ); 00139 VoidSequenceAdd( sharpDocumentDollar, one ); 00140 VoidSequenceAppend( sharpDocumentDollar, line ); 00141 VoidSequenceAdd( sharpDocumentDollar, zero ); 00142 SCDAWGAdd( help, sharpDocumentDollar ); 00143 VoidSequenceFree( sharpDocumentDollar ); 00144 } 00145 Fclose( fpInput ); 00146 SCDAWGClose( help ); 00147 SCDAWGShrink( aut ); 00148 SCDAWGBuildHelpFree( help ); 00149 if( tarjanTable ){ 00150 SCDAWGAddTarjanTable( aut ); 00151 } 00152 fpOutput = Fopen( fileNameOutput, "wb" ); 00153 SCDAWGWrite( aut, fpOutput ); 00154 Fclose( fpOutput ); 00155 SCDAWGFree( aut ); 00156 }