SIS
Symmetric Index Structures
/Users/dbr/ma/src/bas/lml/cmd/cmdGenerateSuffixes.c
Go to the documentation of this file.
00001 #include "../base.h"
00002 
00003 typedef struct tCntStarts{
00004     VoidSequence * cnt;
00005     UINTSequence * starts;
00006     void * zero;
00007 } CntStarts;
00008 
00009 static SINT Cmp( void * ptr, UINT s1, UINT s2 ){
00010     CntStarts * cntStarts;
00011     void * str1;
00012     void * str2;
00013     UINT symbolSize;
00014 
00015     cntStarts = (CntStarts *)(ptr);
00016     symbolSize = cntStarts->cnt->elementSize;
00017     str1 = mVoidSequenceElement( cntStarts->cnt, cntStarts->starts->seq[s1] );
00018     str2 = mVoidSequenceElement( cntStarts->cnt, cntStarts->starts->seq[s2] );
00019     while( CmpSymbols( str1, str2, symbolSize ) == 0 && SymbolToUINT( str1, symbolSize ) != 0 ){
00020         str1 = str1 + symbolSize;
00021         str2 = str2 + symbolSize;
00022     }
00023     return CmpSymbols( str1, str2, symbolSize );
00024 }
00025 
00026 static void Swap( void * ptr, UINT s1, UINT s2 ){
00027     CntStarts * cntStarts;
00028     UINT tmp;
00029 
00030     cntStarts = (CntStarts *)(ptr);
00031     mSwapVariables( cntStarts->starts->seq[s1], cntStarts->starts->seq[s2], tmp );
00032 }
00033 
00034 static void Print( CntStarts * cntStarts, UINT s, FILE * fp, UINT encoding ){
00035     UINT symbolSize;
00036     void * str;
00037 
00038     symbolSize = cntStarts->cnt->elementSize;
00039     str = mVoidSequenceElement( cntStarts->cnt, cntStarts->starts->seq[s] );
00040     while( SymbolToUINT( str, symbolSize ) != 0 ){
00041         PrintSymbol( str, fp, symbolSize, encoding );
00042         str = str + symbolSize;
00043     }
00044     PrintLine( fp, symbolSize, encoding );
00045 }
00046 
00047 void CmdGenerateSuffixes( const S8 * fileNameInput, const S8 * fileNameOutput, UINT bitsPerSymbol, UINT encoding ){
00048     FILE * fpInput;
00049     FILE * fpOutput;
00050     VoidSequence * line = NULL;
00051     VoidSequence * cnt;
00052     UINTSequence * starts;
00053     CntStarts cntStarts;
00054     UINT i, nLine, symbolSize;
00055     mSymbolAndVariables( zero )
00056 
00057     mValidateBitsPerSymbol( bitsPerSymbol )
00058     symbolSize = bitsPerSymbol/8;
00059     mSymbolAssignValue( zero, 0, symbolSize )
00060     cnt = VoidSequenceInit( symbolSize );
00061     fpInput = Fopen( fileNameInput, "rb" );
00062     nLine = 0;
00063     while( bTRUE ){
00064         if( line != NULL ){
00065             VoidSequenceFree( line );
00066         }
00067         line = VoidSequenceReadLine( fpInput, symbolSize, encoding );
00068         if( line == NULL ){
00069             break;
00070         }
00071         nLine++;
00072         for( i = 0; i < line->seqStored; i++ ){
00073             if( SymbolToUINT( mVoidSequenceElement(line, i), symbolSize ) == 0 ){
00074                 S8 msg[MAX_INPUT_STRING_SIZE];
00075 
00076                 sprintf( msg, "Cannot process line %llu - 0 is reserved as a new symbol.", (U64)(nLine) );
00077                 Throw( msg );
00078             }
00079         }
00080         VoidSequenceAdd( line, zero );
00081         VoidSequenceAppend( cnt, line );
00082     }
00083     Fclose( fpInput );
00084     starts = UINTSequenceInit2( cnt->seqStored, 1 );
00085     for( i = 0; i < cnt->seqStored; i++ ){
00086         UINTSequenceAdd( starts, i );
00087     }
00088     cntStarts.cnt = cnt;
00089     cntStarts.starts = starts;
00090     cntStarts.zero = zero;
00091     Sort( &cntStarts, starts->seqStored, Cmp, Swap );
00092     fpOutput = Fopen( fileNameOutput, "wb" );
00093     if( starts->seqStored > 0 ){
00094         Print( &cntStarts, 0, fpOutput, encoding );
00095         for( i = 1; i < starts->seqStored; i++ ){
00096             if( Cmp( &cntStarts, i-1, i ) != 0 ){
00097                 Print( &cntStarts, i, fpOutput, encoding );
00098             }
00099         }
00100     }
00101     Fclose( fpOutput );
00102     VoidSequenceFree( cnt );
00103     UINTSequenceFree( starts );
00104 }