SIS
Symmetric Index Structures
/Users/dbr/ma/src/lmu/cis/sis/indexer/DocumentIndexingAutomaton.cpp
Go to the documentation of this file.
00001 #ifndef DOCUMENTINDEXINGAUTOMATON_CPP
00002 #define DOCUMENTINDEXINGAUTOMATON_CPP
00003 
00004 #include "../cppbase.hpp"
00005 #include "DocumentIndexingAutomaton.hpp"
00006 
00007 namespace lmu { namespace cis { namespace sis {
00008 
00009 
00010 
00011 
00012 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00013 /* * * * * * * * * * * * Specializations * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00014 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00015 
00016 template<typename T>
00017 void
00018 DocumentIndexingAutomaton<T>::
00019 AddDocuments( const std::vector<DocumentName>& documents ) {
00020     if (DEBUG_LEVEL >= 3) std::cerr << "virtual void DocumentIndexer::AddDocuments(std::vector<std::string>& documents)" << std::endl;
00021 
00022     assert( ManagedAutomatonStage_ < CLOSED );
00023 
00024     UINT total_length = 0;
00025     UINT number_of_docs_read = 0;
00026 
00027     _document_names = std::vector<std::string>(documents);
00028 
00029     for (auto& doc: _document_names) {
00030         number_of_docs_read++;
00031         if (DEBUG_LEVEL >= 3) std::cerr << "Adding document: " << doc << std::endl;
00032 
00033         VoidSequenceAdapterDollarized v2( T::get_symbol_size() );
00034         std::ifstream is(doc);
00035         is >> v2;
00036 
00037         State sinkstate = T::Add(v2.get()); // call to template's Add() (i.e. CDAWG::Add or SCDAWG::Add)!
00038 
00039         if (DEBUG_LEVEL >= 4) std::cerr << "Document " << doc << "'s sinkstate is " << sinkstate << std::endl;
00040         assert(sinkstate != NO);
00041 
00042         _document_sinkstate[doc] = sinkstate;
00043 
00044         if (DEBUG_LEVEL >= 4) std::cerr << doc << " -- current  length = " << v2.length() - 2                                         << std::endl;
00045         if (DEBUG_LEVEL >= 4) std::cerr << doc << " -- absolute length = " << v2.length() - 2 + total_length                          << std::endl;
00046         if (DEBUG_LEVEL >= 4) std::cerr << doc << " -- actual   length = " << v2.length() - 2 + total_length + number_of_docs_read*2-1<< std::endl;
00047 
00048         PositionsTuple pt(
00049             v2.length(),
00050             v2.length() + total_length,
00051             v2.length() + total_length + number_of_docs_read*2-1
00052         );
00053 
00054         _document_length[doc] = pt;
00055 
00056         total_length += v2.length() - 2;
00057     }
00058 
00059     ManagedAutomatonStage_ = UNSORTED;
00060 }
00061 
00062 template<typename T>
00063 void
00064 DocumentIndexingAutomaton<T>::Close() {
00065     if (DEBUG_LEVEL >= 4) std::cerr << "DocumentIndexingAutomaton<typename T>::Close()" << std::endl;
00066     T::Close();
00067     ManagedAutomatonStage_ = CLOSED;
00068 }
00069 
00070 template<typename T>
00071 void
00072 DocumentIndexingAutomaton<T>::
00073 Index() {
00074     if (DEBUG_LEVEL >= 4) std::cerr << "virtual void DocumentIndexingAutomaton<T>::Index()" << std::endl;
00075 
00076     assert( ManagedAutomatonStage_ > EMPTY && ManagedAutomatonStage_ < CLOSED );
00077 
00078     if ( ManagedAutomatonStage_ < SORTED ) {
00079         if (DEBUG_LEVEL >= 4) std::cerr << "----UNSORTED-----" << std::endl;
00080         T::SortTransitions();
00081     } else {
00082         if (DEBUG_LEVEL >= 4) std::cerr << "----SORTED-----" << std::endl;
00083     }
00084     assert( ManagedAutomatonStage_ >= SORTED );
00085 
00086     if (ManagedAutomatonStage_ < INDEXED ) {
00087         State source = 0;
00088         State state  = 0;
00089 
00090         for (auto doc: _document_names) {
00091             state = _document_sinkstate[doc];
00092 
00093             if (DEBUG_LEVEL >= 4) std::cerr << "doc is " << doc << std::endl;
00094             if (DEBUG_LEVEL >= 4) std::cerr << "state is " << state << std::endl;
00095             if (DEBUG_LEVEL >= 4) std::cerr << "source is " << source << std::endl;
00096 
00097             // documents
00098             while(state != source){
00099                 if (DEBUG_LEVEL >= 4) std::cerr << "state is " << state << std::endl;
00100                 if (DEBUG_LEVEL >= 4) std::cerr << "........doc is " << doc << std::endl;
00101                 _states_documents[state].push_back(doc);
00102                 state = suffixLink(state);
00103                 assert(state != NO);
00104                 if (state == NO) {
00105                     std::cerr << "WARN state was NO" << std::endl;
00106                     state = source;
00107                 }
00108             }
00109         }
00110     }
00111     ManagedAutomatonStage_ = INDEXED;
00112 }
00113 
00114 
00115 
00116 
00117 template<typename T>
00118 DocumentIndexingAutomatonFindResults&&
00119 DocumentIndexingAutomaton < T > ::
00120 findall( const std::string& w ) {
00121     // 2. find all occurrences of a given string W
00122     if (DEBUG_LEVEL >= 4) std::cerr << "virtual unsigned int DocumentIndexingAutomaton<InenagaCDAWGAdapter>::findall(const std::string& w)" << std::endl;
00123 
00124     assert( ManagedAutomatonStage_ < CLOSED );
00125 
00126     if ( ManagedAutomatonStage_ < SORTED ) {
00127         SortTransitions();
00128     }
00129 
00130     if ( ManagedAutomatonStage_ < INDEXED) {
00131         Index();
00132     }
00133 
00134     VoidSequenceAdapter v(w.c_str());
00135 
00136     PairState pairState = GetInitialPairState();
00137     PairState nextPairState;
00138     for( size_t i = 0; i < v.length(); i++ ){
00139         nextPairState = Delta( &pairState, v.at(i) );
00140         if( nextPairState.index == NO ){
00141             if (DEBUG_LEVEL >= 2) std::cerr << "no occurences" << std::endl;
00142             return std::forward<DocumentIndexingAutomatonFindResults>(DocumentIndexingAutomatonFindResults(this)); // empty results
00143         } else {
00144             if (DEBUG_LEVEL >= 5) std::cerr << "State might be implicit or explicit find() will set it to the canonical representative" << std::endl;
00145         }
00146 
00147         pairState = nextPairState;
00148     }
00149 
00150     //TODO set assertions?
00151     return std::forward<DocumentIndexingAutomatonFindResults>(
00152         find( pairState, v.length() )
00153     );
00154 }
00155 
00156 
00157 
00158 template<typename T>
00159 DocumentIndexingAutomatonFindResults&&
00160 DocumentIndexingAutomaton<T>::
00161 find( const PairState state, UINT length ) {
00162     if (DEBUG_LEVEL >= 5) std::cerr << "virtual void find( PairState state, int length = " << length << " )" << std::endl;
00163     if (DEBUG_LEVEL >= 5) std::cerr << "PairState's index is "  << state.index  << " length is " << length << std::endl;
00164 
00165     //3. the find method
00166     UINT s;
00167     if(state.transition != NO){ // check excplicit-ness of the state: if state is not explicit yet, "read" the "rest" (i.e. find the canonical representative)
00168         if (DEBUG_LEVEL >= 5) std::cerr << "IMPLICIT" << std::endl;
00169         length += automaton()->statesEnd->seq[automaton()->transitionsTo->seq[state.transition]] - state.index + 1; // state.transition is state-ID
00170         s = automaton()->transitionsTo->seq[state.transition];
00171     } else {
00172         if (DEBUG_LEVEL >= 5) std::cerr << "EXPLICIT" << std::endl;
00173         s = state.index;
00174     }
00175     return std::forward<DocumentIndexingAutomatonFindResults>(
00176         findRec(s, length)
00177     );
00178 }
00179 
00180 template<typename T>
00181 DocumentIndexingAutomatonFindResults&&
00182 DocumentIndexingAutomaton<T>::
00183 findRec( const UINT s, const UINT length ) { // 4. the recursive method
00184     if (DEBUG_LEVEL >= 4) std::cerr << "virtual void findRec(int s = " << s << ", int length = " << length << ")" << std::endl;
00185 
00186     UINT i, position, abs_pos;
00187     // static DocumentIndexingAutomatonFindResults<T> results_acc(this);
00188     static DocumentIndexingAutomatonFindResults results_acc(this);
00189 
00190     for (auto doc: _states_documents[s]) {
00191         if (DEBUG_LEVEL >= 4) std::cout << "_document_length[ " << doc << "] = " << std::get<0>(_document_length[doc]) << std::endl;
00192         DocumentPosition current_position  = std::get<0>(_document_length[doc]) - length;
00193         DocumentPosition absolute_position = std::get<1>(_document_length[doc]) - length;
00194         DocumentPosition actual_position   = std::get<2>(_document_length[doc]) - length;
00195 
00196         PositionsTuple pt (
00197             current_position,
00198             absolute_position,
00199             actual_position
00200         );
00201         std::cout << "INSERTING " << doc <<  " ++  " << current_position <<  " ++  " << absolute_position << " ++  " << actual_position << std::endl;
00202         results_acc.insert_position(doc, pt);
00203     }
00204     int tr;
00205     if (DEBUG_LEVEL >= 4) std::cout << "  --> State " << s << " number of transitions = " << number_of_transitions(s) << std::endl;
00206 
00207     for(UINT i = 0; i < automaton()->statesNumberOfTransitions->seq[s]; i++) {
00208         State to = automaton()->transitionsTo->seq[s];
00209         if (DEBUG_LEVEL >= 4) std::cout << "  --> State " << s << " transition to " << to << std::endl;
00210     }
00211 
00212     for(UINT i = 0; i < automaton()->statesNumberOfTransitions->seq[s]; i++) {
00213         tr = automaton()->statesTransitions->seq[s] + i;
00214         findRec(automaton()->transitionsTo->seq[tr], length + automaton()->statesEnd->seq[automaton()->transitionsTo->seq[tr]] - automaton()->transitionsStart->seq[tr] + 1);
00215     }
00216 
00217     return std::forward<DocumentIndexingAutomatonFindResults>(results_acc);
00218 }
00219 
00220 template<typename T>
00221 UINT
00222 DocumentIndexingAutomaton<T>::
00223 suffixLink( const UINT& state ) const {
00224     return std::forward<UINT>(T::suffixLink(state));
00225 }
00226 
00227 // template<typename T>
00228 // std::string DocumentIndexingAutomaton<T>::data_at(UINT pos) const {
00229 //     char * ptr = (S8*)mVoidSequenceElement( T::automaton()->data, pos );
00230 //     return std::string(
00231 //         ptr, 1
00232 //     );
00233 // }
00234 
00235 // template<typename T>
00236 // std::string DocumentIndexingAutomaton<T>::lrcont(const UINT pos, const UINT width) const {
00237 //     std::string res;
00238 // 
00239 //     mSymbolAndVariables( zero )
00240 //     mSymbolAndVariables( one )
00241 //     mSymbolAssignValue( zero, 0, T::get_symbol_size() )
00242 //     mSymbolAssignValue( one, 1, T::get_symbol_size() )
00243 // 
00244 //     if (width > pos) {
00245 //         for (auto i = width - pos; i > 0; i--)
00246 //             res.append( data_at(i) );
00247 //     }
00248 //     for (auto i = width > pos ? 0 : pos - width; i < pos+width; i++) { // make sure the unsigned ints start from 0.
00249 //         std::string s = data_at(i);
00250 //         if ( (memcmp(zero, s.data(), 1) == 0) ) {
00251 //             std::cout << "HIT LEFT BORDER." << std::endl;
00252 //             i = 0;
00253 //         }
00254 //         if ( (memcmp(one, s.data(), 1) == 0) ) {
00255 //             std::cout << "HIT RIGHT BORDER." << std::endl;
00256 //             i = pos+width; // break.
00257 //         }
00258 //         res.append( s );
00259 //     }
00260 //     return std::forward<std::string>(res);
00261 // }
00262 
00263 }}} /* End of namespace lmu::cis::sis */
00264 
00265 #endif /* end of include guard: DOCUMENTINDEXINGAUTOMATON_CPP */