SIS
Symmetric Index Structures
|
00001 #ifndef DOCUMENTINDEXINGAUTOMATON_CPP 00002 #define DOCUMENTINDEXINGAUTOMATON_CPP 00003 00004 #include "../cppbase.hpp" 00005 #include "DocumentIndexingAutomaton.hpp" 00006 00007 namespace lmu { namespace cis { namespace sis { 00008 00009 00010 00011 00012 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00013 /* * * * * * * * * * * * Specializations * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00014 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00015 00016 template<typename T> 00017 void 00018 DocumentIndexingAutomaton<T>:: 00019 AddDocuments( const std::vector<DocumentName>& documents ) { 00020 if (DEBUG_LEVEL >= 3) std::cerr << "virtual void DocumentIndexer::AddDocuments(std::vector<std::string>& documents)" << std::endl; 00021 00022 assert( ManagedAutomatonStage_ < CLOSED ); 00023 00024 UINT total_length = 0; 00025 UINT number_of_docs_read = 0; 00026 00027 _document_names = std::vector<std::string>(documents); 00028 00029 for (auto& doc: _document_names) { 00030 number_of_docs_read++; 00031 if (DEBUG_LEVEL >= 3) std::cerr << "Adding document: " << doc << std::endl; 00032 00033 VoidSequenceAdapterDollarized v2( T::get_symbol_size() ); 00034 std::ifstream is(doc); 00035 is >> v2; 00036 00037 State sinkstate = T::Add(v2.get()); // call to template's Add() (i.e. CDAWG::Add or SCDAWG::Add)! 00038 00039 if (DEBUG_LEVEL >= 4) std::cerr << "Document " << doc << "'s sinkstate is " << sinkstate << std::endl; 00040 assert(sinkstate != NO); 00041 00042 _document_sinkstate[doc] = sinkstate; 00043 00044 if (DEBUG_LEVEL >= 4) std::cerr << doc << " -- current length = " << v2.length() - 2 << std::endl; 00045 if (DEBUG_LEVEL >= 4) std::cerr << doc << " -- absolute length = " << v2.length() - 2 + total_length << std::endl; 00046 if (DEBUG_LEVEL >= 4) std::cerr << doc << " -- actual length = " << v2.length() - 2 + total_length + number_of_docs_read*2-1<< std::endl; 00047 00048 PositionsTuple pt( 00049 v2.length(), 00050 v2.length() + total_length, 00051 v2.length() + total_length + number_of_docs_read*2-1 00052 ); 00053 00054 _document_length[doc] = pt; 00055 00056 total_length += v2.length() - 2; 00057 } 00058 00059 ManagedAutomatonStage_ = UNSORTED; 00060 } 00061 00062 template<typename T> 00063 void 00064 DocumentIndexingAutomaton<T>::Close() { 00065 if (DEBUG_LEVEL >= 4) std::cerr << "DocumentIndexingAutomaton<typename T>::Close()" << std::endl; 00066 T::Close(); 00067 ManagedAutomatonStage_ = CLOSED; 00068 } 00069 00070 template<typename T> 00071 void 00072 DocumentIndexingAutomaton<T>:: 00073 Index() { 00074 if (DEBUG_LEVEL >= 4) std::cerr << "virtual void DocumentIndexingAutomaton<T>::Index()" << std::endl; 00075 00076 assert( ManagedAutomatonStage_ > EMPTY && ManagedAutomatonStage_ < CLOSED ); 00077 00078 if ( ManagedAutomatonStage_ < SORTED ) { 00079 if (DEBUG_LEVEL >= 4) std::cerr << "----UNSORTED-----" << std::endl; 00080 T::SortTransitions(); 00081 } else { 00082 if (DEBUG_LEVEL >= 4) std::cerr << "----SORTED-----" << std::endl; 00083 } 00084 assert( ManagedAutomatonStage_ >= SORTED ); 00085 00086 if (ManagedAutomatonStage_ < INDEXED ) { 00087 State source = 0; 00088 State state = 0; 00089 00090 for (auto doc: _document_names) { 00091 state = _document_sinkstate[doc]; 00092 00093 if (DEBUG_LEVEL >= 4) std::cerr << "doc is " << doc << std::endl; 00094 if (DEBUG_LEVEL >= 4) std::cerr << "state is " << state << std::endl; 00095 if (DEBUG_LEVEL >= 4) std::cerr << "source is " << source << std::endl; 00096 00097 // documents 00098 while(state != source){ 00099 if (DEBUG_LEVEL >= 4) std::cerr << "state is " << state << std::endl; 00100 if (DEBUG_LEVEL >= 4) std::cerr << "........doc is " << doc << std::endl; 00101 _states_documents[state].push_back(doc); 00102 state = suffixLink(state); 00103 assert(state != NO); 00104 if (state == NO) { 00105 std::cerr << "WARN state was NO" << std::endl; 00106 state = source; 00107 } 00108 } 00109 } 00110 } 00111 ManagedAutomatonStage_ = INDEXED; 00112 } 00113 00114 00115 00116 00117 template<typename T> 00118 DocumentIndexingAutomatonFindResults&& 00119 DocumentIndexingAutomaton < T > :: 00120 findall( const std::string& w ) { 00121 // 2. find all occurrences of a given string W 00122 if (DEBUG_LEVEL >= 4) std::cerr << "virtual unsigned int DocumentIndexingAutomaton<InenagaCDAWGAdapter>::findall(const std::string& w)" << std::endl; 00123 00124 assert( ManagedAutomatonStage_ < CLOSED ); 00125 00126 if ( ManagedAutomatonStage_ < SORTED ) { 00127 SortTransitions(); 00128 } 00129 00130 if ( ManagedAutomatonStage_ < INDEXED) { 00131 Index(); 00132 } 00133 00134 VoidSequenceAdapter v(w.c_str()); 00135 00136 PairState pairState = GetInitialPairState(); 00137 PairState nextPairState; 00138 for( size_t i = 0; i < v.length(); i++ ){ 00139 nextPairState = Delta( &pairState, v.at(i) ); 00140 if( nextPairState.index == NO ){ 00141 if (DEBUG_LEVEL >= 2) std::cerr << "no occurences" << std::endl; 00142 return std::forward<DocumentIndexingAutomatonFindResults>(DocumentIndexingAutomatonFindResults(this)); // empty results 00143 } else { 00144 if (DEBUG_LEVEL >= 5) std::cerr << "State might be implicit or explicit find() will set it to the canonical representative" << std::endl; 00145 } 00146 00147 pairState = nextPairState; 00148 } 00149 00150 //TODO set assertions? 00151 return std::forward<DocumentIndexingAutomatonFindResults>( 00152 find( pairState, v.length() ) 00153 ); 00154 } 00155 00156 00157 00158 template<typename T> 00159 DocumentIndexingAutomatonFindResults&& 00160 DocumentIndexingAutomaton<T>:: 00161 find( const PairState state, UINT length ) { 00162 if (DEBUG_LEVEL >= 5) std::cerr << "virtual void find( PairState state, int length = " << length << " )" << std::endl; 00163 if (DEBUG_LEVEL >= 5) std::cerr << "PairState's index is " << state.index << " length is " << length << std::endl; 00164 00165 //3. the find method 00166 UINT s; 00167 if(state.transition != NO){ // check excplicit-ness of the state: if state is not explicit yet, "read" the "rest" (i.e. find the canonical representative) 00168 if (DEBUG_LEVEL >= 5) std::cerr << "IMPLICIT" << std::endl; 00169 length += automaton()->statesEnd->seq[automaton()->transitionsTo->seq[state.transition]] - state.index + 1; // state.transition is state-ID 00170 s = automaton()->transitionsTo->seq[state.transition]; 00171 } else { 00172 if (DEBUG_LEVEL >= 5) std::cerr << "EXPLICIT" << std::endl; 00173 s = state.index; 00174 } 00175 return std::forward<DocumentIndexingAutomatonFindResults>( 00176 findRec(s, length) 00177 ); 00178 } 00179 00180 template<typename T> 00181 DocumentIndexingAutomatonFindResults&& 00182 DocumentIndexingAutomaton<T>:: 00183 findRec( const UINT s, const UINT length ) { // 4. the recursive method 00184 if (DEBUG_LEVEL >= 4) std::cerr << "virtual void findRec(int s = " << s << ", int length = " << length << ")" << std::endl; 00185 00186 UINT i, position, abs_pos; 00187 // static DocumentIndexingAutomatonFindResults<T> results_acc(this); 00188 static DocumentIndexingAutomatonFindResults results_acc(this); 00189 00190 for (auto doc: _states_documents[s]) { 00191 if (DEBUG_LEVEL >= 4) std::cout << "_document_length[ " << doc << "] = " << std::get<0>(_document_length[doc]) << std::endl; 00192 DocumentPosition current_position = std::get<0>(_document_length[doc]) - length; 00193 DocumentPosition absolute_position = std::get<1>(_document_length[doc]) - length; 00194 DocumentPosition actual_position = std::get<2>(_document_length[doc]) - length; 00195 00196 PositionsTuple pt ( 00197 current_position, 00198 absolute_position, 00199 actual_position 00200 ); 00201 std::cout << "INSERTING " << doc << " ++ " << current_position << " ++ " << absolute_position << " ++ " << actual_position << std::endl; 00202 results_acc.insert_position(doc, pt); 00203 } 00204 int tr; 00205 if (DEBUG_LEVEL >= 4) std::cout << " --> State " << s << " number of transitions = " << number_of_transitions(s) << std::endl; 00206 00207 for(UINT i = 0; i < automaton()->statesNumberOfTransitions->seq[s]; i++) { 00208 State to = automaton()->transitionsTo->seq[s]; 00209 if (DEBUG_LEVEL >= 4) std::cout << " --> State " << s << " transition to " << to << std::endl; 00210 } 00211 00212 for(UINT i = 0; i < automaton()->statesNumberOfTransitions->seq[s]; i++) { 00213 tr = automaton()->statesTransitions->seq[s] + i; 00214 findRec(automaton()->transitionsTo->seq[tr], length + automaton()->statesEnd->seq[automaton()->transitionsTo->seq[tr]] - automaton()->transitionsStart->seq[tr] + 1); 00215 } 00216 00217 return std::forward<DocumentIndexingAutomatonFindResults>(results_acc); 00218 } 00219 00220 template<typename T> 00221 UINT 00222 DocumentIndexingAutomaton<T>:: 00223 suffixLink( const UINT& state ) const { 00224 return std::forward<UINT>(T::suffixLink(state)); 00225 } 00226 00227 // template<typename T> 00228 // std::string DocumentIndexingAutomaton<T>::data_at(UINT pos) const { 00229 // char * ptr = (S8*)mVoidSequenceElement( T::automaton()->data, pos ); 00230 // return std::string( 00231 // ptr, 1 00232 // ); 00233 // } 00234 00235 // template<typename T> 00236 // std::string DocumentIndexingAutomaton<T>::lrcont(const UINT pos, const UINT width) const { 00237 // std::string res; 00238 // 00239 // mSymbolAndVariables( zero ) 00240 // mSymbolAndVariables( one ) 00241 // mSymbolAssignValue( zero, 0, T::get_symbol_size() ) 00242 // mSymbolAssignValue( one, 1, T::get_symbol_size() ) 00243 // 00244 // if (width > pos) { 00245 // for (auto i = width - pos; i > 0; i--) 00246 // res.append( data_at(i) ); 00247 // } 00248 // for (auto i = width > pos ? 0 : pos - width; i < pos+width; i++) { // make sure the unsigned ints start from 0. 00249 // std::string s = data_at(i); 00250 // if ( (memcmp(zero, s.data(), 1) == 0) ) { 00251 // std::cout << "HIT LEFT BORDER." << std::endl; 00252 // i = 0; 00253 // } 00254 // if ( (memcmp(one, s.data(), 1) == 0) ) { 00255 // std::cout << "HIT RIGHT BORDER." << std::endl; 00256 // i = pos+width; // break. 00257 // } 00258 // res.append( s ); 00259 // } 00260 // return std::forward<std::string>(res); 00261 // } 00262 00263 }}} /* End of namespace lmu::cis::sis */ 00264 00265 #endif /* end of include guard: DOCUMENTINDEXINGAUTOMATON_CPP */