SIS
Symmetric Index Structures
|
00001 #ifndef DOCUMENTINDEXINGAUTOMATONFINDRESULTS_HPP 00002 #define DOCUMENTINDEXINGAUTOMATONFINDRESULTS_HPP 00003 00004 #include "../cppbase.hpp" 00005 #include "../adapter/CompressedAutomatonAdapter.hpp" 00006 // #include "DocumentIndexingAutomaton.hpp" 00007 00008 // using lmu::cis::sis::DocumentIndexingAutomaton; 00009 #include <iostream> 00010 #include <set> 00011 #include <map> 00012 #include <vector> 00013 #include <string> 00014 #include <deque> 00015 #include <tuple> 00016 #include <algorithm> 00017 #include <list> 00018 #include <regex> 00019 00020 namespace lmu { namespace cis { namespace sis { 00021 00026 typedef std::string DocumentName; 00027 // typedef UINT RelativePosition; 00028 // typedef UINT AbsolutePosition; 00029 typedef std::tuple< 00030 DocumentName, 00031 RelativeDocumentPosition, 00032 AbsoluteDocumentPosition, 00033 ActualDocumentPosition> ResultsTuple; 00034 typedef std::list<ResultsTuple> ResultsList; 00035 // typedef std::pair<DocumentName, PositionsTupleContainer> ResultsMapIterator; 00037 00038 00054 // template<typename T> 00055 class DocumentIndexingAutomatonFindResults { 00056 protected: 00057 ResultsList results_; 00058 ResultsList::const_iterator curr_it_; 00059 const CompressedAutomatonAdapter& aut_; 00060 public: 00061 00062 // DocumentIndexingAutomatonFindResults() = default; ///< Set to default 00063 DocumentIndexingAutomatonFindResults(CompressedAutomatonAdapter* automaton_) 00064 : results_() 00065 , curr_it_() 00066 , aut_(*automaton_) 00067 { 00068 } 00069 virtual ~DocumentIndexingAutomatonFindResults() = default; 00070 00071 // Methods. 00072 virtual 00073 DocumentIndexingAutomatonFindResults& 00074 insert_position(const DocumentName& doc, const PositionsTuple& p) { 00075 results_.push_back( std::tuple_cat(std::make_tuple(doc), p) ); 00076 return *this; 00077 } 00078 00079 virtual inline ResultsList::const_iterator begin() { results_.sort(); curr_it_ = results_.cbegin(); return curr_it_; } 00080 virtual inline ResultsList::const_iterator end() { curr_it_ = results_.cend(); return curr_it_; } 00081 virtual inline ResultsList::const_iterator cbegin() { return begin(); } 00082 virtual inline ResultsList::const_iterator cend() { return end(); } 00083 virtual inline size_t size() const { return results_.size(); } 00084 virtual inline bool empty() const { return size() == 0; } 00085 virtual inline DocumentName document_name(ResultsList::const_iterator it) { return std::get<0>(*it); } 00086 virtual inline DocumentPosition relative_pos(ResultsList::const_iterator it) { return std::get<1>(*it); } 00087 virtual inline DocumentPosition absolute_pos(ResultsList::const_iterator it) { return std::get<2>(*it); } 00088 virtual inline DocumentPosition actual_pos (ResultsList::const_iterator it) { return std::get<3>(*it); } 00089 virtual inline std::string lr_context(ResultsList::const_iterator it, UINT width = 20) { 00090 std::string res; 00091 UINT pos = actual_pos(it); 00092 00093 mSymbolAndVariables( zero ) 00094 mSymbolAndVariables( one ) 00095 mSymbolAssignValue( zero, 0, aut_.get_symbol_size() ) 00096 mSymbolAssignValue( one, 1, aut_.get_symbol_size() ) 00097 00098 if (width > pos) { 00099 for (auto i = width - pos; i > 0; i--) 00100 res.append( aut_.data_at(i) ); 00101 } 00102 for (auto i = width > pos ? 0 : pos - width; i < pos+width; i++) { // make sure the unsigned ints start from 0. 00103 std::string s = aut_.data_at(i); 00104 if ( (memcmp(zero, s.data(), 1) == 0) ) { 00105 std::cout << "HIT LEFT BORDER." << std::endl; 00106 i = 0; 00107 } 00108 if ( (memcmp(one, s.data(), 1) == 0) ) { 00109 std::cout << "HIT RIGHT BORDER." << std::endl; 00110 i = pos+width; // break. 00111 } 00112 res.append( s ); 00113 } 00114 return std::forward<std::string>(res); 00115 } 00116 00117 00118 virtual inline DocumentName document_name(ResultsTuple it) { return std::get<0>(it); } 00119 virtual inline DocumentPosition relative_pos(ResultsTuple it) { return std::get<1>(it); } 00120 virtual inline DocumentPosition absolute_pos(ResultsTuple it) { return std::get<2>(it); } 00121 virtual inline DocumentPosition actual_pos (ResultsTuple it) { return std::get<3>(it); } 00122 virtual inline std::string lr_context(ResultsTuple it, UINT width = 20) { 00123 // a lot of alignment happens here. for now there are small off-by-one's in 00124 // there but i don't care. 00125 std::string res; 00126 00127 // get the actual position in the long data string 00128 UINT pos = actual_pos(it); 00129 00130 // get those # and $ thingies to test on them 00131 mSymbolAndVariables( zero ) 00132 mSymbolAndVariables( one ) 00133 mSymbolAssignValue( zero, 0, aut_.get_symbol_size() ) 00134 mSymbolAssignValue( one, 1, aut_.get_symbol_size() ) 00135 00136 // declare our maximum boundaries to the left and right 00137 UINT max_left = 0; 00138 UINT max_right = 0; 00139 00140 if (DEBUG_LEVEL >= 5) std::cerr << "pos: " << pos << std::endl; 00141 00142 // find the left boundary: either '#' or width chars away from pos 00143 for (auto i = pos; i != 0 && i > pos-width-1; i--) { 00144 if (memcmp(zero, aut_.data_at(i).data(), 1) == 0) break; 00145 max_left = i+1; 00146 } 00147 if (max_left == 0) max_left = 1; // dunno. just set it back. 00148 00149 // find the right border 00150 for (auto i = pos; i <= (pos + width) && i <= aut_.data_length(); i++) { 00151 if (memcmp(one, aut_.data_at(i).data(), 1) == 0) break; 00152 max_right = i; 00153 } 00154 if (! max_right == aut_.data_length() ) max_right++; 00155 00156 // printf debugging... 00157 if (DEBUG_LEVEL >= 5) std::cerr << "left: " << max_left << std::endl; 00158 if (DEBUG_LEVEL >= 5) std::cerr << "right: " << max_right << std::endl; 00159 00160 // insert some blanks or whatever to get a nicely layed out concordance view 00161 if ( width > (pos-max_left) ) res.append( (width - (pos - max_left)), ' '); 00162 00163 // append the found data 00164 for (auto i = max_left; i < max_right; i++) { 00165 res.append( aut_.data_at(i) ); 00166 } 00167 // append blanks to the right again. 00168 if ( width > max_right-pos ) res.append( (width - (max_right - pos)), ' '); 00169 00170 // std::string res2; 00171 // try { 00172 // std::regex rx(R"([\t\n])"); 00173 // std::string replacement(" "); 00174 // res2 = std::regex_replace(res, rx, replacement); 00175 // } catch (std::regex_error& e) { 00176 // std::cerr << "Caught exception: " << e.code() << std::endl; 00177 // } 00178 std::replace (res.begin(), res.end(), '\n', ' '); 00179 return std::forward<std::string>(res); 00180 } 00181 00182 }; /* End of class DocumentIndexingAutomatonFindResults */ 00183 00184 }}} /* End of namespace lmu::cis::sis */ 00185 00186 #endif /* end of include guard: DOCUMENTINDEXINGAUTOMATONFINDRESULTS_HPP */