SIS
Symmetric Index Structures
/Users/dbr/ma/src/lmu/cis/sis/indexer/DocumentIndexingAutomatonFindResults.hpp
Go to the documentation of this file.
00001 #ifndef DOCUMENTINDEXINGAUTOMATONFINDRESULTS_HPP
00002 #define DOCUMENTINDEXINGAUTOMATONFINDRESULTS_HPP
00003 
00004 #include "../cppbase.hpp"
00005 #include "../adapter/CompressedAutomatonAdapter.hpp"
00006 // #include "DocumentIndexingAutomaton.hpp"
00007 
00008 // using lmu::cis::sis::DocumentIndexingAutomaton;
00009 #include <iostream>
00010 #include <set>
00011 #include <map>
00012 #include <vector>
00013 #include <string>
00014 #include <deque>
00015 #include <tuple>
00016 #include <algorithm>
00017 #include <list>
00018 #include <regex>
00019 
00020 namespace lmu { namespace cis { namespace sis {
00021 
00026 typedef std::string                                 DocumentName;
00027 // typedef UINT                                        RelativePosition;
00028 // typedef UINT                                        AbsolutePosition;
00029 typedef std::tuple<
00030                 DocumentName,
00031                 RelativeDocumentPosition,
00032                 AbsoluteDocumentPosition,
00033                 ActualDocumentPosition>             ResultsTuple;
00034 typedef std::list<ResultsTuple>                   ResultsList;
00035 // typedef std::pair<DocumentName, PositionsTupleContainer>    ResultsMapIterator;
00037 
00038 
00054 // template<typename T>
00055 class DocumentIndexingAutomatonFindResults {
00056 protected:
00057     ResultsList results_;
00058     ResultsList::const_iterator curr_it_;
00059     const CompressedAutomatonAdapter& aut_;
00060 public:
00061 
00062     // DocumentIndexingAutomatonFindResults() = default; ///< Set to default
00063     DocumentIndexingAutomatonFindResults(CompressedAutomatonAdapter* automaton_)
00064         : results_()
00065         , curr_it_()
00066         , aut_(*automaton_)
00067         {
00068         }
00069     virtual ~DocumentIndexingAutomatonFindResults() = default; 
00070 
00071     // Methods.
00072     virtual
00073     DocumentIndexingAutomatonFindResults&
00074     insert_position(const DocumentName& doc, const PositionsTuple& p) {
00075         results_.push_back( std::tuple_cat(std::make_tuple(doc), p) );
00076         return *this;
00077     }
00078 
00079     virtual inline ResultsList::const_iterator begin()  { results_.sort(); curr_it_ = results_.cbegin(); return curr_it_; }
00080     virtual inline ResultsList::const_iterator end()    { curr_it_ = results_.cend(); return curr_it_; }
00081     virtual inline ResultsList::const_iterator cbegin() { return begin(); }
00082     virtual inline ResultsList::const_iterator cend()   { return end(); }
00083     virtual inline size_t size() const { return results_.size(); }
00084     virtual inline bool empty()  const { return size() == 0; }
00085     virtual inline DocumentName    document_name(ResultsList::const_iterator it) { return std::get<0>(*it); }
00086     virtual inline DocumentPosition relative_pos(ResultsList::const_iterator it) { return std::get<1>(*it); }
00087     virtual inline DocumentPosition absolute_pos(ResultsList::const_iterator it) { return std::get<2>(*it); }
00088     virtual inline DocumentPosition actual_pos  (ResultsList::const_iterator it) { return std::get<3>(*it); }
00089     virtual inline std::string        lr_context(ResultsList::const_iterator it, UINT width = 20) {
00090         std::string res;
00091         UINT pos = actual_pos(it);
00092 
00093         mSymbolAndVariables( zero )
00094         mSymbolAndVariables( one )
00095         mSymbolAssignValue( zero, 0, aut_.get_symbol_size() )
00096         mSymbolAssignValue( one, 1, aut_.get_symbol_size() )
00097 
00098         if (width > pos) {
00099             for (auto i = width - pos; i > 0; i--)
00100                 res.append( aut_.data_at(i) );
00101         }
00102         for (auto i = width > pos ? 0 : pos - width; i < pos+width; i++) { // make sure the unsigned ints start from 0.
00103             std::string s = aut_.data_at(i);
00104             if ( (memcmp(zero, s.data(), 1) == 0) ) {
00105                 std::cout << "HIT LEFT BORDER." << std::endl;
00106                 i = 0;
00107             }
00108             if ( (memcmp(one, s.data(), 1) == 0) ) {
00109                 std::cout << "HIT RIGHT BORDER." << std::endl;
00110                 i = pos+width; // break.
00111             }
00112             res.append( s );
00113         }
00114         return std::forward<std::string>(res);
00115     }
00116 
00117 
00118     virtual inline DocumentName    document_name(ResultsTuple it) { return std::get<0>(it); }
00119     virtual inline DocumentPosition relative_pos(ResultsTuple it) { return std::get<1>(it); }
00120     virtual inline DocumentPosition absolute_pos(ResultsTuple it) { return std::get<2>(it); }
00121     virtual inline DocumentPosition actual_pos  (ResultsTuple it) { return std::get<3>(it); }
00122     virtual inline std::string        lr_context(ResultsTuple it, UINT width = 20) {
00123         // a lot of alignment happens here. for now there are small off-by-one's in
00124         // there but i don't care.
00125         std::string res;
00126 
00127         // get the actual position in the long data string
00128         UINT pos = actual_pos(it);
00129 
00130         // get those # and $ thingies to test on them
00131         mSymbolAndVariables( zero )
00132         mSymbolAndVariables( one )
00133         mSymbolAssignValue( zero, 0, aut_.get_symbol_size() )
00134         mSymbolAssignValue( one, 1, aut_.get_symbol_size() )
00135 
00136         // declare our maximum boundaries to the left and right
00137         UINT max_left  = 0;
00138         UINT max_right = 0;
00139 
00140         if (DEBUG_LEVEL >= 5) std::cerr << "pos: " << pos << std::endl;
00141 
00142         // find the left boundary: either '#' or width chars away from pos
00143         for (auto i = pos; i != 0 && i > pos-width-1; i--) {
00144             if (memcmp(zero, aut_.data_at(i).data(), 1) == 0) break;
00145             max_left = i+1;
00146         }
00147         if (max_left == 0) max_left = 1; // dunno. just set it back.
00148 
00149         // find the right border
00150         for (auto i = pos; i <= (pos + width) && i <= aut_.data_length(); i++) {
00151             if (memcmp(one, aut_.data_at(i).data(), 1) == 0) break;
00152             max_right = i;
00153         }
00154         if (! max_right == aut_.data_length() ) max_right++;
00155 
00156         // printf debugging...
00157         if (DEBUG_LEVEL >= 5) std::cerr << "left: " << max_left << std::endl;
00158         if (DEBUG_LEVEL >= 5) std::cerr << "right: " << max_right << std::endl;
00159 
00160         // insert some blanks or whatever to get a nicely layed out concordance view
00161         if ( width > (pos-max_left) ) res.append( (width - (pos - max_left)), ' ');
00162 
00163         // append the found data
00164         for (auto i = max_left; i < max_right; i++) {
00165             res.append( aut_.data_at(i) );
00166         }
00167         // append blanks to the right again.
00168         if ( width > max_right-pos ) res.append( (width - (max_right - pos)), ' ');
00169 
00170         // std::string res2;
00171         // try {
00172         // std::regex rx(R"([\t\n])");
00173         // std::string replacement(" ");
00174         //     res2 = std::regex_replace(res, rx, replacement);
00175         // } catch (std::regex_error& e) {
00176         //     std::cerr << "Caught exception: " << e.code() << std::endl;
00177         // }
00178         std::replace (res.begin(), res.end(), '\n', ' ');
00179         return std::forward<std::string>(res);
00180     }
00181 
00182 }; /* End of class DocumentIndexingAutomatonFindResults */
00183 
00184 }}} /* End of namespace lmu::cis::sis */
00185 
00186 #endif /* end of include guard: DOCUMENTINDEXINGAUTOMATONFINDRESULTS_HPP */