/*********************************************************************************** Copyright 2005 Adam Sanitt This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ************************************************************************************/ #ifndef CATENATOR_H_GUARD #define CATENATOR_H_GUARD #include #include #include #include #include #include ///////////////////////////////////////////////// //Basic exception class for use with Catenators// //although error-checking is minimal // //within the class to increase efficiency // ///////////////////////////////////////////////// class CatenatorException : public std::exception { public: CatenatorException (std::string s) : d_(s) {} ~CatenatorException() throw () { } std::string description() { return d_; } const char * what() { return "Catenator exception"; } private: std::string d_; }; //////////////////////////////////////////////////////////////////////////// //Traits classes for use with Catenator // //provides general interface for functions used by Catenator for each type// //specialise some or all for each new type to be used with Catenator // //////////////////////////////////////////////////////////////////////////// // //standard traits class // template struct CatenatorTraits { static T reverse(const T& t) { return T(t.rbegin(), t.rend()); } static std::size_t length(const T& t) { return t.length(); } static T cutlast(const T& t) { return t.substr(0, t.length()-1); } static T cutfirst(const T& t) { return t.substr(1, t.length()-1); } static T join(const T& t1, const T& t2) { return t1 + t2; } static T insert(const T& t1, const T& t2, std::size_t inpos) { return t1.substr(0, inpos) + t2 + t1.substr(inpos, t1.length()-inpos); } static T substr(const T& t, std::size_t base, std::size_t length) { return t.substr(base, length); } static bool match(const T& t1, const T& t2, const T& t3) { return (t1 == t2); } static T createempty() { return T(); } }; // //traits class incorporating wildcard matching // template struct CatenatorTraitsWildCard { static T reverse(const T& t) { return T(t.rbegin(), t.rend()); } static std::size_t length(const T& t) { return t.length(); } static T cutlast(const T& t) { return t.substr(0, t.length()-1); } static T cutfirst(const T& t) { return t.substr(1, t.length()-1); } static T join(const T& t1, const T& t2) { return t1 + t2; } static T insert(const T& t1, const T& t2, std::size_t inpos) { return t1.substr(0, inpos) + t2 + t1.substr(inpos, t1.length()-inpos); } static T substr(const T& t, std::size_t base, std::size_t length) { return t.substr(base, length); } static bool match(const T& t1, const T& t2, const T& t3) { for (std::size_t ind=0; ind < t1.length(); ++ind) { if (t1[ind] != '?' && t2[ind] != '?' && t1[ind] != t2[ind]) { return false; } } return true; } static T createempty() { return T(); } }; ///////////////////////////////////////////////////////////// //Catenator class templatized on type T // //container class for concatenation of groups of T // //and Tr, traits and policy for T // //T must satisfy constraints of CatenatorTraits class // //ie, provide joins, insertions, etc. // //prototypical T is string // //access is through array style [] operator // //but iteration through all of catenator is expensive // //better access is through matching with couldmatch // //to get a short list of possible matches // ///////////////////////////////////////////////////////////// template > class Catenator { private: //disallow copy semantics //except for private construction of new Catenator used internally Catenator() : s_(0), l_(0) { } Catenator& operator=(const Catenator&); Catenator(const Catenator&); //helper typedefs typedef T value_type; typedef Tr traits_type; typedef std::vector vect_type; typedef std::vector vecvect_type; typedef std::map cqmap_type; typedef std::pair cqpair_type; private: //members std::vector c_; mutable std::size_t s_; mutable std::size_t l_; /////////////////////////////// //creators using vectors of T// /////////////////////////////// void append(const vect_type& vtoadd) { vect_type * vt = new(vect_type); vt->assign(vtoadd.begin(), vtoadd.end()); c_.push_back(make_pair(vt, new(cqmap_type))); } void prepend(const vect_type& vtoadd) { vect_type * vt = new(vect_type); vt->assign(vtoadd.begin(), vtoadd.end()); c_.insert(c_.begin(), make_pair(vt, new(cqmap_type))); } public: Catenator( const vecvect_type& v ) : s_(0), l_(0) { for (std::size_t i = 0; i < v.size(); ++i) { append(*v[i]); } } Catenator( const vect_type& v1) : s_(0), l_(0) { append(v1); } Catenator( const vect_type& v1, const vect_type& v2 ) : s_(0), l_(0) { append(v1); append(v2); } Catenator( const vect_type * v1) : s_(0), l_(0) { append(*v1); } Catenator( const vect_type * v1, const vect_type * v2) : s_(0), l_(0) { append(*v1); append(*v2); } /////////////////////////////////// //creators using other Catenators// /////////////////////////////////// public: Catenator * clone() const { Catenator * cqreturn = new(Catenator); for (std::size_t i = 0; i < c_.size(); ++i) { copypart(cqreturn, i); } return cqreturn; } Catenator(const Catenator& qa, const Catenator& qb) : s_(0), l_(0) { for (std::size_t i = 0; i < qa.c_.size(); ++i) { qa.copypart(this, i); } for (std::size_t i = 0; i < qb.c_.size(); ++i) { qb.copypart(this, i); } } Catenator(const Catenator * qa, const Catenator * qb) : s_(0), l_(0) { for (std::size_t i = 0; i < qa->c_.size(); ++i) { qa->copypart(this, i); } for (std::size_t i = 0; i < qb->c_.size(); ++i) { qb->copypart(this, i); } } Catenator(const Catenator& cqfirst, const Catenator& cqsecond, std::size_t inpos) : s_(0), l_(0) { //copy prior to insert, and find insert point std::size_t i = 0; while (inpos > cqfirst.length(i)) { inpos -= cqfirst.length(i); cqfirst.copypart(this, i); if (++i == cqfirst.c_.size()) throw CatenatorException("Out of range in catenator insert"); } //perform insertion if (inpos == 0) { // insert before next item for (std::size_t k = 0; k < cqsecond.c_.size(); ++k) { cqsecond.copypart(this, k); } cqfirst.copypart(this, i); } else if (inpos == cqfirst.length(i)) { //insert after next item cqfirst.copypart(this, i); for (std::size_t k = 0; k < cqsecond.c_.size(); ++k) { cqsecond.copypart(this, k); } } else { //insert in middle of next item typename cqmap_type::iterator mpos = cqfirst.c_[i].second->begin(); while ((mpos != cqfirst.c_[i].second->end()) && (inpos >= mpos->first)) { //not before next insertion if (inpos > mpos->first + mpos->second->length()) { //after end of this insertion inpos -= mpos->second->length(); mpos++; } else if (inpos == mpos->first) { //at beginning of this insertion Catenator * prepcq = new Catenator(&cqsecond, mpos->second); cqfirst.copypartinsert(this, i, prepcq, mpos->first); inpos = 0; } else if (inpos == mpos->first + mpos->second->length()) { //at end of this insertion Catenator * prepcq = new Catenator(mpos->second, &cqsecond); cqfirst.copypartinsert(this, i, prepcq, mpos->first); inpos = 0; } else { //in middle of this insertion Catenator * newcq = new Catenator(*(mpos->second), cqsecond, inpos - mpos->first); cqfirst.copypartinsert(this, i, newcq, mpos->first); inpos = 0; } } if (inpos > 0) { //need brand new insertion cqfirst.copypartinsert(this, i, cqsecond.clone(), inpos); } } //copy items after insertion for ( ++i ; i < cqfirst.c_.size(); ++i) { cqfirst.copypart(this, i); } } private: void copypart(Catenator * cqdest, std::size_t index) const { vect_type * newvt = new(vect_type); newvt->assign(c_[index].first->begin(), c_[index].first->end()); cqmap_type * newm = new(cqmap_type); for (typename cqmap_type::const_iterator it = c_[index].second->begin(); it != c_[index].second->end(); ++it) { (*newm)[it->first] = it->second->clone(); } cqdest->c_.push_back(make_pair(newvt,newm)); } //////////////// //destructor // //////////////// public: ~Catenator() { for (std::size_t i = 0; i < c_.size(); ++i) { delete c_[i].first; for (typename cqmap_type::iterator it = c_[i].second->begin(); it != c_[i].second->end(); ++it) { delete it->second; } delete c_[i].second; } } //////////////// //data access // //////////////// // //number of different combinations in the Catenator ie size when treated as a vector // std::size_t size() const { if (s_ != 0) return s_; std::size_t ss = 1; for (std::size_t i = 0; i < c_.size(); ++i) { ss *= c_[i].first->size(); for (typename cqmap_type::const_iterator it = c_[i].second->begin(); it != c_[i].second->end(); ++it) { ss *= it->second->size(); } } s_ = ss; return ss; } // //length of each item in the Catenator // private: std::size_t length(std::size_t i) const { if (i < 0 || i >= c_.size()) throw CatenatorException("Bad catenator size"); if (c_[i].first->size() == 0) return 0; std::size_t ll = traits_type::length((*c_[i].first)[0]); for (typename cqmap_type::const_iterator it = c_[i].second->begin(); it != c_[i].second->end(); ++it) { ll += it->second->length(); } return ll; } public: std::size_t length() const { if (l_ != 0) return l_; std::size_t ll = 0; for (std::size_t i = 0; i < c_.size(); ++i) { ll += length(i); } l_ = ll; return ll; } // //output // friend std::ostream& operator<<(std::ostream& ss, const Catenator& cq) { for (std::size_t k = 0; k < cq.c_.size(); ++k) { ss << "{" << k << "} " << cq.c_[k].first->size() << " of " << traits_type::length((*(cq.c_[k].first))[0]); for (typename cqmap_type::const_iterator it = cq.c_[k].second->begin(); it != cq.c_[k].second->end(); ++it) { ss << " (" << *(it->second) << ") at " << it->first; } ss << " : "; } ss << "=" << cq.size(); return ss; } // //array-style read-only access // const value_type operator[](std::size_t ind) const { value_type t = traits_type::createempty(); for (std::size_t i = 0; i < c_.size(); ++i) { std::size_t nextind = ind % c_[i].first->size(); value_type newt((*c_[i].first)[nextind]); ind -= nextind; ind /= c_[i].first->size(); if (c_[i].second) { typename cqmap_type::const_reverse_iterator it(c_[i].second->rbegin()); while (!(it == (typename cqmap_type::const_reverse_iterator) c_[i].second->rend())) { std::size_t mind = ind % it->second->size(); value_type mt = (*it->second)[mind]; ind -= mind; ind /= it->second->size(); newt = traits_type::insert(newt, mt, it->first); it++; } } t = traits_type::join(t, newt); } if (ind > 0) throw CatenatorException("Catenator buffer overflow."); return t; } /////////////////////////////////////////////////////////// //modifications and insertions into existing Catenators // /////////////////////////////////////////////////////////// // //reverse existing catenator // friend Catenator * cqreverse(const Catenator& cqrev) { if (cqrev.c_.size() == 0) throw CatenatorException("Bad catenator in cqreverse"); Catenator * cqresult = new(Catenator); for (std::size_t i = 0; i < cqrev.c_.size(); ++i) { std::size_t ind = cqrev.c_.size() - (i + 1); vect_type * newvt = new(vect_type); for (std::size_t j = 0; j < (cqrev.c_[ind].first)->size(); ++j) { value_type nextt = traits_type::reverse((*(cqrev.c_[ind].first))[j]); newvt->push_back(nextt); } std::size_t tsize = traits_type::length((*newvt)[0]); cqmap_type * newm = new(cqmap_type); for (typename cqmap_type::iterator it = cqrev.c_[ind].second->begin(); it != cqrev.c_[ind].second->end(); ++it) { (*newm)[tsize - it->first] = cqreverse(*(it->second)); } cqresult->c_.push_back(make_pair(newvt, newm)); } return cqresult; } // //cut last from a catenator // friend Catenator * cqcutlast(const Catenator& cqcut) { Catenator * cqresult = new(Catenator); for (std::size_t k = 0; k < cqcut.c_.size() - 1; ++k) { cqcut.copypart(cqresult, k); } std::size_t lastcut = cqcut.c_.size() - 1; Catenator * cqlast = (Catenator *) 0; if (cqcut.length(lastcut) > 1) { vect_type * shortvt = new(vect_type); for (std::size_t l = 0; l < cqcut.c_[lastcut].first->size(); ++l) { shortvt->push_back(traits_type::cutlast((*cqcut.c_[lastcut].first)[l])); } cqmap_type * newm = new(cqmap_type); for (typename cqmap_type::const_iterator it = cqcut.c_[lastcut].second->begin(); it != cqcut.c_[lastcut].second->end(); ++it) { if (it->first == cqcut.length(lastcut) - 1) { cqlast = it->second; } else { (*newm)[it->first] = it->second->clone(); } } cqresult->c_.push_back(make_pair(shortvt,newm)); if (!cqlast) return cqresult; Catenator * newcqresult = new Catenator(cqresult, cqlast); delete cqresult; return newcqresult; } return cqresult; } // //cut first from a catenator // friend Catenator * cqcutfirst(const Catenator& cqcut) { Catenator * cqresult = new(Catenator); if (cqcut.length(0) > 1) { Catenator * cqlast = (Catenator *) 0; vect_type * shortvt = new(vect_type); for (std::size_t l = 0; l < cqcut.c_[0].first->size(); ++l) { shortvt->push_back(traits_type::cutfirst((*cqcut.c_[0].first)[l])); } cqmap_type * newm = new(cqmap_type); for (typename cqmap_type::const_iterator it = cqcut.c_[0].second->begin(); it != cqcut.c_[0].second->end(); ++it) { if (it->first == 1) { cqlast = it->second; } else { (*newm)[it->first - 1] = it->second->clone(); } } if (cqlast) { delete cqresult; cqresult = cqlast->clone(); } cqresult->c_.push_back(make_pair(shortvt, newm)); } for (std::size_t k = 1; k < cqcut.c_.size(); ++k) { cqcut.copypart(cqresult, k); } return cqresult; } // //helper routine for insertion // private: void copypartinsert(Catenator * cqdest, std::size_t index, Catenator * newcq, std::size_t insertindex) const { bool newinsert = true; vect_type * newvt = new(vect_type); newvt->assign(c_[index].first->begin(), c_[index].first->end()); cqmap_type * newm = new(cqmap_type); for (typename cqmap_type::const_iterator it = c_[index].second->begin(); it != c_[index].second->end(); ++it) { if (it->first != insertindex) { (*newm)[it->first] = it->second->clone(); } else { (*newm)[it->first] = newcq; newinsert = false; } } if (newinsert) (*newm)[insertindex] = newcq; cqdest->c_.push_back(make_pair(newvt, newm)); } /////////////////////////////// //efficient search routines // /////////////////////////////// // //interface routine, dispatches to private helper //parameter is concatenation of individual T units //that could include wildcards depending on 'match' //traits routine // public: vect_type * mightmatch(value_type tofind) { if (length() != traits_type::length(tofind)) return 0; return findpart(tofind, tofind); } // //top level constituent finding helper routine // value_type topmatcher(value_type thisfind, std::size_t index, std::map& inserts) { std::size_t offset = 0; for (std::size_t j = 0; j < index; ++j) { offset += length(j); } value_type confind = traits_type::createempty(); std::size_t orig = 0; for (typename cqmap_type::const_iterator it = c_[index].second->begin(); it != c_[index].second->end(); ++it) { //for each insert in that constituent std::size_t gap = it->first - orig; confind = traits_type::join(confind, traits_type::substr(thisfind, offset, gap)); //add next bit to match string inserts[it->first] = traits_type::substr(thisfind, offset + gap, it->second->length()); //get matching bit for insert offset += gap + it->second->length(); //advance index into match string past insert orig = it->first; } confind = traits_type::join( confind, traits_type::substr(thisfind, offset, traits_type::length((*(c_[index].first))[0]) - orig)); //add final match part return confind; } // //add insert matches to top level matches // vect_type * addinsertmatches(vect_type * maybes, std::map& inserts, std::size_t index, value_type allfind) { for (typename std::map::reverse_iterator mrit = inserts.rbegin(); mrit != inserts.rend(); ++mrit) { //combine with matches for inserts vect_type * maybeaccum = new(vect_type); vect_type * nextins = (*(c_[index].second))[mrit->first]->findpart(mrit->second, allfind); for (std::size_t b = 0; b < maybes->size(); ++b) { for (std::size_t c = 0; c < nextins->size(); ++c) { maybeaccum->push_back(traits_type::insert((*maybes)[b], (*nextins)[c], mrit->first)); } } delete nextins; swap(maybeaccum, maybes); delete maybeaccum; } return maybes; } // //helper routine for search // private: vect_type * findpart(value_type thisfind, value_type allfind) { vect_type * accum = new(vect_type); vect_type * newaccum = new(vect_type); for (std::size_t i = 0; i < c_.size(); ++i) { //for each top-level constituent T std::map inserts; value_type confind = topmatcher(thisfind, i, inserts); //get top level to match and fill map of inserts to match vect_type * maybes = couldmatch(i, confind, allfind); //top level match only maybes = addinsertmatches(maybes, inserts, i, allfind); //put insert matches into maybes if (i == 0) { newaccum->insert(newaccum->begin(), maybes->begin(), maybes->end()); } else { for (std::size_t n = 0; n < accum->size(); ++n) { for (std::size_t m = 0; m < maybes->size(); ++m) { newaccum->push_back(traits_type::join((*accum)[n], (*maybes)[m])); } } } delete maybes; swap(accum, newaccum); newaccum->clear(); } delete newaccum; return accum; } // //test for match of individual Ts, given bits to compare and total context // vect_type * couldmatch(std::size_t i, value_type confind, value_type allfind) { vect_type * results = new(vect_type); for (std::size_t j = 0; j < c_[i].first->size(); ++j) { value_type t = (*(c_[i].first))[j]; if (traits_type::match(confind, t, allfind)) results->push_back(t); } return results; } }; // //insert one catenator inside another // template Catenator * cqinsert(const Catenator& cqfirst, const Catenator& cqsecond, std::size_t inpos) { return new Catenator(cqfirst, cqsecond, inpos); } typedef Catenator strcq_t; #endif