Back to index

scribus-ng  1.3.4.dfsg+svn20071115
digester.cpp
Go to the documentation of this file.
00001 /*
00002  *  digester.cpp
00003  *  
00004  *
00005  *  Created by Andreas Vox on 02.06.06.
00006  *  Copyright 2006 under GPL2. All rights reserved.
00007  *
00008  */
00009 
00010 #include <iostream>
00011 #include <vector>
00012 #include <map>
00013 #include <utility>
00014 #include <deque>
00015 #include <functional>
00016 #include <set>
00017 
00018 #include "digester.h"
00019 #include "automata.h"
00020 #include "actions.h"
00021 
00022 namespace desaxe {
00023        
00024 typedef std::pair<Xml_string, Action> rule_t;
00025        
00026 typedef unsigned short token_t;
00027 typedef std::vector<token_t> path_t;
00028 enum special_token { EMPTY = 0, START = 1, ANY = 2, REPEAT = 3 } ;
00029 
00030 typedef unsigned short nfa_state_t;
00031 
00032 struct DFA_State
00033 {
00034        unsigned int ID;
00035        std::vector<rule_t> rules;
00036        
00037        DFA_State() : rules() {}
00038 };
00039 
00040 typedef DFA_State* dfa_state_t;
00041 
00042 
00050 class RuleState 
00051 { 
00052 public:
00053        RuleState();
00054        RuleState(const RuleState& other);
00055        ~RuleState();
00057        void addRule(const Xml_string& pattern, Action action);
00059        void reset();
00061        const std::vector<rule_t>& rulesForCurrentState();
00063        void open(const Xml_string& tag);
00065        void close();
00067        void dump();
00068 private:
00070        std::vector<rule_t> rules;
00071        
00073        std::map<Xml_string, token_t> tokens;
00075        std::vector<nfa_state_t> accepting;
00077        automata::DFA<dfa_state_t, token_t> *dfa;
00079        std::vector<dfa_state_t> stateStack;
00080 
00082        token_t createToken(const Xml_string& tok);
00083        void makeTokens();
00084        
00086        automata::NFA<nfa_state_t, token_t>* createNFA();
00087 
00089        void compileDFA();
00091        bool valid;
00092 };
00093 
00094 
00095 Digester::Digester() : objects(), storage(), errors() 
00096 { 
00097        state = new RuleState();
00098        result_.ptr = NULL;
00099        result_.type = "";
00100 }
00101 
00102 
00103 Digester::~Digester() {
00104        delete state;
00105        deletePatches(patches);
00106 }
00107 
00108 
00109 Digester& Digester::operator=(const Digester& other)
00110 {
00111        delete state;
00112        state = new RuleState(*other.state);
00113        objects = other.objects;
00114        storage = other.storage;
00115        result_ = other.result_;
00116        errors = other.errors;
00117        return *this;
00118 }
00119 
00120 
00121 int Digester::nrOfErrors() const
00122 {
00123        return errors.size();
00124 }
00125 
00126 const Xml_string Digester::getError(int i) const
00127 {
00128        return errors[i];
00129 }
00130 
00131 
00132 void Digester::addRule(const Xml_string& pattern, Action action)
00133 {
00134        action.setDigester(this);
00135        state->addRule(pattern, action);
00136 }
00137 
00138 
00139 void Digester::beginDoc()
00140 {
00141        state->reset();
00142 #ifdef DESAXE_DEBUG  
00143        state->dump();
00144 #endif
00145 }
00146 
00147 void Digester::endDoc()
00148 {
00149 }
00150 
00151 void Digester::begin(const Xml_string& tag, Xml_attr attr)
00152 {
00153        state->open(tag);
00154        const std::vector<rule_t>& rules (state->rulesForCurrentState());
00155        std::vector<rule_t>::const_iterator it;
00156        for(it=rules.begin(); it!=rules.end(); ++it)
00157        {
00158 #ifdef DESAXE_DEBUG
00159               std::cerr << "B " << it->first << " " << typeid(it->second).name() << "\n";
00160 #endif
00161               const_cast<Action&>(it->second).begin(tag, attr);
00162        }
00163 }
00164 
00165 void Digester::end(const Xml_string& tag)
00166 {
00167        const std::vector<rule_t>& rules (state->rulesForCurrentState());
00168        std::vector<rule_t>::const_reverse_iterator it;
00169        for(it=rules.rbegin(); it!=rules.rend(); ++it)
00170        {
00171 #ifdef DESAXE_DEBUG
00172               std::cerr << "E " << it->first << " " << typeid(it->second).name() << "\n";
00173 #endif
00174               const_cast<Action&>(it->second).end(tag);
00175        }
00176        state->close();
00177 }
00178 
00179 void Digester::chars(const Xml_string& text)
00180 {
00181        const std::vector<rule_t>& rules (state->rulesForCurrentState());
00182        std::vector<rule_t>::const_iterator it;
00183        for(it=rules.begin(); it!=rules.end(); ++it)
00184        {
00185 #ifdef DESAXE_DEBUG
00186               std::cerr << "C " << it->first << " " << typeid(it->second).name() << "\n";
00187 #endif
00188               const_cast<Action&>(it->second).chars(text);
00189        }
00190 }
00191 
00192 
00193 Xml_string Digester::concat(const Xml_string& pattern1, const Xml_string& pattern2)
00194 {
00195        if (pattern1 == "")
00196               return pattern2;
00197        else if (pattern2 == "")
00198               return pattern1;
00199        else if ( (pattern1[pattern1.length()-1] != '/') && (pattern2[0] != '/') )
00200               // insert "/" as separator
00201               return pattern1 + "/" + pattern2;
00202        else if ( (pattern1[pattern1.length()-1]=='/') || (pattern2[0]=='/') )
00203               return pattern1 + pattern2;
00204        else // cut off one "/"
00205               return pattern1 + std::string(static_cast<const std::string&>(pattern2), 1, std::string::npos);
00206 }
00207 
00208 
00209 RuleState::RuleState() : rules(), dfa(NULL), stateStack(), valid(false)
00210 {}
00211 
00212 RuleState::RuleState(const RuleState& other) : rules(other.rules), dfa(NULL), stateStack(), valid(false)
00213 {}
00214 
00215 RuleState::~RuleState()
00216 {
00217        if (dfa) 
00218        {
00219               std::set<dfa_state_t> morituri(dfa->states());
00220               for (std::set<dfa_state_t>::iterator i=morituri.begin(); i != morituri.end(); ++i) {
00221                      delete *i;
00222               }
00223               delete dfa;
00224        }
00225 }
00226 
00227 
00228 void RuleState::addRule(const Xml_string& pattern, Action action)
00229 {
00230        rules.push_back(std::pair<Xml_string, Action>(pattern,action));
00231        valid = false;
00232 }
00233 
00234 
00235 inline
00236 void RuleState::reset()
00237 {
00238        stateStack.clear();
00239        if (!valid)
00240               compileDFA();
00241        stateStack.push_back(dfa->start());
00242 }
00243 
00244 void RuleState::dump()
00245 {
00246        std::cout << "Rules:\n";
00247        for (unsigned int r=0; r<rules.size(); ++r) {
00248               std::cout << r << ":\t\"" << rules[r].first << "\" accepted in  (" << accepting[r] << ")\n";
00249        }
00250        std::cout << "\nTokens:\n";
00251        for (std::map<Xml_string, token_t>::iterator it=tokens.begin(); it!=tokens.end(); ++it) {
00252               std::cout << it->first << ":\t--> " << it->second << "\n";
00253        }
00254        std::cout << "\nAutomaton:\n";
00255        const std::set<dfa_state_t>& states(dfa->states());
00256        const std::set<token_t>& inputs(dfa->inputs());
00257        std::cout << "STATE";
00258        for (std::set<token_t>::const_iterator i=inputs.begin(); i != inputs.end(); ++i) {
00259               std::cout << "\t" << *i;
00260        }
00261        std::cout << "\tRULES\n\n";
00262        for (std::set<dfa_state_t>::const_iterator s=states.begin(); s!=states.end(); ++s) {
00263               std::cout << (*s)->ID;
00264               for (std::set<token_t>::const_iterator i=inputs.begin(); i!=inputs.end(); ++i) {
00265                      dfa_state_t nstate = dfa->next(*s,*i);
00266                      std::cout << "\t";
00267                      if (nstate)
00268                             std::cout << nstate->ID;
00269                      else
00270                             std::cout << "--";
00271               }
00272               for (std::vector<rule_t>::iterator rs=(*s)->rules.begin(); rs!=(*s)->rules.end(); ++rs)
00273                      std::cout << "\t\"" << rs->first << "\"";
00274               std::cout << "\n";
00275        }
00276 }
00277 
00278 inline
00279 void RuleState::open(const Xml_string& tag)
00280 {
00281        dfa_state_t nstate = dfa->next(stateStack.back(), tokens[tag]);
00282        assert(nstate != NULL);
00283 #ifdef DESAXE_DEBUG
00284        std::cerr << "to state " << nstate->ID << "\n";  
00285 #endif
00286        if (nstate->ID == dfa->deflt()->ID) {
00287               nstate = dfa->next(stateStack.back(), ANY);
00288 #ifdef DESAXE_DEBUG
00289               std::cerr << "empty, trying any:" << nstate->ID << "\n"; 
00290 #endif
00291        }
00292        stateStack.push_back(nstate);
00293 }
00294 
00295 inline
00296 void RuleState::close()
00297 {
00298        stateStack.pop_back();
00299 }
00300 
00301 
00302 inline
00303 const std::vector<rule_t>& RuleState::rulesForCurrentState()
00304 {
00305        return stateStack.back()->rules;
00306 }
00307 
00308 void RuleState::makeTokens()
00309 {
00310        tokens.clear();
00311        tokens[""] = EMPTY;
00312        tokens["/"] = START;
00313        tokens["*"] = ANY;
00314        tokens["**"] = REPEAT;
00315 }
00316 
00317 
00318 
00319 token_t RuleState::createToken(const Xml_string& tok)
00320 {
00321        if (tokens.find(tok) == tokens.end())
00322               tokens[tok] = tokens.size() + 1;
00323        return tokens[tok];
00324 }
00325 
00326 
00327 
00328 automata::NFA<nfa_state_t, token_t>* RuleState::createNFA()
00329 {
00330        using automata::NFA;
00331        
00333        std::map<path_t, nfa_state_t> nfa_states;
00334        nfa_states.clear();
00335 
00336        path_t prefix;
00337 
00338        // START and EMPTY are both: tokens and nfa_states
00339        prefix.push_back(START);
00340        nfa_states[prefix] = START;
00341        
00342        prefix[0] = EMPTY;
00343        nfa_states[prefix] = EMPTY;
00344        
00345        std::set<nfa_state_t> deflt;
00346        deflt.insert(EMPTY);
00347        
00348        NFA<nfa_state_t, token_t> *nfa = new NFA<nfa_state_t, token_t>(START, deflt);
00349        
00350        nfa->addState(EMPTY);
00351        nfa->addInput(ANY);
00352        nfa->addTransition(EMPTY, ANY, EMPTY);
00353 
00354        for (unsigned int i = 0; i < rules.size(); ++i)
00355        {
00356               const std::string currPattern(fromXMLString(rules[i].first));
00357               const unsigned int len = currPattern.length();
00358               int pos;
00359               nfa_state_t lastState;
00360               
00361               // determine if this is a start pattern
00362               prefix.resize(1);
00363               if (currPattern[0] == '/') {
00364                      prefix[0] = START;
00365                      pos = 1;
00366                      lastState = START;
00367               }
00368               else {
00369                      prefix[0] = EMPTY;
00370                      pos = 0;
00371                      lastState = EMPTY;
00372               }
00373               
00374 //            std::cerr << "looking at pattern: " << currPattern << "\n";
00375               // for all prefixes
00376               do {
00377                      std::string::size_type pos2 = currPattern.find('/', pos);
00378                      if (pos2 == std::string::npos)
00379                             pos2 = len;
00380                      
00381                      std::string diff(currPattern.substr(pos, pos2-pos));
00382                      token_t tok = createToken(fromSTLString(diff));
00383 //                   std::cerr << pos << "-" << pos2 << "\t: " << diff << " = " << tok << "\n";
00384 
00385                      // create loop if REPEAT token
00386                      if (tok == REPEAT) {
00387                             nfa->addTransition(lastState, ANY, lastState); //FIXME: that's wrong, need to create repeating state
00388 //                          std::cerr << "T " << lastState << "--*-->" << lastState << "\n";
00389                             pos = pos2 + 1;
00390                             continue;
00391                      }
00392                      
00393                      prefix.push_back(tok);
00394                      // create new state if necessary
00395                      nfa_state_t nstate;
00396                      if (nfa_states.find(prefix) != nfa_states.end()) {
00397                             nstate = nfa_states[prefix];
00398                      }
00399                      else {
00400                             nstate = nfa_states.size();
00401                             nfa->addState(nstate);
00402                             nfa_states[prefix] = nstate;
00403                      }
00404                      // add transition
00405                      nfa->addInput(tok);
00406                      nfa->addTransition(lastState, tok, nstate);
00407 //                   std::cerr << "T " << lastState << "--(" << tok << ")-->" << nstate << "\n";
00408                      lastState = nstate;
00409                      pos = pos2 + 1;
00410               } while(pos < signed(len));
00411               accepting.push_back(lastState);
00412 //            std::cerr << "accepted in " << lastState << "\n";
00413 
00414               // copy all transition from EMPTY to all other states
00415               const NFA<nfa_state_t, token_t>::Transitions& transFromEmpty(nfa->transitions(EMPTY));
00416               std::set<nfa_state_t>::const_iterator it, st;
00417               NFA<nfa_state_t, token_t>::Transitions::const_iterator tr;
00418               for (it = nfa->states().begin(); it != nfa->states().end(); ++it) {
00419                      if (*it == EMPTY)
00420                             continue;
00421                      for (tr = transFromEmpty.begin(); tr != transFromEmpty.end(); ++tr)
00422                             for (st = tr->second.begin(); st != tr->second.end(); ++st)
00423                                    nfa->addTransition(*it, tr->first, *st);
00424               }
00425               
00426               // ANY transitions
00427               const std::set<token_t>& inputs(nfa->inputs());
00428               std::set<token_t>::const_iterator tok;
00429               for (it = nfa->states().begin(); it != nfa->states().end(); ++it) {
00430                      const std::set<nfa_state_t>& anyStates(nfa->next(*it, ANY));
00431                      for (st = anyStates.begin(); st != anyStates.end(); ++st)
00432                             for (tok=inputs.begin(); tok != inputs.end(); ++tok)
00433                                    if (*tok != ANY)
00434                                           nfa->addTransition(*it, *tok, *st);
00435               }
00436        }
00437        return nfa;
00438 }
00439 
00440 struct CreateDFAState : public std::unary_function <std::set<nfa_state_t>, dfa_state_t> {
00441 
00442        CreateDFAState(const std::vector<rule_t>& rules, const std::vector<nfa_state_t>& accepting) 
00443        : n(0), rules_(rules), accepting_(accepting) 
00444        {}
00445 
00446        dfa_state_t operator()(const std::set<nfa_state_t>& combination)
00447        {
00448               dfa_state_t result = new DFA_State();
00449               result->ID = n++;
00450               result->rules.clear();
00451               for (unsigned int i=0; i < rules_.size(); ++i)
00452                      if (combination.find(accepting_[i]) != combination.end())
00453                             result->rules.push_back(rules_[i]);
00454               return result;
00455        } 
00456 
00457        unsigned int n;
00458        const std::vector<rule_t>& rules_;
00459        const std::vector<nfa_state_t>& accepting_;
00460 };
00461 
00462 //dfa_state_t create_fun(const std::set<nfa_state_t>& combination);
00463 
00464 void RuleState::compileDFA()
00465 {      
00466        // build NFA:
00467        makeTokens(); 
00468        automata::NFA<nfa_state_t, token_t> *nfa = createNFA();
00469        // make deterministic:
00470        CreateDFAState create(rules, accepting);
00471 //     dfa = nfa->constructDFA<dfa_state_t>(create);        // wont work
00472 //     dfa = nfa->constructDFA<dfa_state_t>(create_fun);    // works
00473 //     std::set<nfa_state_t> arg;
00474 //     dfa_state_t res = create(arg);                       // works
00475 //     create("bullshit");                                  // never works :-)
00476        dfa = nfa->constructDFA<dfa_state_t,CreateDFAState&>(create);   // finally
00477        delete nfa;
00478        valid = true;
00479 }
00480 
00481 } // namespace