Back to index

texmacs  1.0.7.15
packrat_grammar.cpp
Go to the documentation of this file.
00001 
00002 /******************************************************************************
00003 * MODULE     : packrat_grammar.cpp
00004 * DESCRIPTION: packrat grammars
00005 * COPYRIGHT  : (C) 2010  Joris van der Hoeven
00006 *******************************************************************************
00007 * This software falls under the GNU general public license version 3 or later.
00008 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
00009 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
00010 ******************************************************************************/
00011 
00012 #include "packrat_grammar.hpp"
00013 #include "analyze.hpp"
00014 #include "iterator.hpp"
00015 
00016 tree                 packrat_uninit (UNINIT);
00017 int                  packrat_nr_tokens= 256;
00018 int                  packrat_nr_symbols= 0;
00019 hashmap<string,C>    packrat_tokens;
00020 hashmap<tree,C>      packrat_symbols;
00021 hashmap<C,tree>      packrat_decode (packrat_uninit);
00022 hashmap<string,int>  color_encoding;
00023 hashmap<int,string>  color_decoding;
00024 
00025 RESOURCE_CODE(packrat_grammar);
00026 
00027 /******************************************************************************
00028 * Encoding and decoding of tokens and symbols
00029 ******************************************************************************/
00030 
00031 C
00032 new_token (string s) {
00033   if (N(s) == 1)
00034     return (C) (unsigned char) s[0];
00035   else {
00036     C ret= packrat_nr_tokens++;
00037     //cout << "Encode " << s << " -> " << ret << LF;
00038     return ret;
00039   }
00040 }
00041 
00042 C
00043 encode_token (string s) {
00044   if (!packrat_tokens->contains (s)) {
00045     int pos= 0;
00046     tm_char_forwards (s, pos);
00047     if (pos == 0 || pos != N(s)) return -1;
00048     C sym= new_token (s);
00049     packrat_tokens (s)= sym;
00050     packrat_decode (sym)= s;
00051   }
00052   return packrat_tokens[s];
00053 }
00054 
00055 array<C>
00056 encode_tokens (string s) {
00057   int pos= 0;
00058   array<C> ret;
00059   while (pos < N(s)) {
00060     int old= pos;
00061     tm_char_forwards (s, pos);
00062     ret << encode_token (s (old, pos));
00063   }
00064   return ret;
00065 }
00066 
00067 C
00068 new_symbol (tree t) {
00069   if (is_atomic (t)) {
00070     C ret= encode_token (t->label);
00071     if (ret != -1) return ret;
00072   }
00073   C ret= (packrat_nr_symbols++) + PACKRAT_SYMBOLS;
00074   //cout << "Encode " << t << " -> " << ret << LF;
00075   return ret;
00076 }
00077 
00078 C
00079 encode_symbol (tree t) {
00080   if (!packrat_symbols->contains (t)) {
00081     C sym= new_symbol (t);
00082     packrat_symbols (t)= sym;
00083     packrat_decode  (sym)= t;
00084   }
00085   return packrat_symbols[t];
00086 }
00087 
00088 /******************************************************************************
00089 * Encode and decode colors for syntax highlighting
00090 ******************************************************************************/
00091 
00092 void
00093 initialize_color_encodings () {
00094   color_encoding ("comment")= 1;
00095   color_encoding ("keyword")= 2;
00096   color_encoding ("error")= 3;
00097   color_encoding ("constant")= 10;
00098   color_encoding ("constant_identifier")= 11;
00099   color_encoding ("constant_function")= 12;
00100   color_encoding ("constant_type")= 13;
00101   color_encoding ("constant_category")= 14;
00102   color_encoding ("constant_module")= 15;
00103   color_encoding ("constant_number")= 16;
00104   color_encoding ("constant_string")= 17;
00105   color_encoding ("variable")= 20;
00106   color_encoding ("variable_identifier")= 21;
00107   color_encoding ("variable_function")= 22;
00108   color_encoding ("variable_type")= 23;
00109   color_encoding ("variable_category")= 24;
00110   color_encoding ("variable_module")= 25;
00111   color_encoding ("declare")= 30;
00112   color_encoding ("declare_identifier")= 31;
00113   color_encoding ("declare_function")= 32;
00114   color_encoding ("declare_type")= 33;
00115   color_encoding ("declare_category")= 34;
00116   color_encoding ("declare_module")= 35;
00117 }
00118 
00119 void
00120 initialize_color_decodings () {
00121   color_decoding (-1)= "red";
00122   color_decoding (1)= "brown";
00123   color_decoding (2)= "dark green";
00124   color_decoding (3)= "dark red";
00125   color_decoding (10)= "#4040c0";
00126   color_decoding (11)= "#4040c0";
00127   color_decoding (12)= "#4040c0";
00128   color_decoding (13)= "#4040c0";
00129   color_decoding (14)= "#4040c0";
00130   color_decoding (15)= "#4040c0";
00131   color_decoding (16)= "#4040c0";
00132   color_decoding (17)= "#4040c0";
00133   color_decoding (20)= "#606060";
00134   color_decoding (21)= "#606060";
00135   color_decoding (22)= "#606060";
00136   color_decoding (23)= "#00c000";
00137   color_decoding (24)= "#00c000";
00138   color_decoding (25)= "#00c000";
00139   color_decoding (30)= "#0000c0";
00140   color_decoding (31)= "#0000c0";
00141   color_decoding (32)= "#0000c0";
00142   color_decoding (33)= "#0000c0";
00143   color_decoding (34)= "#0000c0";
00144   color_decoding (35)= "#0000c0";
00145 }
00146 
00147 int
00148 encode_color (string s) {
00149   if (N(color_encoding) == 0) initialize_color_encodings ();
00150   if (color_encoding->contains (s)) return color_encoding[s];
00151   else return -1;
00152 }
00153 
00154 string
00155 decode_color (int c) {
00156   if (N(color_decoding) == 0) initialize_color_decodings ();
00157   if (color_decoding->contains (c)) return color_decoding[c];
00158   else return "";
00159 }
00160 
00161 /******************************************************************************
00162 * Left recursion
00163 ******************************************************************************/
00164 
00165 bool
00166 left_recursive (string s, tree t) {
00167   if (is_atomic (t))
00168     return false;
00169   else if (is_compound (t, "symbol", 1) && is_atomic (t[0]))
00170     return t[0]->label == s;
00171   else if (is_compound (t, "or")) {
00172     for (int i=0; i<N(t); i++)
00173       if (left_recursive (s, t[i]))
00174        return true;
00175     return false;
00176   }
00177   else if (is_compound (t, "concat"))
00178     return N(t) > 0 && left_recursive (s, t[0]);
00179   else return false;
00180 }
00181 
00182 tree
00183 left_head (string s, tree t) {
00184   if (is_atomic (t)) return t;
00185   else if (is_compound (t, "symbol", 1) && is_atomic (t[0])) {
00186     if (t[0]->label == s) return compound ("or");
00187     else return t;
00188   }
00189   else if (is_compound (t, "or")) {
00190     tree r= compound ("or");
00191     for (int i=0; i<N(t); i++) {
00192       tree h= left_head (s, t[i]);
00193       if (is_compound (h, "or")) r << A(h);
00194       else r << h;
00195     }
00196     return r;
00197   }
00198   else if (is_compound (t, "concat")) {
00199     if (N(t) == 0 || !left_recursive (s, t)) return t;
00200     else return left_head (s, t[0]);
00201   }
00202   else return t;
00203 }
00204 
00205 tree
00206 left_tail (string s, tree t) {
00207   if (is_atomic (t)) return t;
00208   else if (is_compound (t, "symbol", 1) && is_atomic (t[0])) {
00209     if (t[0]->label == s) return compound ("concat");
00210     else return compound ("or");
00211   }
00212   else if (is_compound (t, "or")) {
00213     tree r= compound ("or");
00214     for (int i=0; i<N(t); i++) {
00215       tree u= left_tail (s, t[i]);
00216       if (is_compound (u, "or")) r << A(u);
00217       else r << u;
00218     }
00219     return r;
00220   }
00221   else if (is_compound (t, "concat")) {
00222     if (N(t) == 0 || !left_recursive (s, t)) return compound ("or");
00223     else {
00224       tree r= compound ("concat");
00225       tree u= left_tail (s, t[0]);
00226       if (u == compound ("or")) return u;
00227       else if (is_compound (u, "concat")) r << A(u);
00228       else r << u;
00229       r << A (t (1, N(t)));
00230       return r;
00231     }
00232   }
00233   else compound ("or");
00234 }
00235 
00236 /******************************************************************************
00237 * Constructor
00238 ******************************************************************************/
00239 
00240 array<C>
00241 singleton (C c) {
00242   array<C> ret;
00243   ret << c;
00244   return ret;
00245 }
00246 
00247 packrat_grammar_rep::packrat_grammar_rep (string s):
00248   rep<packrat_grammar> (s),
00249   grammar (singleton (PACKRAT_TM_FAIL)),
00250   productions (packrat_uninit)
00251 {
00252   grammar (PACKRAT_TM_OPEN)= singleton (PACKRAT_TM_OPEN);
00253   grammar (PACKRAT_TM_ANY )= singleton (PACKRAT_TM_ANY );
00254   grammar (PACKRAT_TM_ARGS)= singleton (PACKRAT_TM_ARGS);
00255   grammar (PACKRAT_TM_LEAF)= singleton (PACKRAT_TM_LEAF);
00256   grammar (PACKRAT_TM_FAIL)= singleton (PACKRAT_TM_FAIL);
00257 }
00258 
00259 packrat_grammar
00260 make_packrat_grammar (string s) {
00261   if (packrat_grammar::instances -> contains (s)) return packrat_grammar (s);
00262   return make (packrat_grammar, s, tm_new<packrat_grammar_rep> (s));
00263 }
00264 
00265 packrat_grammar
00266 find_packrat_grammar (string s) {
00267   if (packrat_grammar::instances -> contains (s)) return packrat_grammar (s);
00268   eval ("(lazy-language-force " * s * ")");
00269   return make_packrat_grammar (s);
00270 }
00271 
00272 /******************************************************************************
00273 * Accelerate big lists of consecutive symbols
00274 ******************************************************************************/
00275 
00276 void
00277 packrat_grammar_rep::accelerate (array<C>& def) {
00278   if (N(def) == 0 || def[0] != PACKRAT_OR) return;
00279   hashmap<C,bool> all;
00280   for (int i=1; i<N(def); i++)
00281     if (def[i] >= PACKRAT_OR) return;
00282     else all (def[i])= true;
00283   array<C> ret;
00284   ret << PACKRAT_OR;
00285   hashmap<C,bool> done;
00286   for (int i=1; i<N(def); i++) {
00287     C c= def[i];
00288     if (done->contains (c)) continue;
00289     C start= c;
00290     while (start > 0 && all->contains (start-1)) start--;
00291     C end= c;
00292     while (end+1 < PACKRAT_OR && all->contains (end+1)) end++;
00293     if (end == start) ret << c;
00294     else {
00295       tree t= compound ("range", packrat_decode[start], packrat_decode[end]);
00296       C sym= encode_symbol (t);
00297       array<C> rdef;
00298       rdef << PACKRAT_RANGE << start << end;
00299       grammar (sym)= rdef;
00300       ret << sym;
00301       //cout << "Made range " << packrat_decode[start]
00302       //<< "--" << packrat_decode[end] << "\n";
00303     }
00304     for (int j=start; j<=end; j++) done(j)= true;
00305   }
00306   if (N(ret) == 2) {
00307     ret= range (ret, 1, 2);
00308     if (ret[0] >= PACKRAT_SYMBOLS) ret= grammar[ret[0]];
00309   }
00310   //cout << "Was: " << def << "\n";
00311   //cout << "Is : " << ret << "\n";
00312   def= ret;
00313 }
00314 
00315 /******************************************************************************
00316 * Definition of grammars
00317 ******************************************************************************/
00318 
00319 array<C>
00320 packrat_grammar_rep::define (tree t) {
00321   //cout << "Define " << t << INDENT << LF;
00322   array<C> def;
00323   if (t == "")
00324     def << PACKRAT_CONCAT;
00325   else if (is_atomic (t)) {
00326     int pos= 0;
00327     string s= t->label;
00328     tm_char_forwards (s, pos);
00329     if (pos > 0 && pos == N(s)) def << encode_symbol (s);
00330     else def << PACKRAT_CONCAT << encode_tokens (s);
00331   }
00332   else if (is_compound (t, "symbol", 1))
00333     def << encode_symbol (t);
00334   else if (is_compound (t, "range", 2) &&
00335           is_atomic (t[0]) && is_atomic (t[1]) &&
00336           N(t[0]->label) == 1 && N(t[1]->label) == 1)
00337     def << PACKRAT_RANGE
00338        << encode_token (t[0]->label)
00339        << encode_token (t[1]->label);
00340   else if (is_compound (t, "or", 1))
00341     def << define (t[0]);
00342   else {
00343     if (is_compound (t, "or")) def << PACKRAT_OR;
00344     else if (is_compound (t, "concat")) def << PACKRAT_CONCAT;
00345     else if (is_compound (t, "while")) def << PACKRAT_WHILE;
00346     else if (is_compound (t, "repeat")) def << PACKRAT_REPEAT;
00347     else if (is_compound (t, "not")) def << PACKRAT_NOT;
00348     else if (is_compound (t, "except")) def << PACKRAT_EXCEPT;
00349     else if (is_compound (t, "tm-open")) def << PACKRAT_TM_OPEN;
00350     else if (is_compound (t, "tm-any")) def << PACKRAT_TM_ANY;
00351     else if (is_compound (t, "tm-args")) def << PACKRAT_TM_ARGS;
00352     else if (is_compound (t, "tm-leaf")) def << PACKRAT_TM_LEAF;
00353     else if (is_compound (t, "tm-char")) def << PACKRAT_TM_CHAR;
00354     else if (is_compound (t, "tm-cursor")) def << PACKRAT_TM_CURSOR;
00355     else def << PACKRAT_TM_FAIL;
00356     for (int i=0; i<N(t); i++) {
00357       (void) define (t[i]);
00358       def << encode_symbol (t[i]);
00359     }
00360     accelerate (def);
00361   }
00362   if (N (def) != 1 || def[0] != encode_symbol (t)) {
00363     C sym= encode_symbol (t);
00364     grammar (sym)= def;
00365   }
00366   //cout << UNINDENT << "Defined " << t << " -> " << def << LF;
00367   return def;
00368 }
00369 
00370 void
00371 packrat_grammar_rep::define (string s, tree t) {
00372   //cout << "Define " << s << " := " << t << "\n";
00373   if (left_recursive (s, t)) {
00374     string s1= s * "-head";
00375     string s2= s * "-tail";
00376     tree   t1= left_head (s, t);
00377     tree   t2= left_tail (s, t);
00378     define (s1, t1);
00379     define (s2, t2);
00380     tree   u1= compound ("symbol", s1);
00381     tree   u2= compound ("while", compound ("symbol", s2));
00382     define (s, compound ("concat", u1, u2));
00383   }
00384   else {
00385     C        sym= encode_symbol (compound ("symbol", s));
00386     array<C> def= define (t);
00387     grammar (sym)= def;
00388   }
00389 }
00390 
00391 void
00392 packrat_grammar_rep::set_property (string s, string var, string val) {
00393   //cout << "Set property " << s << ", " << var << " -> " << val << "\n";
00394   C sym = encode_symbol (compound ("symbol", s));
00395   C prop= encode_symbol (compound ("property", var));
00396   D key = (((D) prop) << 32) + ((D) (sym ^ prop));
00397   properties (key)= val;
00398 }
00399 
00400 /******************************************************************************
00401 * Member analysis
00402 ******************************************************************************/
00403 
00404 string
00405 packrat_grammar_rep::decode_as_string (C sym) {
00406   string r;
00407   if (sym < PACKRAT_OR) {
00408     tree t= packrat_decode [sym];
00409     if (is_atomic (t)) r << t->label;
00410   }
00411   else {
00412     array<C> def= grammar[sym];
00413     if (N(def) == 1 && (def[0] < PACKRAT_OR || def[0] >= PACKRAT_SYMBOLS))
00414       r << decode_as_string (def[0]);
00415     else if (N(def) >= 1 && def[0] == PACKRAT_CONCAT)
00416       for (int i=1; i<N(def); i++)
00417        r << decode_as_string (def[i]);
00418     else {
00419       cout << "Warning: could not transform " << packrat_decode[sym]
00420           << " into a string\n";
00421     }
00422   }
00423   return r;
00424 }
00425 
00426 array<string>
00427 packrat_grammar_rep::decode_as_array_string (C sym) {
00428   array<string> r;
00429   array<C> def= grammar[sym];
00430   if (N(def) == 1 && def[0] >= PACKRAT_SYMBOLS)
00431     r << decode_as_array_string (def[0]);
00432   else if (N(def) >= 1 && def[0] == PACKRAT_OR)
00433     for (int i=1; i<N(def); i++)
00434       r << decode_as_array_string (def[i]);
00435   else if (N(def) == 3 && def[0] == PACKRAT_RANGE)
00436     for (C c=def[1]; c<=def[2]; c++)
00437       r << decode_as_string (c);
00438   else r << decode_as_string (sym);
00439   return r;
00440 }
00441 
00442 array<string>
00443 packrat_grammar_rep::members (string s) {
00444   C sym= encode_symbol (compound ("symbol", s));
00445   //cout << s << " -> " << decode_as_array_string (sym) << "\n";
00446   return decode_as_array_string (sym);
00447 }
00448 
00449 /******************************************************************************
00450 * Interface
00451 ******************************************************************************/
00452 
00453 void
00454 packrat_define (string lan, string s, tree t) {
00455   packrat_grammar gr= find_packrat_grammar (lan);
00456   gr->define (s, t);
00457 }
00458 
00459 void
00460 packrat_property (string lan, string s, string var, string val) {
00461   packrat_grammar gr= find_packrat_grammar (lan);
00462   gr->set_property (s, var, val);
00463 }
00464 
00465 void
00466 packrat_inherit (string lan, string from) {
00467   packrat_grammar gr = find_packrat_grammar (lan);
00468   packrat_grammar inh= find_packrat_grammar (from);
00469   iterator<C>     it = iterate (inh->grammar);
00470   while (it->busy ()) {
00471     C sym= it->next ();
00472     //cout << "Inherit " << sym << " -> " << inh->grammar (sym) << LF;
00473     gr->grammar (sym)= inh->grammar (sym);
00474     gr->productions (sym)= inh->productions (sym);
00475   }
00476 
00477   iterator<D> it2 = iterate (inh->properties);
00478   while (it2->busy ()) {
00479     D p= it2->next ();
00480     //cout << "Inherit " << p << " -> " << inh->properties (p) << LF;
00481     gr->properties (p)= inh->properties (p);
00482   }
00483 }
00484 
00485 int
00486 packrat_abbreviation (string lan, string s) {
00487   static int nr= 1;
00488   static hashmap<string,int> abbrs (-1);
00489   string key= lan * ":" * s;
00490   int r= abbrs[key];
00491   if (r >= 0) return r;
00492   packrat_grammar gr= find_packrat_grammar (lan);
00493   C sym= encode_symbol (compound ("symbol", s));
00494   if (gr->grammar->contains (sym)) r= nr++;
00495   else r= 0;
00496   abbrs (key)= r;
00497   return r;
00498 }