Back to index

texmacs  1.0.7.15
parsexml.cpp
Go to the documentation of this file.
00001 
00002 /******************************************************************************
00003 * MODULE     : parsexml.cpp
00004 * DESCRIPTION: conversion of xml and html strings into logical html trees
00005 * COPYRIGHT  : (C) 2000  Joris van der Hoeven
00006 *******************************************************************************
00007 * This software falls under the GNU general public license version 3 or later.
00008 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
00009 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
00010 ******************************************************************************/
00011 
00012 #include "convert.hpp"
00013 #include "hashset.hpp"
00014 #include "converter.hpp"
00015 #include "parse_string.hpp"
00016 
00017 /******************************************************************************
00018 * The xml/html parser aims to parse a superset of the set of valid documents.
00019 * In other words, no attempts are made to signal error messages for
00020 * incorrect documents; in the case of Html we even attempt to correct
00021 * common mistakes, like badly structured documents. So correct documents
00022 * should be parsed correctly and incorrect documents are transformed into
00023 * correct documents in a heuristic way.
00024 *
00025 * The parser proceeds in three steps: the first pass does all parsing
00026 * except for the construction of a tree structure for nested tags.
00027 * The second stage takes care of the nesting, while heuristically
00028 * correcting improper nested trees, and while taking care of optional
00029 * closing tags in the case of Html. The last stage does some final
00030 * white space and entity cleanup.
00031 *
00032 * Present limitations: we do not fully parse <!DOCTYPE ...> constructs yet.
00033 * Entities which are present in the DOCTYPE definition of the document
00034 * will be expanded. However, external DTD's are not read. Notice also that
00035 * it is not yet possible to associate default xml:space attributes to tags.
00036 ******************************************************************************/
00037 
00038 struct xml_html_parser {
00039   bool html;
00040   parse_string s;
00041   hashmap<string,string> entities;
00042   array<tree> a;
00043   int i, n;
00044   tree stack;
00045 
00046   xml_html_parser ();
00047   inline void skip_space () {
00048     while (s && is_space (s[0])) s += 1; }
00049   inline bool is_name_char (char c) {
00050     return is_alpha (c) || is_digit (c) ||
00051       (c == '_') || (c == ':') || (c == '.') || (c == '-') ||
00052       (((int) ((unsigned char) c)) >= 128); }
00053 
00054   string transcode (string s);
00055 
00056   string parse_until (string what);
00057   string parse_name ();
00058   string parse_quoted ();
00059   string expand_entity (string s);
00060   string expand_entities (string s);
00061   string parse_entity ();
00062   tree parse_attribute ();
00063   tree parse_opening ();
00064   tree parse_closing ();
00065   tree parse_pi ();
00066   tree parse_comment ();
00067   tree parse_cdata ();
00068   tree parse_misc ();
00069   void parse ();
00070 
00071   tree parse_system ();
00072   tree parse_public ();
00073   tree parse_element ();
00074   tree parse_attlist ();
00075   void parse_entity_decl ();
00076   tree parse_notation ();
00077   tree parse_doctype ();
00078 
00079   // NOTE: these routines should remain there even if they are not used
00080   bool finalize_preserve_space (string tag);
00081   string finalize_space (string s, bool first, bool last);
00082   tree finalize_space (tree t);
00083   // END NOTE
00084   bool build_valid_child (string parent, string child);
00085   bool build_must_close (string tag);
00086   bool build_can_close (string tag);
00087   void build (tree& r);
00088 
00089   tree finalize_sxml (tree t);
00090   tree parse (string s);
00091 };
00092 
00093 /******************************************************************************
00094 * Initialization
00095 ******************************************************************************/
00096 
00097 static hashset<string> html_empty_tag_table;
00098 static hashset<string> html_auto_close_table;
00099 static hashset<string> html_block_table;
00100 static hashmap<string,string> html_entity ("");
00101 
00102 void load_html_entities (hashmap<string, string> table, string fname) {
00103   string s;
00104   if (DEBUG_VERBOSE) cout << "TeXmacs] Loading " << fname << "\n";
00105   if (load_string (url ("$TEXMACS_PATH/langs/encoding", fname), s, false)) return;
00106   tree t= block_to_scheme_tree (s);
00107   if (!is_tuple (t)) return;
00108 
00109   int i, n= N(t);
00110   for (i=0; i<n; i++)
00111     if (is_func (t[i], TUPLE, 2) &&
00112        is_atomic (t[i][0]) && is_atomic (t[i][1]))
00113       {
00114        string l= t[i][0]->label; if (is_quoted (l)) l= scm_unquote (l);
00115        string r= t[i][1]->label; if (is_quoted (r)) r= scm_unquote (r);
00116        table (l)= r;
00117       }
00118 }
00119 
00120 xml_html_parser::xml_html_parser (): entities ("") {
00121   if (N(html_empty_tag_table) == 0) {
00122     html_empty_tag_table->insert ("basefont");
00123     html_empty_tag_table->insert ("br");
00124     html_empty_tag_table->insert ("area");
00125     html_empty_tag_table->insert ("link");
00126     html_empty_tag_table->insert ("param");
00127     html_empty_tag_table->insert ("hr");
00128     html_empty_tag_table->insert ("input");
00129     html_empty_tag_table->insert ("col");
00130     html_empty_tag_table->insert ("frame");
00131     html_empty_tag_table->insert ("isindex");
00132     html_empty_tag_table->insert ("base");
00133     html_empty_tag_table->insert ("meta");
00134     html_empty_tag_table->insert ("img");
00135   }
00136 
00137   if (N(html_auto_close_table) == 0) {
00138     html_auto_close_table->insert ("body");
00139     html_auto_close_table->insert ("p");
00140     html_auto_close_table->insert ("dt");
00141     html_auto_close_table->insert ("dd");
00142     html_auto_close_table->insert ("li");
00143     html_auto_close_table->insert ("option");
00144     html_auto_close_table->insert ("thead");
00145     html_auto_close_table->insert ("tfoot");
00146     html_auto_close_table->insert ("tbody");
00147     html_auto_close_table->insert ("colgroup");
00148     html_auto_close_table->insert ("tr");
00149     html_auto_close_table->insert ("th");
00150     html_auto_close_table->insert ("td");
00151     html_auto_close_table->insert ("head");
00152     html_auto_close_table->insert ("html");
00153   }
00154 
00155   if (N(html_block_table) == 0) {
00156     html_block_table->insert ("h1");
00157     html_block_table->insert ("h2");
00158     html_block_table->insert ("h3");
00159     html_block_table->insert ("h4");
00160     html_block_table->insert ("h5");
00161     html_block_table->insert ("h6");
00162     html_block_table->insert ("ul");
00163     html_block_table->insert ("ol");
00164     html_block_table->insert ("li");
00165     html_block_table->insert ("dl");
00166     html_block_table->insert ("dd");
00167     html_block_table->insert ("dt");
00168     html_block_table->insert ("pre");
00169     html_block_table->insert ("div");
00170     html_block_table->insert ("p");
00171     html_block_table->insert ("noscript");
00172     html_block_table->insert ("blockquote");
00173     html_block_table->insert ("form");
00174     html_block_table->insert ("hr");
00175     html_block_table->insert ("table");
00176     html_block_table->insert ("fieldset");
00177     html_block_table->insert ("address");
00178   }
00179 
00180   if (N (html_entity) == 0) {
00181     load_html_entities (html_entity, "HTMLlat1.scm");
00182     load_html_entities (html_entity, "HTMLspecial.scm");
00183     load_html_entities (html_entity, "HTMLsymbol.scm");
00184   }
00185 }
00186 
00187 /******************************************************************************
00188 * Transcoding input to UTF-8
00189 ******************************************************************************/
00190 
00191 // TODO: support BOM and other bells and whistles
00192 // http://www.w3.org/TR/REC-xml#sec-guessing
00193 
00194 // TODO: support HTML http-equiv Content-Type
00195 // http://www.w3.org/TR/html4/charset.html#h-5.2.2
00196 
00197 // Currently, the input encoding is expected to be ASCII-compatible.
00198 // If no <?xml?> prolog is found, the encoding is assumed to be UTF-8 or
00199 // ISO-8859-1 if iconv cannot perform an utf8->utf8 conversion.
00200 
00201 string
00202 xml_html_parser::transcode (string s2) {
00203   s= parse_string (s2);
00204 
00205   string encoding;
00206   if (test (s, "<?")) {
00207     s += 2;
00208     string target= parse_name ();
00209     skip_space ();
00210     if (target == "xml") {
00211       // since html==true implies we can accept horribly broken HTML, the
00212       // presence of an XML prolog is not enough to clear the flag.
00213       /* html= false; */
00214       while (s && !test (s, "?>")) {
00215        string attname= parse_name ();
00216        skip_space ();
00217        if (!test (s, "=")) break;
00218        s += 1;
00219        skip_space ();
00220        string val;
00221        if (test (s, "\"")) {
00222          s += 1;
00223          val= parse_until ("\"");
00224          skip_space ();       
00225        }
00226        else if (test (s, "'")) {
00227          s += 1;
00228          val= parse_until ("'");
00229          skip_space ();
00230        }
00231        if (attname == "encoding") {
00232          encoding= upcase_all (val);
00233          break;
00234        }
00235       }
00236     }
00237   }
00238 
00239   if (N(encoding) != 0) {
00240     // cout << "encoding was specified\n" ;
00241     string s3= convert (s2, encoding, "UTF-8");
00242     if (N(s3) == 0)
00243       /* conversion from specified charset failed, do nothing (and pray) */ ;
00244     else return s3;
00245   }
00246   else {
00247     // cout << "guess encoding\n" ;
00248     if (check_encoding (s2, "UTF-8"))
00249       /* input encoding seems to be utf-8, do nothing */ ;
00250     else {
00251       string s3= convert (s2, "ISO-8859-1", "UTF-8");
00252       if (N(s3) != 0) return s3;
00253     }
00254   }
00255 
00256   return s2;
00257 }
00258 
00259 /******************************************************************************
00260 * Parsing without structuring
00261 ******************************************************************************/
00262 
00263 string
00264 xml_html_parser::parse_until (string what) {
00265   string r;
00266   while (s && !test (s, what)) r << s->read (1);
00267   if (test (s, what)) s += N(what);
00268   return expand_entities (r);
00269 }
00270 
00271 string
00272 xml_html_parser::parse_name () {
00273   string r;
00274   while (s && is_name_char (s[0])) r << s->read (1);
00275   if (html) return locase_all (r);
00276   return expand_entities (r);
00277 }
00278 
00279 string
00280 xml_html_parser::expand_entity (string s) {
00281   if (entities->contains (s)) return entities[s];
00282   else if (s[0] == '&') {
00283     if (N(s)>1 && s[1] == '#') {
00284       int i= 2;
00285       bool okay= false;
00286       string r= convert_char_entity (s, i, okay);
00287       if (okay) return r;
00288       return s;
00289     }
00290     else if (html) {
00291       string ss= s (1, s [N(s)-1] == ';' ? N(s)-1 : N(s));
00292       if (html_entity->contains (ss))
00293        // HTML entity references expand to character references
00294        // so they need to be finalized a second time.
00295        return expand_entity (html_entity [ss]);
00296     }
00297   }
00298   return s;
00299 }
00300 
00301 string
00302 xml_html_parser::expand_entities (string s) {
00303   string r;
00304   int i, n= N(s);
00305   for (i=0; i<n; ) {
00306     if (s[i] == '&' || s[i] == '%') {
00307       int start= i++;
00308       if (i<n && s[i] == '#') {
00309        i++;
00310        if (i<n && (s[i] == 'x' || s[i] == 'X')) {
00311          i++;
00312          while (i<n && is_hex_digit (s[i])) i++;
00313        }
00314        else while (i<n && is_digit (s[i])) i++;
00315       }
00316       else while (i<n && is_name_char (s[i])) i++;
00317       if (i<n && s[i] == ';') i++;
00318       r << expand_entity (s (start, i));
00319     }
00320     else r << s[i++];
00321   }
00322   if (r == s) return r;
00323   return expand_entities (r);
00324 }
00325 
00326 string
00327 xml_html_parser::parse_entity () {
00328   string r= s->read (1);
00329   if (test (s, "#")) {
00330     r << s->read (1);
00331     if (test (s, "x") || test (s, "X")) {
00332       r << s->read (1);
00333       while (s && is_hex_digit (s[0])) r << s->read (1);
00334     }
00335     else while (s && is_digit (s[0])) r << s->read (1);
00336   }
00337   else while (s && is_name_char (s[0])) r << s->read (1);
00338   if (test (s, ";")) r << s->read (1);
00339   string x= expand_entity (r);
00340   if (x == r || r == "&lt;" || r == "&amp;") return x;
00341   s->write (x);
00342   return "";
00343 }
00344 
00345 string
00346 xml_html_parser::parse_quoted () {
00347   if (test (s, "\42")) {
00348     s += 1;
00349     return parse_until ("\42");
00350   }
00351   if (test (s, "'")) {
00352     s += 1;
00353     return parse_until ("'");
00354   }
00355   return "";
00356 }
00357 
00358 tree
00359 xml_html_parser::parse_attribute () {
00360   string attr= parse_name (), val;
00361   bool no_val= false;
00362   skip_space ();
00363   if (test (s, "=")) s += 1;
00364   skip_space ();
00365   if (test (s, "\42") || test (s, "'"))
00366     val= parse_quoted ();
00367   else { // for Html
00368     string r;
00369     while (s) {
00370       if (is_space (s[0]) || (s[0]=='<') || (s[0]=='>')) break;
00371       r << s->read (1);
00372     }
00373     val   = r;
00374     no_val= N(r) == 0;
00375   }
00376   if (!no_val) return tuple ("attr", attr, val);
00377   else if (attr != "") return tuple ("attr", attr);
00378   else return tuple ("attr");
00379 }
00380 
00381 tree
00382 xml_html_parser::parse_opening () {
00383   s += 1;
00384   string name= parse_name ();
00385   tree t= tuple ("begin", name);
00386   while (true) {
00387     skip_space ();
00388     if (!s || s[0] == '>' || test (s, "/>")) break;
00389     tree attr= parse_attribute ();
00390     if (attr == tuple ("attr")) break;
00391     t << attr;
00392   }
00393   if (test (s, "/>")) { t[0]= "tag"; s += 2; }
00394   else if (test (s, ">")) s += 1;
00395   return t;
00396 }
00397 
00398 tree
00399 xml_html_parser::parse_closing () {
00400   s += 2;
00401   string name= parse_name ();
00402   (void) parse_until (">");
00403   return tuple ("end", name);
00404 }
00405 
00406 tree
00407 xml_html_parser::parse_pi () {
00408   s += 2;
00409   string name= parse_name ();
00410   skip_space ();
00411   return tuple ("pi", name, parse_until ("?>"));
00412 }
00413 
00414 tree
00415 xml_html_parser::parse_comment () {
00416   s += 4;
00417   return tuple ("comment", parse_until ("-->"));
00418 }
00419 
00420 tree
00421 xml_html_parser::parse_cdata () {
00422   s += 9;
00423   return tuple ("cdata", parse_until ("]]>"));
00424 }
00425 
00426 tree
00427 xml_html_parser::parse_misc () {
00428   s += 2;
00429   tree t= tuple ("misc");
00430   while (true) {
00431     skip_space ();
00432     if (test (s, ">")) { s += 1; break; }
00433     string r;
00434     while (s) {
00435       if (is_space (s[0]) || (s[0] == '>')) break;
00436       r << s->read (1);
00437     }
00438     t << r;
00439   }
00440   return t;
00441 }
00442 
00443 void
00444 xml_html_parser::parse () {
00445   string r;
00446   while (s) {
00447     if (s[0] == '<') {
00448       if (N(r) != 0) { a << tree (r); }
00449       if (test (s, "</")) a << parse_closing ();
00450       else if (test (s, "<?")) a << parse_pi ();
00451       else if (test (s, "<!--")) a << parse_comment ();
00452       else if (test (s, "<![CDATA[")) a << parse_cdata ();
00453       else if (test (s, "<!DOCTYPE")) a << parse_doctype ();
00454       else if (test (s, "<!")) a << parse_misc ();
00455       else a << parse_opening ();
00456       r= "";
00457     }
00458     else if (s[0] == '&') r << parse_entity ();
00459     else r << s->read (1);
00460   }
00461   if (N(r) != 0) a << tree (r);
00462 }
00463 
00464 /******************************************************************************
00465 * Parsing the document type
00466 ******************************************************************************/
00467 
00468 tree
00469 xml_html_parser::parse_system () {
00470   s += 6;
00471   tree st= tuple ("system");
00472   skip_space ();
00473   st << parse_quoted ();
00474   return st;
00475 }
00476 
00477 tree
00478 xml_html_parser::parse_public () {
00479   s += 6;
00480   tree st= tuple ("public");
00481   skip_space ();
00482   st << parse_quoted ();
00483   skip_space ();
00484   st << parse_quoted ();
00485   return st;
00486 }
00487 
00488 tree
00489 xml_html_parser::parse_element () {
00490   s += 9;
00491   return tuple ("element", parse_until (">"));
00492 }
00493 
00494 tree
00495 xml_html_parser::parse_attlist () {
00496   s += 9;
00497   return tuple ("attlist", parse_until (">"));
00498 }
00499 
00500 void
00501 xml_html_parser::parse_entity_decl () {
00502   s += 8;
00503   skip_space ();
00504   bool parameter= test (s, "%");
00505   if (parameter) { s += 1; skip_space (); }
00506   string name= parse_name ();
00507   if (parameter) name= "%" * name * ";";
00508   else name= "&" * name * ";";
00509   skip_space ();
00510 
00511   if (test (s, "SYSTEM") || test (s, "PUBLIC")) {
00512     // TODO: allow for loading of external entities using wget
00513     if (test (s, "SYSTEM")) (void) parse_system ();
00514     else (void) parse_public ();
00515     skip_space ();
00516     if (test (s, "NDATA")) {
00517       s += 5;
00518       skip_space ();
00519       (void) parse_name ();
00520     }
00521   }
00522   else {
00523     string val= parse_quoted ();
00524     val= expand_entities (val);
00525     entities (name) = val;
00526     // cout << name << " := " << val << "\n";
00527   }
00528 
00529   skip_space ();
00530   if (test (s, ">")) s += 1;
00531 }
00532 
00533 tree
00534 xml_html_parser::parse_notation () {
00535   s += 10;
00536   return tuple ("notation", parse_until (">"));
00537 }
00538 
00539 tree
00540 xml_html_parser::parse_doctype () {
00541   s += 9;
00542   tree dt= tuple ("doctype");
00543   skip_space ();
00544   dt << parse_name ();
00545   skip_space ();
00546   if (test (s, "SYSTEM")) dt << parse_system ();
00547   else if (test (s, "PUBLIC")) dt << parse_public ();
00548   skip_space ();
00549 
00550   if (test (s, "[")) {
00551     s += 1;
00552     while (s) {
00553       skip_space ();
00554       if (test (s, "]")) { s += 1; break; }
00555       else if (test (s, "<!ELEMENT")) dt << parse_element ();
00556       else if (test (s, "<!ATTLIST")) dt << parse_cdata ();
00557       else if (test (s, "<!ENTITY")) parse_entity_decl ();
00558       else if (test (s, "<!NOTATION")) a << parse_notation ();
00559       else if (test (s, "<?")) dt << parse_pi ();
00560       else if (test (s, "<!--")) dt << parse_comment ();
00561       else if (s[0] == '&' || s[0] == '%') (void) parse_entity ();
00562       else s += 1;
00563     }
00564   }
00565 
00566   skip_space ();
00567   if (test (s, ">")) s += 1;
00568   return dt;
00569 }
00570 
00571 /******************************************************************************
00572 * Building the structured parse tree with error correction
00573 ******************************************************************************/
00574 
00575 bool
00576 xml_html_parser::build_valid_child (string parent, string child) {
00577   if (!html) return true;
00578   if ((parent == "<bottom>") || (parent == "html") || (parent == "body"))
00579     return true;
00580   if (html_empty_tag_table->contains (parent)) return false;
00581   if (!html_auto_close_table->contains (child)) return true;
00582   if (parent == "p") return !html_block_table->contains (child);
00583   if ((child == "dt") || (child == "dd")) return parent == "dl";
00584   if (child == "li")
00585     return (parent == "ul") || (parent == "ol") ||
00586            (parent == "dir") || (parent == "menu");
00587   if (child == "option") return (parent == "select") || (parent == "optgroup");
00588   if ((child == "thead") || (child == "tfoot") || (child == "tbody"))
00589     return parent == "table";
00590   if (child == "colgroup") return parent == "table";
00591   if (child == "col") return (parent == "table") || (parent == "colgroup");
00592   if (child == "tr")
00593     return (parent == "table") || (parent == "thead") ||
00594            (parent == "tfoot") || (parent == "tbody");
00595   if ((child == "th") || (child == "td"))
00596     return (parent == "tr") ||
00597            (parent == "table") || (parent == "thead") ||
00598            (parent == "tfoot") || (parent == "tbody");
00599   return true;
00600 }
00601 
00602 bool
00603 xml_html_parser::build_must_close (string tag) {
00604   if (build_valid_child (stack[0]->label, tag)) return false;
00605   // if !html, we have already returned false
00606   tree counter= stack;
00607   while (counter != tuple ("<bottom>")) {
00608     if (build_valid_child (counter[0]->label, tag)) return true;
00609     counter= counter[1];
00610   }
00611   // since <html> and <body> can have any child we only get here when parsing
00612   // something where both are omitted and we can close nodes up to the root.
00613   return true;
00614 }
00615 
00616 bool
00617 xml_html_parser::build_can_close (string tag) {
00618   tree counter= stack[1];
00619   while (counter != tuple ("<bottom>")) {
00620     if (counter[0]->label == tag) return true;
00621     counter= counter[1];
00622   }
00623   return false;
00624 }
00625 
00626 void
00627 xml_html_parser::build (tree& r) {
00628   while (i<n) {
00629     if (is_tuple (a[i], "begin")) {
00630       string name= a[i][1]->label;
00631       if (build_must_close (name)) return;
00632       tree sub= copy (a[i]); sub[0]= "tag";
00633       i++;
00634       if (html_empty_tag_table->contains (name))
00635        r << sub;
00636       else {
00637        stack= tuple (name, stack);
00638        build (sub);
00639        r << sub;
00640        stack= stack[1];
00641       }
00642     }
00643     else if (is_tuple (a[i], "end")) {
00644       if (stack[0]->label == a[i][1]->label) { i++; return; }
00645       if (build_can_close (a[i][1]->label)) return;
00646       i++;
00647     }
00648     else r << a[i++];
00649   }
00650 }
00651 
00652 /******************************************************************************
00653 * Finalization
00654 ******************************************************************************/
00655 
00656 bool
00657 xml_html_parser::finalize_preserve_space (string tag) {
00658   return tag == "pre";
00659 }
00660 
00661 string
00662 xml_html_parser::finalize_space (string s, bool first, bool last) {
00663   int i, n= N(s);
00664   string r;
00665   bool flag= first;
00666   for (i=0; i<n; i++)
00667     if (is_space (s[i])) {
00668       if (!flag) r << ' ';
00669       flag= true;
00670     }
00671     else {
00672       r << s[i];
00673       flag= false;
00674     }
00675   n= N(r);
00676   if (last && (n>0) && (r[n-1] == ' '))
00677     r->resize (n-1);
00678   return r;
00679 }
00680 
00681 tree
00682 xml_html_parser::finalize_space (tree t) {
00683   if (is_atomic (t) || (!is_tuple (t, "tag"))) return t;
00684   else {
00685     int i, n= N(t);
00686     tree r= tuple (t[0], t[1]);
00687     int first= -1, last= -1;
00688     for (i=2; i<n; i++)
00689       if (!is_tuple (t[i], "attr")) {
00690        first= i; break;
00691       }
00692     if (!is_tuple (t[n-1], "attr"))
00693       last= n-1;
00694     (void) first; (void) last;
00695     for (i=2; i<n; i++) {
00696       if (is_atomic (t[i])) {
00697        if (finalize_preserve_space (t[1]->label)) r << t[i];
00698        else {
00699          string s= finalize_space (t[i]->label, i==2, i==(n-1));
00700          if (s != "") r << s;
00701        }
00702       }
00703       else if (is_tuple (t[i], "tag")) r << finalize_space (t[i]);
00704       else r << t[i];
00705     }
00706     return r;
00707   }
00708 }
00709 
00710 static string
00711 simple_quote (string s) {
00712   return "\"" * s * "\"";
00713 }
00714 
00715 tree
00716 xml_html_parser::finalize_sxml (tree t) {
00717   if (!is_tuple (t, "tag")) return ""; // sanity
00718   int i, n= N(t);
00719   tree tag = tuple (t[1]);
00720   if (t[1] == "<document>") tag= tuple ("*TOP*");
00721   tree attrs = tuple ("@");
00722   tree content = tuple ();
00723   for (i=2; i<n; i++)
00724     if (is_tuple (t[i], "attr")) {
00725       tree attr;
00726       if (N(t[i]) == 2) attr= tuple (t[i][1]);
00727       else attr= tuple (t[i][1]->label, simple_quote (t[i][2]->label));
00728       attrs << attr;
00729     }
00730     else if (is_tuple (t[i], "tag"))
00731       content << finalize_sxml (t[i]);
00732     else if (is_atomic (t[i]))
00733       content << simple_quote (t[i]->label);
00734     else if (is_tuple (t[i], "pi"))
00735       content << tuple ("*PI*", t[i][1]->label, simple_quote (t[i][2]->label));
00736     else if (is_tuple (t[i], "doctype"))
00737       // TODO: convert DTD declarations
00738       content << tuple ("*DOCTYPE*", simple_quote (t[i][1]->label));
00739     else if (is_tuple (t[i], "cdata"))
00740       content << simple_quote (t[i][1]->label);
00741   if (N(attrs) > 1) tag << attrs;
00742   tag << A(content);
00743   return tag;
00744 }
00745 
00746 /******************************************************************************
00747 * Main parsing routine: tokenizing, building and finalization
00748 ******************************************************************************/
00749 
00750 tree
00751 xml_html_parser::parse (string s2) {
00752   // end of line handling
00753   string s3;
00754   i= 0, n= N(s2);
00755   bool is_cr= false;
00756   while (i<n) {
00757     bool prev_is_cr= is_cr;
00758     is_cr= false;
00759     char c= s2[i];
00760     if (c == '\15') {
00761       s3 << '\12';
00762       is_cr= true;
00763     }
00764     else if (prev_is_cr && (c == '\12')) /* no-op */;
00765     else s3 << c;
00766     i++;
00767   }
00768   s2= s3;
00769 
00770   // cout << "Transcoding " << s2 << "\n";
00771   if (html) s2= transcode (s2);
00772   // cout << HRULE << LF;
00773   s= parse_string (s2);
00774   //cout << "Parsing " << s << "\n";
00775   parse ();
00776   // cout << HRULE << LF;
00777   // cout << "a= " << a << "\n";
00778   i= 0; n= N(a); stack= tuple ("<bottom>");
00779   tree r= tuple ("tag", "<document>");
00780   build (r);
00781   // cout << HRULE << LF;
00782   // print_tree (r);
00783   r= finalize_sxml (r);
00784   // cout << HRULE << LF;
00785   // print_tree (r);
00786   return r;
00787 }
00788 
00789 /******************************************************************************
00790 * Interface
00791 ******************************************************************************/
00792 
00793 tree
00794 parse_xml (string s) {
00795   xml_html_parser parser;
00796   parser.html= false;
00797   tree t= parser.parse (s);
00798   return t;
00799 }
00800 
00801 tree
00802 parse_html (string s) {
00803   xml_html_parser parser;
00804   parser.html= true;
00805   tree t= parser.parse (s);
00806   return t;
00807 }