Back to index

texmacs  1.0.7.15
Public Member Functions | Public Attributes
xml_html_parser Struct Reference
Collaboration diagram for xml_html_parser:
Collaboration graph
[legend]

List of all members.

Public Member Functions

 xml_html_parser ()
void skip_space ()
bool is_name_char (char c)
string transcode (string s)
string parse_until (string what)
string parse_name ()
string parse_quoted ()
string expand_entity (string s)
string expand_entities (string s)
string parse_entity ()
tree parse_attribute ()
tree parse_opening ()
tree parse_closing ()
tree parse_pi ()
tree parse_comment ()
tree parse_cdata ()
tree parse_misc ()
void parse ()
tree parse_system ()
tree parse_public ()
tree parse_element ()
tree parse_attlist ()
void parse_entity_decl ()
tree parse_notation ()
tree parse_doctype ()
bool finalize_preserve_space (string tag)
string finalize_space (string s, bool first, bool last)
tree finalize_space (tree t)
bool build_valid_child (string parent, string child)
bool build_must_close (string tag)
bool build_can_close (string tag)
void build (tree &r)
tree finalize_sxml (tree t)
tree parse (string s)

Public Attributes

bool html
parse_string s
hashmap< string, string > entities
array< tree > a
int i
int n
tree stack

Detailed Description

Definition at line 38 of file parsexml.cpp.


Constructor & Destructor Documentation

Definition at line 120 of file parsexml.cpp.

                                 : entities ("") {
  // Every lookup table is filled only while it is still empty. The tables
  // are not members of this struct (presumably they are shared between
  // parser instances), so later constructions skip the work.
  if (N(html_empty_tag_table) == 0) {
    // elements that never have content
    static const char* const empty_tags[]= {
      "basefont", "br", "area", "link", "param", "hr", "input",
      "col", "frame", "isindex", "base", "meta", "img" };
    const int nr= (int) (sizeof (empty_tags) / sizeof (empty_tags[0]));
    for (int k=0; k<nr; k++)
      html_empty_tag_table->insert (empty_tags[k]);
  }

  if (N(html_auto_close_table) == 0) {
    // elements whose end tag may be left out
    static const char* const auto_close_tags[]= {
      "body", "p", "dt", "dd", "li", "option", "thead", "tfoot",
      "tbody", "colgroup", "tr", "th", "td", "head", "html" };
    const int nr=
      (int) (sizeof (auto_close_tags) / sizeof (auto_close_tags[0]));
    for (int k=0; k<nr; k++)
      html_auto_close_table->insert (auto_close_tags[k]);
  }

  if (N(html_block_table) == 0) {
    // block-level elements
    static const char* const block_tags[]= {
      "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", "li", "dl", "dd",
      "dt", "pre", "div", "p", "noscript", "blockquote", "form", "hr",
      "table", "fieldset", "address" };
    const int nr= (int) (sizeof (block_tags) / sizeof (block_tags[0]));
    for (int k=0; k<nr; k++)
      html_block_table->insert (block_tags[k]);
  }

  if (N (html_entity) == 0) {
    // named HTML entities, loaded from three definition files
    load_html_entities (html_entity, "HTMLlat1.scm");
    load_html_entities (html_entity, "HTMLspecial.scm");
    load_html_entities (html_entity, "HTMLsymbol.scm");
  }
}

Here is the call graph for this function:


Member Function Documentation

Definition at line 627 of file parsexml.cpp.

                               {
  // Consumes the flat node list a[i..n) and appends the nested result to r.
  // "begin" tuples open an element, "end" tuples close one, and every other
  // node is appended to r unchanged. The names of the currently open
  // elements are kept in stack as nested (name, rest) tuples.
  while (i<n) {
    if (is_tuple (a[i], "begin")) {
      string name= a[i][1]->label;
      // the new element may not nest here: return with a[i] unconsumed so
      // that the calling level can deal with it
      if (build_must_close (name)) return;
      tree sub= copy (a[i]); sub[0]= "tag";
      i++;
      // elements from the empty-tag table get no children
      if (html_empty_tag_table->contains (name))
       r << sub;
      else {
       // push the name while the children are collected, pop it afterwards
       stack= tuple (name, stack);
       build (sub);
       r << sub;
       stack= stack[1];
      }
    }
    else if (is_tuple (a[i], "end")) {
      // matching end tag: consume it and finish the current element
      if (stack[0]->label == a[i][1]->label) { i++; return; }
      // the tag matches an element further up the stack: return without
      // consuming it so that the outer calls see it as well
      if (build_can_close (a[i][1]->label)) return;
      // end tag without any matching open element: skip it
      i++;
    }
    else r << a[i++];
  }
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 617 of file parsexml.cpp.

                                            {
  // Returns true when an element named tag is open at some level above the
  // innermost open element, i.e. when an end tag for it is allowed to close
  // the elements nested inside it.
  //
  // With no element open, stack is the one-element ("<bottom>") sentinel and
  // has no second component, so stack[1] must not be read in that case.
  if (stack == tuple ("<bottom>")) return false;
  tree counter= stack[1];
  while (counter != tuple ("<bottom>")) {
    if (counter[0]->label == tag) return true;
    counter= counter[1];
  }
  return false;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 603 of file parsexml.cpp.

                                             {
  // Tells build whether opening an element named tag forces the innermost
  // open element to be closed first.
  const bool fits_here= build_valid_child (stack[0]->label, tag);
  if (fits_here) return false;
  // if !html, we have already returned false
  for (tree level= stack; level != tuple ("<bottom>"); level= level[1])
    if (build_valid_child (level[0]->label, tag)) return true;
  // since <html> and <body> can have any child we only get here when parsing
  // something where both are omitted and we can close nodes up to the root.
  return true;
}

Here is the call graph for this function:

Here is the caller graph for this function:

bool xml_html_parser::build_valid_child ( string  parent,
string  child 
)

Definition at line 576 of file parsexml.cpp.

                                                               {
  // Decides whether an element named child may be placed directly inside an
  // open element named parent. Outside HTML mode everything is accepted.
  if (!html) return true;
  // the stack bottom, <html> and <body> accept any child
  if ((parent == "<bottom>") || (parent == "html") || (parent == "body"))
    return true;
  // elements from the empty-tag table take no children at all
  if (html_empty_tag_table->contains (parent)) return false;
  // only children listed in the auto-close table are restricted below
  if (!html_auto_close_table->contains (child)) return true;
  if (parent == "p") return !html_block_table->contains (child);
  if ((child == "dt") || (child == "dd")) return parent == "dl";
  if (child == "li")
    return (parent == "ul") || (parent == "ol") ||
           (parent == "dir") || (parent == "menu");
  if (child == "option") return (parent == "select") || (parent == "optgroup");
  if ((child == "thead") || (child == "tfoot") || (child == "tbody"))
    return parent == "table";
  if (child == "colgroup") return parent == "table";
  // NOTE(review): "col" is not among the entries that the constructor shown
  // in this file inserts into html_auto_close_table, so this test looks
  // unreachable -- confirm.
  if (child == "col") return (parent == "table") || (parent == "colgroup");
  if (child == "tr")
    return (parent == "table") || (parent == "thead") ||
           (parent == "tfoot") || (parent == "tbody");
  if ((child == "th") || (child == "td"))
    return (parent == "tr") ||
           (parent == "table") || (parent == "thead") ||
           (parent == "tfoot") || (parent == "tbody");
  return true;
}

Here is the caller graph for this function:

Definition at line 302 of file parsexml.cpp.

                                          {
  // Expands every entity-like reference in s. Each run that starts with '&'
  // or '%' is cut out (numeric "#..." / "#x..." form or a name, plus an
  // optional trailing ';') and replaced by the result of expand_entity.
  // The whole pass is repeated on the output until a pass changes nothing.
  // NOTE(review): if an expansion can reproduce its own reference, the
  // recursion at the end would not terminate -- confirm this cannot happen.
  string r;
  // locals; they hide the members i and n
  int i, n= N(s);
  for (i=0; i<n; ) {
    if (s[i] == '&' || s[i] == '%') {
      int start= i++;
      if (i<n && s[i] == '#') {
       i++;
       if (i<n && (s[i] == 'x' || s[i] == 'X')) {
         i++;
         while (i<n && is_hex_digit (s[i])) i++;
       }
       else while (i<n && is_digit (s[i])) i++;
      }
      else while (i<n && is_name_char (s[i])) i++;
      // the terminating ';' is optional
      if (i<n && s[i] == ';') i++;
      r << expand_entity (s (start, i));
    }
    else r << s[i++];
  }
  // fixed point reached: nothing was expanded in this pass
  if (r == s) return r;
  return expand_entities (r);
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 280 of file parsexml.cpp.

                                        {
  // Expands one reference s, given with its leading '&' or '%'. Lookup
  // order: the entities map of this parser, then numeric character
  // references, then (HTML mode only) the html_entity table. A reference
  // that cannot be resolved is returned unchanged.
  if (entities->contains (s)) return entities[s];
  else if (s[0] == '&') {
    if (N(s)>1 && s[1] == '#') {
      // numeric reference; convert_char_entity is defined elsewhere and
      // presumably reports success through okay
      int i= 2;
      bool okay= false;
      string r= convert_char_entity (s, i, okay);
      if (okay) return r;
      return s;
    }
    else if (html) {
      // strip the leading '&' and, when present, the trailing ';'
      string ss= s (1, s [N(s)-1] == ';' ? N(s)-1 : N(s));
      if (html_entity->contains (ss))
       // HTML entity references expand to character references
       // so they need to be finalized a second time.
       return expand_entity (html_entity [ss]);
    }
  }
  return s;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 657 of file parsexml.cpp.

                                                    {
  // Whitespace is kept verbatim only inside <pre>.
  const bool keep_verbatim= (tag == "pre");
  return keep_verbatim;
}

Here is the caller graph for this function:

string xml_html_parser::finalize_space ( string  s,
bool  first,
bool  last 
)

Definition at line 662 of file parsexml.cpp.

                                                                {
  // Collapses every run of whitespace in s into a single ' '.
  // With first set, whitespace at the very start is dropped entirely;
  // with last set, a single trailing ' ' is removed from the result.
  string out;
  bool after_space= first;
  const int len= N(s);
  int pos= 0;
  while (pos < len) {
    const char c= s[pos++];
    if (!is_space (c)) {
      out << c;
      after_space= false;
      continue;
    }
    // emit one blank for the first whitespace character of a run only
    if (!after_space) out << ' ';
    after_space= true;
  }
  if (!last) return out;
  const int out_len= N(out);
  if ((out_len > 0) && (out[out_len-1] == ' '))
    out->resize (out_len-1);
  return out;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 682 of file parsexml.cpp.

                                       {
  // Normalises whitespace in a ("tag", name, children...) tree and returns
  // the rebuilt tree; anything that is not such a tuple is returned as is.
  //
  // Text children are passed through finalize_space (string, bool, bool),
  // unless the element preserves whitespace (see finalize_preserve_space).
  // Leading whitespace is dropped for the child at position 2 and trailing
  // whitespace for the child at position n-1; text that becomes empty is
  // omitted. Nested "tag" children are processed recursively and all other
  // children (attributes etc.) are copied unchanged.
  if (is_atomic (t) || (!is_tuple (t, "tag"))) return t;
  else {
    int i, n= N(t);
    tree r= tuple (t[0], t[1]);
    for (i=2; i<n; i++) {
      if (is_atomic (t[i])) {
        if (finalize_preserve_space (t[1]->label)) r << t[i];
        else {
          string s= finalize_space (t[i]->label, i==2, i==(n-1));
          if (s != "") r << s;
        }
      }
      else if (is_tuple (t[i], "tag")) r << finalize_space (t[i]);
      else r << t[i];
    }
    return r;
  }
}

Here is the call graph for this function:

Definition at line 716 of file parsexml.cpp.

                                      {
  // Converts a ("tag", name, children...) tree into an SXML-style tuple
  //   (name [("@" attribute...)] content...)
  // The synthetic "<document>" root is renamed to "*TOP*".
  if (!is_tuple (t, "tag")) return ""; // sanity
  int i, n= N(t);
  tree tag = tuple (t[1]);
  if (t[1] == "<document>") tag= tuple ("*TOP*");
  tree attrs = tuple ("@");
  tree content = tuple ();
  for (i=2; i<n; i++)
    if (is_tuple (t[i], "attr")) {
      // a valueless attribute keeps only its name
      tree attr;
      if (N(t[i]) == 2) attr= tuple (t[i][1]);
      else attr= tuple (t[i][1]->label, simple_quote (t[i][2]->label));
      attrs << attr;
    }
    else if (is_tuple (t[i], "tag"))
      content << finalize_sxml (t[i]);
    else if (is_atomic (t[i]))
      content << simple_quote (t[i]->label);
    else if (is_tuple (t[i], "pi"))
      content << tuple ("*PI*", t[i][1]->label, simple_quote (t[i][2]->label));
    else if (is_tuple (t[i], "doctype"))
      // TODO: convert DTD declarations
      content << tuple ("*DOCTYPE*", simple_quote (t[i][1]->label));
    else if (is_tuple (t[i], "cdata"))
      content << simple_quote (t[i][1]->label);
  // children of any other kind (e.g. "comment" tuples) are dropped above
  // the "@" list is emitted only when at least one attribute was collected
  if (N(attrs) > 1) tag << attrs;
  tag << A(content);
  return tag;
}

Here is the call graph for this function:

Here is the caller graph for this function:

bool xml_html_parser::is_name_char ( char  c) [inline]

Definition at line 49 of file parsexml.cpp.

                                    {
    // Name characters: letters, digits, '_', ':', '.', '-' and every byte
    // with a value of 128 or more.
    if (is_alpha (c) || is_digit (c)) return true;
    switch (c) {
    case '_': case ':': case '.': case '-':
      return true;
    default:
      return ((int) ((unsigned char) c)) >= 128;
    } }

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 444 of file parsexml.cpp.

                        {
  // Tokenises the whole input s into the flat node list a. Markup is
  // dispatched on its opening characters to the matching parse_* routine;
  // character data between markup is collected in r and appended to a as
  // one text node.
  string r;
  while (s) {
    if (s[0] == '<') {
      // flush the text collected so far before handling the markup
      if (N(r) != 0) { a << tree (r); }
      // longer prefixes are tested before "<!", which catches the rest
      if (test (s, "</")) a << parse_closing ();
      else if (test (s, "<?")) a << parse_pi ();
      else if (test (s, "<!--")) a << parse_comment ();
      else if (test (s, "<![CDATA[")) a << parse_cdata ();
      else if (test (s, "<!DOCTYPE")) a << parse_doctype ();
      else if (test (s, "<!")) a << parse_misc ();
      else a << parse_opening ();
      r= "";
    }
    else if (s[0] == '&') r << parse_entity ();
    else r << s->read (1);
  }
  // trailing text after the last piece of markup
  if (N(r) != 0) a << tree (r);
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 751 of file parsexml.cpp.

                                 {
  // Complete pipeline for one input text s2: normalise line endings,
  // transcode (HTML mode only), tokenise into a with parse (), nest the
  // tokens with build and convert the result with finalize_sxml.
  // end of line handling
  string s3;
  i= 0, n= N(s2);
  bool is_cr= false;
  while (i<n) {
    bool prev_is_cr= is_cr;
    is_cr= false;
    char c= s2[i];
    // '\15' is CR and '\12' is LF: a CR is written as LF, and an LF that
    // directly follows a CR is dropped
    if (c == '\15') {
      s3 << '\12';
      is_cr= true;
    }
    else if (prev_is_cr && (c == '\12')) /* no-op */;
    else s3 << c;
    i++;
  }
  s2= s3;

  // cout << "Transcoding " << s2 << "\n";
  if (html) s2= transcode (s2);
  // cout << HRULE << LF;
  s= parse_string (s2);
  //cout << "Parsing " << s << "\n";
  parse ();
  // cout << HRULE << LF;
  // cout << "a= " << a << "\n";
  // restart at the first token with nothing but the sentinel on the stack
  i= 0; n= N(a); stack= tuple ("<bottom>");
  tree r= tuple ("tag", "<document>");
  build (r);
  // cout << HRULE << LF;
  // print_tree (r);
  r= finalize_sxml (r);
  // cout << HRULE << LF;
  // print_tree (r);
  return r;
}

Here is the call graph for this function:

Definition at line 495 of file parsexml.cpp.

                                {
  // Skips nine characters (the length of "<!ATTLIST") and keeps everything
  // up to the next '>' as the body of the declaration.
  s += 9;
  string body= parse_until (">");
  return tuple ("attlist", body);
}

Here is the call graph for this function:

Definition at line 359 of file parsexml.cpp.

                                  {
  // Parses one attribute of an opening tag and returns
  //   ("attr", name, value)  when a value was read,
  //   ("attr", name)         for an attribute without a value,
  //   ("attr")               when nothing attribute-like could be read.
  string attr= parse_name (), val;
  bool no_val= false;
  skip_space ();
  if (test (s, "=")) s += 1;
  else if ((attr != "") && !test (s, "\42") && !test (s, "'")) {
    // A name that is not followed by '=' (nor, leniently, by a quoted
    // string) is a valueless attribute such as "disabled". Whatever comes
    // next belongs to the following attribute and must not be swallowed
    // as the value of this one.
    return tuple ("attr", attr);
  }
  skip_space ();
  if (test (s, "\42") || test (s, "'"))
    val= parse_quoted ();
  else { // for Html
    // unquoted value: runs up to the next whitespace, '<' or '>'
    string r;
    while (s) {
      if (is_space (s[0]) || (s[0]=='<') || (s[0]=='>')) break;
      r << s->read (1);
    }
    val   = r;
    no_val= N(r) == 0;
  }
  if (!no_val) return tuple ("attr", attr, val);
  else if (attr != "") return tuple ("attr", attr);
  else return tuple ("attr");
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 421 of file parsexml.cpp.

                              {
  // Skips nine characters (the length of "<![CDATA[") and keeps everything
  // up to the closing "]]>".
  s += 9;
  string body= parse_until ("]]>");
  return tuple ("cdata", body);
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 399 of file parsexml.cpp.

                                {
  // Skips two characters (the length of "</"), reads the element name and
  // discards whatever else precedes the closing '>'.
  s += 2;
  string closed= parse_name ();
  string ignored= parse_until (">");
  (void) ignored;
  return tuple ("end", closed);
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 415 of file parsexml.cpp.

                                {
  // Skips four characters (the length of "<!--") and keeps everything up
  // to the closing "-->".
  s += 4;
  string text= parse_until ("-->");
  return tuple ("comment", text);
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 540 of file parsexml.cpp.

                                {
  // Parses a "<!DOCTYPE ...>" declaration and returns
  //   ("doctype", name, [external id], declarations...)
  // Entity declarations found in the internal subset are not stored in the
  // result; parse_entity_decl records them in the entities map instead.
  s += 9; // length of "<!DOCTYPE"
  tree dt= tuple ("doctype");
  skip_space ();
  dt << parse_name ();
  skip_space ();
  if (test (s, "SYSTEM")) dt << parse_system ();
  else if (test (s, "PUBLIC")) dt << parse_public ();
  skip_space ();

  if (test (s, "[")) {
    // internal subset, terminated by ']'
    s += 1;
    while (s) {
      skip_space ();
      if (test (s, "]")) { s += 1; break; }
      else if (test (s, "<!ELEMENT")) dt << parse_element ();
      // attribute lists end at '>'; parse_cdata would scan on to "]]>"
      else if (test (s, "<!ATTLIST")) dt << parse_attlist ();
      else if (test (s, "<!ENTITY")) parse_entity_decl ();
      // notations belong to the doctype like every other declaration here,
      // not to the node list a of the document
      else if (test (s, "<!NOTATION")) dt << parse_notation ();
      else if (test (s, "<?")) dt << parse_pi ();
      else if (test (s, "<!--")) dt << parse_comment ();
      else if (s[0] == '&' || s[0] == '%') (void) parse_entity ();
      // anything unrecognised is skipped one character at a time
      else s += 1;
    }
  }

  skip_space ();
  if (test (s, ">")) s += 1;
  return dt;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 489 of file parsexml.cpp.

                                {
  // Skips nine characters (the length of "<!ELEMENT") and keeps everything
  // up to the next '>' as the body of the declaration.
  s += 9;
  string body= parse_until (">");
  return tuple ("element", body);
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 327 of file parsexml.cpp.

                               {
  // Reads one reference from the input, starting with the current character
  // (the callers shown here invoke it on '&' or '%'), and expands it with
  // expand_entity.
  string r= s->read (1);
  if (test (s, "#")) {
    // numeric reference: "#" followed by decimal digits, or "#x"/"#X"
    // followed by hexadecimal digits
    r << s->read (1);
    if (test (s, "x") || test (s, "X")) {
      r << s->read (1);
      while (s && is_hex_digit (s[0])) r << s->read (1);
    }
    else while (s && is_digit (s[0])) r << s->read (1);
  }
  else while (s && is_name_char (s[0])) r << s->read (1);
  // the terminating ';' is optional
  if (test (s, ";")) r << s->read (1);
  string x= expand_entity (r);
  // unresolved references, and "&lt;" / "&amp;", are returned as text
  if (x == r || r == "&lt;" || r == "&amp;") return x;
  // any other expansion is handed back to the parse_string (presumably so
  // that it is parsed again as input) and contributes no text here
  s->write (x);
  return "";
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 501 of file parsexml.cpp.

                                    {
  // Parses a "<!ENTITY ...>" declaration. Internal entities are stored in
  // the entities map under the key "&name;" (or "%name;" for parameter
  // entities); external entities are parsed and ignored.
  s += 8; // length of "<!ENTITY"
  skip_space ();
  bool parameter= test (s, "%");
  if (parameter) { s += 1; skip_space (); }
  string name= parse_name ();
  if (parameter) name= "%" * name * ";";
  else name= "&" * name * ";";
  skip_space ();

  if (test (s, "SYSTEM") || test (s, "PUBLIC")) {
    // TODO: allow for loading of external entities using wget
    if (test (s, "SYSTEM")) (void) parse_system ();
    else (void) parse_public ();
    skip_space ();
    if (test (s, "NDATA")) {
      s += 5;
      skip_space ();
      (void) parse_name ();
    }
  }
  else {
    // the replacement text is expanded once more at declaration time
    string val= parse_quoted ();
    val= expand_entities (val);
    entities (name) = val;
    // cout << name << " := " << val << "\n";
  }

  skip_space ();
  if (test (s, ">")) s += 1;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 427 of file parsexml.cpp.

                             {
  // Parses a "<!...>" construct that no other routine claims and returns
  // ("misc", token...), where the tokens are the whitespace-separated
  // pieces found before the closing '>'.
  s += 2; // length of "<!"
  tree t= tuple ("misc");
  while (true) {
    skip_space ();
    // input exhausted before '>' was seen: stop, otherwise the loop would
    // append empty tokens forever
    if (!s) break;
    if (test (s, ">")) { s += 1; break; }
    string r;
    while (s) {
      if (is_space (s[0]) || (s[0] == '>')) break;
      r << s->read (1);
    }
    t << r;
  }
  return t;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 272 of file parsexml.cpp.

                             {
  // Reads the longest run of name characters from the input. In HTML mode
  // the name is lower-cased; otherwise entity references inside it are
  // expanded.
  string name;
  for (; s && is_name_char (s[0]); ) name << s->read (1);
  return html ? locase_all (name) : expand_entities (name);
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 534 of file parsexml.cpp.

                                 {
  // Skips ten characters (the length of "<!NOTATION") and keeps everything
  // up to the next '>' as the body of the declaration.
  s += 10;
  string body= parse_until (">");
  return tuple ("notation", body);
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 382 of file parsexml.cpp.

                                {
  // Parses an opening tag and returns ("begin", name, attribute...). A
  // self-closing tag ("/>") is returned as ("tag", name, attribute...).
  s += 1; // skip the '<'
  string name= parse_name ();
  tree t= tuple ("begin", name);
  while (true) {
    skip_space ();
    if (!s || s[0] == '>' || test (s, "/>")) break;
    tree attr= parse_attribute ();
    // an empty ("attr") tuple means that no attribute could be read
    if (attr == tuple ("attr")) break;
    t << attr;
  }
  if (test (s, "/>")) { t[0]= "tag"; s += 2; }
  else if (test (s, ">")) s += 1;
  return t;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 407 of file parsexml.cpp.

                           {
  // Skips two characters (the length of "<?"), reads the target name and
  // keeps the rest up to the closing "?>" as the instruction data.
  s += 2;
  string target= parse_name ();
  skip_space ();
  string data= parse_until ("?>");
  return tuple ("pi", target, data);
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 478 of file parsexml.cpp.

                               {
  // Skips six characters (the length of "PUBLIC") and reads the two quoted
  // strings that follow it.
  s += 6;
  skip_space ();
  string first_id= parse_quoted ();
  skip_space ();
  string second_id= parse_quoted ();
  return tuple ("public", first_id, second_id);
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 346 of file parsexml.cpp.

                               {
  // Reads a string delimited by double quotes ("\42") or single quotes and
  // returns its contents; returns "" without consuming anything when the
  // input does not start with a quote.
  const bool double_quoted= test (s, "\42");
  if (!double_quoted && !test (s, "'")) return "";
  s += 1;
  return parse_until (double_quoted ? "\42" : "'");
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 469 of file parsexml.cpp.

                               {
  // Skips six characters (the length of "SYSTEM") and reads the quoted
  // string that follows it.
  s += 6;
  skip_space ();
  string system_id= parse_quoted ();
  return tuple ("system", system_id);
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 264 of file parsexml.cpp.

                                         {
  // Collects input up to (but not including) the next occurrence of what,
  // consumes that terminator when it is present, and returns the collected
  // text with its entity references expanded.
  string raw;
  for (;;) {
    if (!s) break;
    if (test (s, what)) { s += N(what); break; }
    raw << s->read (1);
  }
  return expand_entities (raw);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void xml_html_parser::skip_space ( ) [inline]

Definition at line 47 of file parsexml.cpp.

                            {
    // Drops whitespace from the front of the remaining input.
    while (true) {
      if (!s || !is_space (s[0])) return;
      s += 1;
    } }

Here is the caller graph for this function:

Definition at line 202 of file parsexml.cpp.

                                     {
  // Returns s2 converted to UTF-8. The source encoding is taken from the
  // encoding attribute of a leading "<?xml ...?>" prolog when there is one;
  // otherwise the text is kept if it checks out as UTF-8 and is converted
  // from ISO-8859-1 if not. When a conversion yields an empty string, s2 is
  // returned unchanged.
  // the member s is used here only to scan the prolog
  s= parse_string (s2);

  string encoding;
  if (test (s, "<?")) {
    s += 2;
    string target= parse_name ();
    skip_space ();
    if (target == "xml") {
      // since html==true implies we can accept horribly broken HTML, the
      // presence of an XML prolog is not enough to clear the flag.
      /* html= false; */
      // walk over the name="value" pairs until the encoding is found
      while (s && !test (s, "?>")) {
       string attname= parse_name ();
       skip_space ();
       if (!test (s, "=")) break;
       s += 1;
       skip_space ();
       string val;
       if (test (s, "\"")) {
         s += 1;
         val= parse_until ("\"");
         skip_space ();       
       }
       else if (test (s, "'")) {
         s += 1;
         val= parse_until ("'");
         skip_space ();
       }
       if (attname == "encoding") {
         encoding= upcase_all (val);
         break;
       }
      }
    }
  }

  if (N(encoding) != 0) {
    // cout << "encoding was specified\n" ;
    string s3= convert (s2, encoding, "UTF-8");
    if (N(s3) == 0)
      /* conversion from specified charset failed, do nothing (and pray) */ ;
    else return s3;
  }
  else {
    // cout << "guess encoding\n" ;
    if (check_encoding (s2, "UTF-8"))
      /* input encoding seems to be utf-8, do nothing */ ;
    else {
      string s3= convert (s2, "ISO-8859-1", "UTF-8");
      if (N(s3) != 0) return s3;
    }
  }

  return s2;
}

Here is the call graph for this function:

Here is the caller graph for this function:


Member Data Documentation

Definition at line 42 of file parsexml.cpp.

Definition at line 41 of file parsexml.cpp.

Definition at line 39 of file parsexml.cpp.

Definition at line 43 of file parsexml.cpp.

Definition at line 43 of file parsexml.cpp.

Definition at line 40 of file parsexml.cpp.

Definition at line 44 of file parsexml.cpp.


The documentation for this struct was generated from the following file: