Back to index

texmacs  1.0.7.15
parsetex.cpp
Go to the documentation of this file.
00001 
00002 /******************************************************************************
00003 * MODULE     : parsetex.cpp
00004 * DESCRIPTION: conversion of tex/latex strings into logical tex/latex trees
00005 * COPYRIGHT  : (C) 1999  Joris van der Hoeven
00006 *******************************************************************************
00007 * This software falls under the GNU general public license version 3 or later.
00008 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
00009 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
00010 ******************************************************************************/
00011 
00012 #include "Tex/convert_tex.hpp"
00013 #include "converter.hpp"
00014 
// Converts a tree into a plain string; only declared here, the definition
// is not part of this file.
00015 string string_arg (tree t);
// Defined elsewhere. In this file it silences latex_error, enables
// can_parse_length and enables the bare length assignments in parse_command.
00016 extern bool textm_class_flag;
// Per-command counter maintained by parse_command around the expansion of
// user defined macros; expansion is skipped once the counter exceeds 5.
00017 hashmap<string,int> textm_recursion_level (0);
00018 
00019 /******************************************************************************
00020 * The latex_parser structure
00021 *******************************************************************************
00022 *
00023 * During the parsing, the following global variables are used:
00024 *
00025 *     command_type   Contains the types of all currently defined tex commands.
00026 *                    This is either 'command' 'modifier' 'operator'
00027 *                    'environment' 'list' 'symbol' 'big-symbol' or 'user'.
00028 *     command_arity  Contains the corresponding arity.
00029 *     command_def    Contains the definitions of user commands.
00030 *
00031 * The command_type hashmap also contains some special fields
00032 *
00033 *     <sub>         Stands for the subscript command
00034 *     <sup>         Stands for the superscript command
00035 *
00036 *     !mode          Gives the current mode ("text" or "math").
00037 *     !verbatim      Verbatim mode ("true" or "false")
00038 *     !em            Emphasized mode ("true" or "false")
00039 *
00040 *******************************************************************************
00041 * WARNING: we recently put the standard LaTeX macros in latex_type and
00042 * latex_arity instead of command_type and command_arity.
00043 ******************************************************************************/
00044 
// Recursive descent parser turning a tex/latex string into a logical tree.
// All parse_* routines take the input string and a position i which they
// advance past the text they consumed.
00045 struct latex_parser {
  // nesting depth of parse (string, int&, ...): incremented on entry and
  // decremented on exit
00046   int level;
  // when true, bytes >= 128 are decoded as UTF-8 by parse
00047   bool unicode;
00048   latex_parser (bool unicode2): level (0), unicode (unicode2) {}
  // prints message and an excerpt of s around i on cerr
00049   void latex_error (string s, int i, string message);
00050 
  // main routine; stop selects the terminator and ch decides whether the
  // definitions made inside are merged into the enclosing scope
00051   tree parse             (string s, int& i, string stop= "", bool ch= false);
00052   tree parse_backslash   (string s, int& i);
00053   tree parse_symbol      (string s, int& i);
00054   tree parse_command     (string s, int& i, string which);
00055   tree parse_argument    (string s, int& i);
00056   tree parse_unknown     (string s, int& i, string which);
  // note: takes i by value, so it only looks ahead
00057   bool can_parse_length  (string s, int i);
00058   tree parse_length      (string s, int& i);
00059   tree parse_length_name (string s, int& i);
00060   tree parse_verbatim    (string s, int& i, string end);
00061 
  // NOTE(review): the definition of this overload is outside the visible
  // part of the file
00062   tree parse             (string s, bool change);
00063 };
00064 
00065 /******************************************************************************
00066 * Error handling
00067 ******************************************************************************/
00068 
00069 void
00070 latex_parser::latex_error (string s, int i, string message) {
00071   if (!textm_class_flag) {
00072     cerr << "Latex error] " << message << "\n";
00073     if (i>30) s= "..." * s (i-27, N(s));
00074     if (N(s)>60) s= s (0, 57) * "...";
00075     cerr << "Latex error] in " << s << "\n";
00076   }
00077 }
00078 
00079 /******************************************************************************
00080 * Main parsing routine
00081 ******************************************************************************/
00082 
00083 static bool
00084 is_regular (tree t) {
00085   if (!is_tuple (t)) return true;
00086   if (N(t) == 0 || !is_atomic (t[0])) return false;
00087   string s= t[0]->label;
00088   return !starts (s, "\\begin-") && !starts (s, "\\end-");
00089 }
00090 
00091 static bool
00092 is_tex_alpha (char c) {
00093   return is_alpha (c) || c == '@';
00094 }
00095 
00096 static bool
00097 is_tex_alpha (string s) {
00098   for (int i=0; i<N(s); i++)
00099     if (!is_alpha (s[i]) && s[i] != '@') return false;
00100   return true;
00101 }
00102 
00103 tree
00104 latex_parser::parse (string s, int& i, string stop, bool change) {
00105   bool no_error= true;
00106   int n= N(s);
00107   tree t (CONCAT);
00108 
00109   level++;
00110   command_type ->extend ();
00111   command_arity->extend ();
00112   command_def  ->extend ();
00113 
00114   while ((i<n) && is_space (s[i])) i++;
00115   while ((i<n) && no_error &&
00116         (s[i] != '\0' || N (stop) != 0) &&
00117         (N(stop) != 1 || s[i] != stop[0]) &&
00118         (s[i] != '$' || stop != "$$" || i+1>=n || s[i+1] != '$') &&
00119         (stop != "denom" ||
00120          (s[i] != '$' && s[i] != '}' &&
00121           (i+2>n || s(i,i+2) != "\\]") &&
00122           (i+4>n || s(i,i+4) != "\\end")))) {
00123     switch (s[i]) {
00124     case '~':
00125       t << tuple ("\\nbsp");
00126       i++;
00127       break;
00128     case ' ':
00129     case '\t':
00130     case '\r':
00131       while ((i<n) && ((s[i]==' ') || (s[i]=='\t') || (s[i]=='\r'))) i++;
00132       if ((i<n) && (s[i]!='\n')) t << " ";
00133       break;
00134     case '\n': {
00135       int ln=0;
00136       while ((i<n) && is_space (s[i]))
00137        if (s[i++]=='\n') ln++;
00138       if (i<n) {
00139        if (ln == 1) t << " ";
00140        else t << "\n";
00141       }
00142       break;
00143     }
00144     case '%': {
00145       while ((i<n) && (s[i]!='\n')) i++;
00146       if (i<n) i++;
00147       int ln=0;
00148       while ((i<n) && is_space (s[i]))
00149        if (s[i++]=='\n') ln++;
00150       if (ln > 0) {
00151        if ((N(t)>0) && ((t[N(t)-1]==" ") || (t[N(t)-1]=="\n")))
00152          t[N(t)-1]= "\n";
00153        else t << "\n";
00154       }
00155       break;
00156     }
00157     case '#':
00158       i++;
00159       if (i==n) return t;
00160       if (is_numeric (s[i])) {
00161        t << s (i-1, i+1);
00162        i++;
00163       }
00164       else t << s (i-1, i);
00165       break;
00166     case '\\':
00167       if (((i+7)<n) && !is_tex_alpha (s (i+5, i+7)) &&
00168          (s (i, i+5) == "\\over" || s (i, i+5) == "\\atop"))
00169        {
00170          string fr_cmd= s(i,i+5);
00171          if (fr_cmd == "\\over") fr_cmd= "\\frac";
00172          if (fr_cmd == "\\atop") fr_cmd= "\\ontop";
00173          int j;
00174          for (j=N(t); j>0 && is_regular (t[j-1]); j--) {}
00175          tree num= t (j, N(t));
00176          if (N(num) == 0) num= "";
00177          t= t (0, j);
00178          i+=5;
00179          while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
00180          tree den= parse (s, i, "denom");
00181          t << tree (TUPLE, fr_cmd, num, den);
00182        }
00183       else if ((i+5) < n && s(i,i+3) == "\\sp" && !is_tex_alpha (s[i+3])) {
00184        i+=3;
00185        t << parse_command (s, i, "\\<sup>");
00186       }
00187       else if ((i+5) < n && s(i,i+3) == "\\sb" && !is_tex_alpha (s[i+3])) {
00188        i+=3;
00189        t << parse_command (s, i, "\\<sub>");
00190       }
00191       else if ((i+10) < n && s(i,i+8) == "\\pmatrix") {
00192        i+=8;
00193        tree arg= parse_command (s, i, "\\pmatrix");
00194        if (is_tuple (arg, "\\pmatrix", 1)) arg= arg[1];
00195        t << tree (TUPLE, "\\begin-pmatrix");
00196        if (is_concat (arg)) t << A (arg);
00197        else t << arg;
00198        t << tree (TUPLE, "\\end-pmatrix");
00199       }
00200       else if (can_parse_length (s, i))
00201        t << parse_length (s, i);
00202       else {
00203        tree u= parse_backslash (s, i);
00204        if (u != "") t << u;
00205       }
00206       break;
00207     case '\'':
00208       i++;
00209       if (command_type ["!mode"] == "math") {
00210        int start= i-1;
00211        while ((i < N(s)) && (s[i] == '\'')) i++;
00212        t << tuple ("\\prime", s (start, i));
00213       }
00214       else t << s (i-1, i);
00215       break;
00216     case '*':
00217       if (command_type ["!mode"] == "math") t << tree (TUPLE, "\\ast");
00218       else t << "*";
00219       i++;
00220       break;
00221     case '_':
00222       i++;
00223       t << parse_command (s, i, "\\<sub>");
00224       /*
00225       if (command_type ["!mode"] == "math")
00226        t << parse_command (s, i, "\<sub>");
00227       else t << s (i-1, i);
00228       */
00229       break;
00230     case '^':
00231       i++;
00232       t << parse_command (s, i, "\\<sup>");
00233       /*
00234       if (command_type ["!mode"] == "math")
00235        t << parse_command (s, i, "\<sup>");
00236       else t << s (i-1, i);
00237       */
00238       break;
00239     case '<':
00240       t << tree (TUPLE, "\\<less>");
00241       i++;
00242       break;
00243     case '>':
00244       t << tree (TUPLE, "\\<gtr>");
00245       i++;
00246       break;
00247     case '\244':
00248       i++;
00249       t << parse_verbatim (s, i, "\244");
00250       break;
00251     case '{': {
00252       i++;
00253       t << parse (s, i, "}");
00254       if ((i<n) && (s[i]=='}')) i++;
00255 
00256       int ln=0;
00257       if ((i<n) && (!is_space (s[i]))) break;
00258       while ((i<n) && is_space (s[i]))
00259        if (s[i++]=='\n') ln++;
00260       if (ln >= 2) t << "\n";
00261       else if (i<n) t << tree (TUPLE, "\\ ");
00262       break;
00263     }
00264     case '$': {
00265       i++;
00266       if ((i<n) & (s[i]=='$')) {
00267        i++;
00268        t << tree (TUPLE, "\\begin-displaymath");
00269        command_type ("!mode")= "math";
00270        t << parse (s, i, "$$");
00271        command_type ("!mode")= "text";
00272        if ((i<n) && (s[i]=='$')) i++;
00273        if ((i<n) && (s[i]=='$')) i++;
00274        t << tree (TUPLE, "\\end-displaymath");
00275       }
00276       else {
00277        t << tree (TUPLE, "\\begin-math");
00278        command_type ("!mode")= "math";
00279        t << parse (s, i, "$");
00280        command_type ("!mode")= "text";
00281        if ((i<n) && (s[i]=='$')) i++;
00282        t << tree (TUPLE, "\\end-math");
00283       }
00284       break;
00285     }
00286     default:
00287       if ((s[i] == '-' || (s[i] >= '0' && s[i] <= '9')) &&
00288          can_parse_length (s, i))
00289        t << parse_length (s, i);
00290       else if (unicode && ((unsigned char) s[i]) >= 128) {
00291        unsigned int code= decode_from_utf8 (s, i);
00292        t << tree (TUPLE, "\\#" * as_hexadecimal (code));
00293       }
00294       else if (!unicode && is_iso_alpha (s[i])) {
00295        // If we encounter too much text in math mode, then return
00296        int start= i;
00297        while ((i<n) && is_iso_alpha (s[i])) i++;
00298        int end= i;
00299        if ((i >= start+3) && (command_type ["!mode"] == "math")) {
00300          while ((i<n) && (is_iso_alpha (s[i]) ||
00301                         is_punctuation (s[i]) ||
00302                         is_space (s[i])))
00303            i++;
00304          if (i >= start+20) {
00305            int last= i, words= 0, letters= 0;
00306            for (i=start; i<last; i++) {
00307              if (is_iso_alpha (s[i])) {
00308               letters++;
00309               if ((i==start) || (!is_iso_alpha (s[i-1]))) words++;
00310              }
00311            }
00312            if ((words > 3) && (letters/words >= 3) && (letters >= 15)) {
00313              i= start;
00314              no_error= false;
00315            }
00316          }
00317        }
00318        if (no_error)
00319          for (i=start; i<end; i++)
00320            t << s(i, i+1);
00321       }
00322       else {
00323        t << s (i, i+1);
00324        i++;
00325       }
00326       break;
00327     }
00328   }
00329 
00330   level--;
00331   if (change) {
00332     command_type ->merge ();
00333     command_arity->merge ();
00334     command_def  ->merge ();
00335   }
00336   else {
00337     command_type ->shorten ();
00338     command_arity->shorten ();
00339     command_def  ->shorten ();
00340   }
00341 
00342   if (N(t)==0) return "";
00343   if (N(t)==1) return t[0];
00344   return t;
00345 }
00346 
00347 /******************************************************************************
00348 * Parsing commands
00349 ******************************************************************************/
00350 
00351 tree
00352 latex_parser::parse_backslash (string s, int& i) {
00353   int n= N(s);
00354   if (((i+7)<n) && (s(i,i+5)=="\\verb")) {
00355     i+=6;
00356     return parse_verbatim (s, i, s(i-1,i));
00357   }
00358   if (((i+29)<n) && (s(i,i+16)=="\\begin{verbatim}")) {
00359     i+=16;
00360     return parse_verbatim (s, i, "\\end{verbatim}");
00361   }
00362   if (((i+5)<n) && (s(i,i+4)=="\\url") && !is_tex_alpha (s[i+5])) {
00363     i+=4;
00364     while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
00365     string ss;
00366     if (i<n && s[i] == '{') {
00367       i++;
00368       int start= i;
00369       while ((i<n) && s[i] != '}') i++;
00370       ss= s (start, i++);
00371     }
00372     return tree (TUPLE, "\\url", ss);
00373   }
00374   if (((i+6)<n) && (s(i,i+5)=="\\href")) {
00375     i+=5;
00376     while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
00377     string ss;
00378     if (i<n && s[i] == '{') {
00379       i++;
00380       int start= i;
00381       while ((i<n) && s[i] != '}') i++;
00382       ss= s (start, i++);
00383     }
00384     tree u= "";
00385     while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
00386     if (i<n && s[i] == '{') { i++; u= parse (s, i, "}"); i++; }
00387     return tree (TUPLE, "\\href", ss, u);
00388   }
00389 
00390   /************************ special commands *********************************/
00391   i++;
00392   if (i==n) return "";
00393   if (s[i]==' ') {
00394     i++;
00395     return tree (TUPLE, "\\ ");
00396   }
00397   if (!is_tex_alpha(s[i])) {
00398     i++;
00399     if (s[i-1]=='(') return parse_command (s, i, "\\begin-math");
00400     if (s[i-1]==')') return parse_command (s, i, "\\end-math");
00401     if (s[i-1]=='[') return parse_command (s, i, "\\begin-displaymath");
00402     if (s[i-1]==']') return parse_command (s, i, "\\end-displaymath");
00403     return parse_command (s, i, s (i-2, i));
00404   }
00405 
00406   /************************* normal commands *********************************/
00407   int start= i-1;
00408   while ((i<n) && is_tex_alpha (s[i])) i++;
00409   if ((i<n) && (s[i]=='*') && latex_type (s (start, i+1)) != "undefined") i++;
00410   string r= s (start, i);
00411   if ((r == "\\begin") || (r == "\\end")) {
00412     while ((i<n) && is_space (s[i])) i++;
00413     if ((i==n) || (s[i]!='{')) {
00414       latex_error (s, i, "begin or end which environment ?");
00415       return "";
00416     }
00417     i++; start= i;
00418     while ((i<n) && (s[i]!='}')) i++;
00419     r = r * "-" * s (start, i);
00420     if (i<n) i++;
00421   }
00422   return parse_command (s, i, r);
00423 }
00424 
00425 static string
00426 sharp_to_arg (string s, tree args) {
00427   int i;
00428   string r;
00429   for (i=0; i<N(s); i++)
00430     if ((s[i]=='#') && ((i+1)<N(s)) && (s[i+1]>='1') && (s[i+1]<='9')) {
00431       int nr= ((int) s[++i]) - ((int) '0');
00432       if (N(args)>nr) r << string_arg (args[nr]);
00433     }
00434     else r << s[i];
00435   return r;
00436 }
00437 
00438 tree
00439 latex_parser::parse_symbol (string s, int& i) {
00440   int start= i;
00441   if ((s[i] == '*') && (command_type ["!mode"] == "math")) {
00442     i++; return tree (TUPLE, "\\ast"); }
00443   if (s[i] == '<') { i++; return tree (TUPLE, "\\<less>"); }
00444   if (s[i] == '>') { i++; return tree (TUPLE, "\\<gtr>"); }
00445   if (s[i] != '\\') { i++; return s(start, i); }
00446   i++;
00447   if (i == N(s)) return tree (TUPLE, "\\backslash");
00448   if (!is_tex_alpha (s[i])) { i++; return s(start, i); }
00449   while ((i<N(s)) && is_tex_alpha (s[i])) i++;
00450   if ((i<N(s)) && (s[i]=='*')) i++;
00451   return s(start,i);
00452 }
00453 
00454 static bool
00455 is_math_environment (tree t) {
00456   //cout << "t= " << t << "\n";
00457   tree b= t[N(t)-2];
00458   tree e= t[N(t)-1];
00459   if (!is_concat (b)) b= tree (CONCAT, b);
00460   if (!is_concat (e)) e= tree (CONCAT, e);
00461   int i, j;
00462   for (i=N(b)-1; i>=0; i--)
00463     if (is_tuple (b[i]) && N(b[i])>0 && is_atomic (b[i][0]))
00464       if (latex_type (b[i][0]->label) == "math-environment")
00465        break;
00466   for (j=0; j<N(e); j++)
00467     if (is_tuple (e[j]) && N(e[j])>0 && is_atomic (e[j][0]))
00468       if (latex_type (e[j][0]->label) == "math-environment")
00469        break;
00470   if (i >= 0 && j < N(e)) {
00471     string bs= b[i][0]->label;
00472     string es= e[j][0]->label;
00473     bool ok=
00474       starts (bs, "\\begin-") &&
00475       starts (es, "\\end-") &&
00476       bs (7, N(bs)) == es (5, N(es));
00477     //cout << t[1] << " -> " << ok << "\n";
00478     return ok;
00479   }
00480   return false;
00481 }
00482 
00483 static bool
00484 is_text_argument (string cmd, int remaining_arity) {
00485   // FIXME: this test should be improved using DRD properties
00486   (void) remaining_arity;
00487   return cmd == "\\label" || cmd == "\\ref";
00488 }
00489 
// Parse the arguments of the command cmd, whose name has already been
// consumed; i points right behind the name and is advanced past the
// arguments. Returns a tuple whose first child is the command name
// (with "*" appended when an optional argument was read) followed by the
// parsed arguments. Definitions (\def, \newenvironment, \newtheorem,
// \newlength, ...) are registered in command_type, command_arity and
// command_def as a side effect, and the "!mode" entry of command_type is
// updated for math environments.
00490 tree
00491 latex_parser::parse_command (string s, int& i, string cmd) {
00492   //cout << cmd << " [" << latex_type (cmd) << ", "
00493   //<< command_type ["!mode"] << ", " << latex_arity (cmd) << "]" << LF;
  // several commands are handled under the name of an equivalent one
00494   if (cmd == "\\newcommand") cmd= "\\def";
00495   if (cmd == "\\renewcommand") cmd= "\\def";
00496   if (cmd == "\\renewenvironment") cmd= "\\newenvironment";
00497   if (cmd == "\\begin-split") cmd= "\\begin-eqsplit";
00498   if (cmd == "\\end-split") cmd= "\\end-eqsplit";
00499   if (cmd == "\\begin-split*") cmd= "\\begin-eqsplit*";
00500   if (cmd == "\\end-split*") cmd= "\\end-eqsplit*";
00501 
00502   if (latex_type (cmd) == "undefined")
00503     return parse_unknown (s, i, cmd);
00504 
  // entering a math environment switches to math mode, leaving it to text
00505   if (latex_type (cmd) == "math-environment") {
00506     if (cmd (0, 6) == "\\begin") command_type ("!mode") = "math";
00507     else command_type ("!mode") = "text";
00508   }
00509 
  // a length register followed by a value is returned as a \setlength
00510   if (textm_class_flag && level <= 1 && latex_type (cmd) == "length") {
00511     //cout << "Parse length " << cmd << "\n";
00512     int n= N(s);
00513     while (i<n && (is_space (s[i]) || s[i] == '=')) i++;
00514     tree len= parse_length (s, i);
00515     //cout << "Got " << len << "\n";
00516     return tree (TUPLE, "\\setlength", copy (cmd), copy (len));
00517   }
00518 
00519   if (cmd == "\\setlength") {
00520     tree name= parse_length_name (s, i);
00521     tree arg = parse_argument (s, i);
00522     return tuple (cmd, name, arg);
00523   }
00524 
  // \text and \mbox inside math mode: their argument is parsed in text
  // mode; math mode is restored just before returning
00525   bool mbox_flag=
00526     ((cmd == "\\text") || (cmd == "\\mbox")) &&
00527     (command_type ["!mode"] == "math");
00528   if (mbox_flag) command_type ("!mode") = "text";
00529 
  // a negative arity a stands for -1-a mandatory arguments plus one
  // optional argument
00530   int  n     = N(s);
00531   int  arity = latex_arity (cmd);
00532   bool option= (arity<0);
00533   if (option) arity= -1-arity;
00534 
00535   /************************ retrieve arguments *******************************/
00536   tree t (TUPLE, copy (cmd)); // parsed arguments
00537   tree u (TUPLE, copy (cmd)); // unparsed arguments
00538   while (i<n && arity>=0 && (arity>0 || option)) {
00539     int j= i;
00540     while ((j<n) && is_space (s[j])) j++;
00541     if (j==n) break;
00542     if (option && (s[j]=='[')) {
      // optional argument; except for \newtheorem it is stored in t and
      // marks the command name with a trailing "*"
00543       j++;
00544       i=j;
00545       tree opt= parse (s, i, "]");
00546       if (cmd != "\\newtheorem" && cmd != "\\newtheorem*")
00547        t << opt;
00548       u << s (j, i);
00549       if ((i<n) && (s[i]==']')) i++;
00550       if (cmd != "\\newtheorem" && cmd != "\\newtheorem*")
00551        t[0]->label= t[0]->label * "*";
00552       option= false;
00553     }
00554     else if ((arity>0) && (s[j]=='{')) {
      // braced mandatory argument; the first argument of \def (the name of
      // the new macro) is taken literally instead of being parsed
00555       bool text_arg=
00556        (command_type["!mode"] == "math") && is_text_argument (cmd, arity);
00557       j++;
00558       i=j;
00559       if (text_arg) command_type ("!mode")= "text";
00560       if ((N(t)==1) && (cmd == "\\def")) {
00561        while ((i<n) && (s[i]!='}')) i++;
00562        t << s (j, i);
00563       }
00564       else t << parse (s, i, "}");
00565       if (text_arg) command_type ("!mode")= "math";
00566       u << s (j, i);
00567       if ((i<n) && (s[i]=='}')) i++;
00568       arity--;
00569       if (arity == 0) option= false;
00570     }
00571     else if (s[j] == '}') break;
00572     else if (option && (s[j]=='#') && (cmd == "\\def")) {
      // plain TeX parameter text such as #1#2#3: only the last digit is
      // kept and stored like an optional argument
00573       while ((j+3 <= n) && is_numeric (s[j+1]) && (s[j+2] == '#')) j+=2;
00574       if (j+2<=n) {
00575        t << s (j+1, j+2);
00576        u << s (j+1, j+2);
00577        i= j+2;
00578       }
00579       t[0]->label= t[0]->label * "*";
00580       option= false;
00581     }
00582     else {
      // undelimited argument: a single symbol or command name
00583       if (arity>0) {
00584        i=j;
00585        tree st= parse_symbol (s, i);
00586        t << st;
00587        u << st;
00588        arity--;
00589        if (arity == 0) option= false;
00590       }
00591       else break;
00592     }
00593   }
00594   if (arity>0) latex_error (s, i, "too little arguments for " * cmd);
00595 
00596   /******************** new commands and environments ************************/
  // the unparsed text in u is stored as the definition body
00597   if (is_tuple (t, "\\def", 2)) {
00598     string var= string_arg (t[1]);
00599     command_type  (var)= "user";
00600     command_arity (var)= 0;
00601     command_def   (var)= as_string (u[2]);
00602   }
00603   if (is_tuple (t, "\\def*", 3)) {
00604     string var= string_arg (t[1]);
00605     command_type  (var)= "user";
00606     command_arity (var)= as_int (t[2]);
00607     command_def   (var)= as_string (u[3]);
00608   }
00609   if (is_tuple (t, "\\newtheorem", 2) || is_tuple (t, "\\newtheorem*", 2)) {
00610     string var= "\\begin-" * string_arg (t[1]);
00611     command_type  (var)= "environment";
00612     command_arity (var)= 0;
00613     var= "\\end-" * string_arg (t[1]);
00614     command_type  (var)= "environment";
00615     command_arity (var)= 0;
00616   }
00617   if (is_tuple (t, "\\newdimen", 1) || is_tuple (t, "\\newlength", 1)) {
00618     string var= string_arg (t[1]);
00619     command_type  (var)= "length";
00620     command_arity (var)= 0;
00621   }
00622   if (is_tuple (t, "\\newenvironment", 3)) {
00623     string var= "\\begin-" * string_arg (t[1]);
00624     command_type  (var)= "user";
00625     command_arity (var)= 0;
00626     command_def   (var)= as_string (u[2]);
00627     if (is_math_environment (t)) command_type (var)= "math-environment";
00628     var= "\\end-" * string_arg (t[1]);
00629     command_type  (var)= "user";
00630     command_arity (var)= 0;
00631     command_def   (var)= as_string (u[3]);
00632     if (is_math_environment (t)) command_type (var)= "math-environment";
00633   }
00634   if (is_tuple (t, "\\newenvironment*", 4)) {
00635     string var= "\\begin-" * string_arg (t[1]);
00636     command_type  (var)= "user";
00637     command_arity (var)= as_int (t[2]);
00638     command_def   (var)= as_string (u[3]);
00639     if (is_math_environment (t)) command_type (var)= "math-environment";
00640     var= "\\end-" * string_arg (t[1]);
00641     command_type  (var)= "user";
00642     command_arity (var)= 0;
00643     command_def   (var)= as_string (u[4]);
00644     if (is_math_environment (t)) command_type (var)= "math-environment";
00645   }
00646 
00647   /***************** environment changes for user commands  ******************/
  // The body of a user macro is parsed with change=true, so that its
  // effects on the command tables persist; nested expansion of the same
  // macro is limited to 5 levels. When the body contains as many \begin
  // as \end the result of this parse is discarded, otherwise it replaces t.
00648   if (latex_type (cmd) == "user") {
00649     int pos= 0;
00650     string body= command_def[cmd];
00651     textm_recursion_level (cmd)++;
00652     if (textm_recursion_level [cmd] <= 5) {
00653       if (count_occurrences ("\\begin", body) ==
00654          count_occurrences ("\\end", body))
00655        (void) parse (sharp_to_arg (body, u), pos, "", true);
00656       else t= parse (sharp_to_arg (body, u), pos, "", true);
00657     }
00658     textm_recursion_level (cmd)--;
00659     // replaces macros by their definitions in the case when
00660     // the user defined shorthands for \\begin{env} and \\end{env}
00661   }
00662 
00663   if (mbox_flag) command_type ("!mode") = "math";
00664   return t;
00665 }
00666 
00667 tree
00668 latex_parser::parse_argument (string s, int& i) {
00669   skip_spaces (s, i);
00670   if (s[i] == '{') {
00671     i++;
00672     return parse (s, i, "}");
00673   }
00674   else parse_symbol (s, i);
00675 }
00676 
00677 tree
00678 latex_parser::parse_unknown (string s, int& i, string cmd) {
00679   int  n     = N(s);
00680   bool option= false;
00681 
00682   tree t (TUPLE, copy (cmd));
00683   while (i<n) {
00684     int j=i;
00685     while ((j<n) && is_space (s[j])) j++;
00686     if (j==n) break;
00687     if (option && (s[j]=='[')) {
00688       j++;
00689       i=j;
00690       t << parse (s, i, "]");
00691       if ((i<n) && (s[i]==']')) i++;
00692       t[0]->label= t[0]->label * "*";
00693       option= false;
00694     }
00695     else if (s[j]=='{') {
00696       j++;
00697       i=j;
00698       t << parse (s, i, "}");
00699       if ((i<n) && (s[i]=='}')) i++;
00700     }
00701     else break;
00702   }
00703   return t;
00704 }
00705 
00706 /******************************************************************************
00707 * Parsing lengths
00708 ******************************************************************************/
00709 
00710 bool
00711 latex_parser::can_parse_length (string s, int i) {
00712   if (!textm_class_flag) return false;
00713   int initial= i;
00714   int stage= 0;
00715   int n= N(s);
00716   while (i<n) {
00717     if (is_numeric (s[i]) || s[i] == '.' || s[i] == '-') { stage= 1; i++; }
00718     else if (is_space (s[i]) && stage > 0) i++;
00719     else if (read (s, i, "plus") || read (s, i, "\\@plus") ||
00720             read (s, i, "minus") || read (s, i, "\\@minus"))
00721       return stage >= 2;
00722     else if (is_tex_alpha (s[i])) {
00723       if (read (s, i, "cm")) stage= 2;
00724       else if (read (s, i, "mm")) stage= 2;
00725       else if (read (s, i, "pt")) stage= 2;
00726       else if (read (s, i, "in")) stage= 2;
00727       else if (read (s, i, "em")) stage= 2;
00728       else if (read (s, i, "pc")) stage= 2;
00729       else if (read (s, i, "bp")) stage= 2;
00730       else if (read (s, i, "dd")) stage= 2;
00731       else if (read (s, i, "cc")) stage= 2;
00732       else if (read (s, i, "sp")) stage= 2;
00733       else return false;
00734       if (i<n && is_tex_alpha (s[i])) return false;
00735     }
00736     else if (s[i] == '\\') {
00737       i++;
00738       int start= i;
00739       while (i<n && is_tex_alpha (s[i])) i++;
00740       if (latex_type (s (start, i)) != "length") return false;
00741       return s[initial] != '\\' || level > 1;
00742     }
00743     else return false;
00744   }
00745   return false;
00746 }
00747 
// Parse a length specification starting at position i. The result is a
// CONCAT of one-character strings (digits, '.', '-', unit letters), or a
// tuple ("\tex-len", base, plus part, minus part) when the keywords "plus"
// or "minus" (or "\@plus", "\@minus") occur. Parsing stops at the first
// character which cannot belong to a length.
00748 tree
00749 latex_parser::parse_length (string s, int& i) {
00750   int n= N(s);
00751   tree r= tree (CONCAT);
00752   while (i<n) {
00753     if (is_numeric (s[i]) || s[i] == '.' || s[i] == '-')
00754       r << s (i, i+1);
00755     else if (read (s, i, "plus") || read (s, i, "\\@plus")) {
      // the remainder is parsed recursively; if it already is a \tex-len
      // (because a "minus" follows), its components are merged
00756       tree next= parse_length (s, i);
00757       if (is_tuple (next, "\\tex-len", 3)) {
00758        //ASSERT (next[2] == "0pt", "invalid multiple plus");
00759        return tuple ("\\tex-len", r, next[1], next[3]);
00760       }
00761       else return tuple ("\\tex-len", r, next, "0pt");
00762     }
00763     else if (read (s, i, "minus") || read (s, i, "\\@minus")) {
00764       tree next= parse_length (s, i);
00765       if (is_tuple (next, "\\tex-len", 3)) {
00766        //ASSERT (next[3] == "0pt", "invalid multiple minus");
00767        return tuple ("\\tex-len", r, next[2], next[1]);
00768       }
00769       else return tuple ("\\tex-len", r, "0pt", next);
00770     }
    // unit letters are only accepted directly behind a digit, '.' or '-'
00771     else if (is_tex_alpha (s[i]) && N(r) > 0 && is_atomic (r[N(r)-1]) &&
00772             (is_numeric (r[N(r)-1]->label) ||
00773              r[N(r)-1] == "." || r[N(r)-1] == "-")) {
00774       for (;i<n && is_tex_alpha (s[i]); i++)
00775        r << s (i, i+1);
      // i already points behind the unit: skip the i++ at the loop end
00776       continue;
00777     }
00778     else if (s[i] == '\\') {
00779       i++;
00780       int start= i;
00781       while (i<n && is_tex_alpha (s[i])) i++;
00782       string unit= s (start, i);
      // not a length macro: rewind to the backslash and stop
00783       if (latex_type (unit) != "length") { i= start-1; break; }
      // only p@ and z@ are translated; any other length macro is consumed
      // without adding anything to r
00784       // FIXME
00785       if (unit == "p@")
00786        r << string ("p") << string ("t");
00787       else if (unit == "z@")
00788        r << string ("0") << string ("p") << string ("t");
00789       // FIXME
00790       continue;
00791     }
    // white space is skipped
00792     else if (is_space (s[i]));
00793     else break;
00794     i++;
00795   }
00796   return r;
00797 }
00798 
00799 tree
00800 latex_parser::parse_length_name (string s, int& i) {
00801   skip_spaces (s, i);
00802   if (s[i] == '{') {
00803     i++;
00804     tree r= parse_length_name (s, i);
00805     skip_spaces (s, i);
00806     if (s[i] == '}') i++;
00807     return r;
00808   }
00809   else if (s[i] == '\\') {
00810     int start= i;
00811     i++;
00812     while (i<N(s) && is_tex_alpha (s[i])) i++;
00813     return s (start, i);
00814   }
00815   else return "";
00816 }
00817 
00818 /******************************************************************************
00819 * Parsing verbatim text
00820 ******************************************************************************/
00821 
00822 tree
00823 latex_parser::parse_verbatim (string s, int& i, string end) {
00824   int start=i, n= N(s), e= N(end);
00825   while ((i<(n-e)) && (s(i,i+e)!=end)) i++;
00826   i+=e;
00827   return tree (CONCAT,
00828               tree (TUPLE, "\\begin-verbatim"),
00829               s(start,i-e),
00830               tree (TUPLE, "\\end-verbatim"));
00831 }
00832 
00833 /******************************************************************************
00834 * This routine may be used to transform accented characters to the Cork format
00835 ******************************************************************************/
00836 
// Base letter for each character code 128 .. 255: entry k belongs to code
// k+128 (accented_to_Cork returns k+128 for a match at index k). The value
// 25 is what accented_to_Cork uses for the argument "\i".
// NOTE(review): ' ' presumably marks codes without a base letter plus
// accent decomposition -- confirm against the Cork encoding table.
00837 static char Cork_unaccented[128]= {
00838   'A', 'A', 'C', 'C', 'D', 'E', 'E', 'G',
00839   'L', 'L', ' ', 'N', 'N', ' ', 'O', 'R',
00840   'R', 'S', 'S', 'S', 'T', 'T', 'U', 'U',
00841   'Y', 'Z', 'Z', 'Z', ' ', 'I', 'd', ' ',
00842   'a', 'a', 'c', 'c', 'd', 'e', 'e', 'g',
00843   'l', 'l', ' ', 'n', 'n', ' ', 'o', 'r',
00844   'r', 's', 's', 's', 't', 't', 'u', 'u',
00845   'y', 'z', 'z', 'z', ' ', ' ', ' ', ' ',
00846   'A', 'A', 'A', 'A', 'A', 'A', ' ', 'C',
00847   'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
00848   'D', 'N', 'O', 'O', 'O', 'O', 'O', ' ',
00849   ' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
00850   'a', 'a', 'a', 'a', 'a', 'a', ' ', 'c',
00851   'e', 'e', 'e', 'e', 25 , 25 , 25 , 25 ,
00852   'd', 'n', 'o', 'o', 'o', 'o', 'o', ' ',
00853   ' ', 'u', 'u', 'u', 'u', 'y', ' ', ' '
00854 };
00855 
// Accent command character for each character code 128 .. 255, parallel to
// Cork_unaccented: entry k is the second character of the accent command
// (for instance '\'' for \') which accented_to_Cork matches for code k+128.
// The trailing // " comments only balance the quotes for editors.
00856 static char Cork_accent[128]= {
00857   'u' , 'k' , '\'', 'v' , 'v' , 'v' , 'k' , 'u' ,
00858   '\'', 'v' , ' ' , '\'', 'v' , ' ' , 'H' , '\'',
00859   'v' , '\'', 'v' , 'c' , 'v' , 'c' , 'H' , 'r' ,
00860   '\"', '\'', 'v' , '.' , ' ' , '.' , '=' , ' ' , // "
00861   'u' , 'k' , '\'', 'v' , 'v' , 'v' , 'k' , 'u' ,
00862   '\'', 'v' , ' ' , '\'', 'v' , ' ' , 'H' , '\'',
00863   'v' , '\'', 'v' , 'c' , 'v' , 'c' , 'H' , 'r' ,
00864   '\"', '\'', 'v' , '.' , ' ' , ' ' , ' ' , ' ' , // "
00865   '`' , '\'', '^' , '~' , '\"', ' ' , ' ' , 'c' , // "
00866   '`' , '\'', '^' , '\"', '`' , '\'', '^' , '\"', // "
00867   '=' , '~' , '`' , '\'', '^' , '~' , '\"', ' ' , // "
00868   ' ' , '`' , '\'', '^' , '\"', '\'', ' ' , ' ' , // "
00869   '`' , '\'', '^' , '~' , '\"', ' ' , ' ' , 'c' , // "
00870   '`' , '\'', '^' , '\"', '`' , '\'', '^' , '\"', // "
00871   '=' , '~' , '`' , '\'', '^' , '~' , '\"', ' ' , // "
00872   ' ' , '`' , '\'', '^' , '\"', '\'', ' ' , ' '   // "
00873 };
00874 
00875 tree
00876 accented_to_Cork (tree t) {
00877   if (arity (t) == 0) return t;
00878   int i, n=N(t);
00879   tree r (t, n);
00880   for (i=0; i<n; i++) r[i]= accented_to_Cork (t[i]);
00881   if (is_compound (t[0])) return r;
00882 
00883   string s= t[0]->label;
00884   if ((N(s)==2) && (s[0]=='\\') && (n==2) &&
00885       is_atomic (r[1]) && (N(r[1]->label)<=2)) {
00886     string v= r[1]->label;
00887     if (N(v)==0) {
00888       if (s[1] == '`' ) {
00889        string ret_s (1);
00890        ret_s[0]= '\000';
00891        return ret_s;
00892       }
00893       if (s[1] == '\'') return "\001";
00894       if (s[1] == '^' ) return "\136";
00895       if (s[1] == '\"') return "\004"; // "
00896       if (s[1] == '~' ) return "\176";
00897       if (s[1] == '=' ) return "\026";
00898       if (s[1] == '.' ) return "\137";
00899       if (s[1] == 'u' ) return "\025";
00900       if (s[1] == 'v' ) return "\024";
00901       if (s[1] == 'H' ) return "\175";
00902       if (s[1] == 'c' ) return "\030";
00903     }
00904     else {
00905       char c1= v[0], c2= s[1];
00906       if (v == "\\i") c1= (char) 25;
00907       if ((N(v)==1) || (v=="\\i"))
00908        for (i=0; i<127; i++)
00909          if ((Cork_unaccented[i]==c1) && (Cork_accent[i]==c2))
00910            return tree (string ((char) (i+128)));
00911     }
00912   }
00913   return r;
00914 }
00915 
00916 /******************************************************************************
00917 * Interface
00918 ******************************************************************************/
00919 
// Top-level entry point of the parser: converts the LaTeX source s into a
// CONCAT tree.  The work is done in two passes: s is first cut into pieces
// at line starts that open a new logical unit, and every piece is then parsed
// on its own with the position-based parse overload.
//   s       the LaTeX source; \input{...} and \include{...} files that can be
//           loaded are spliced into it while cutting
//   change  when true the three command tables are merge()d at the end,
//           otherwise they are shorten()ed (presumably "keep" versus
//           "discard" what was defined during this parse -- confirm against
//           the hashmap implementation)
// Returns the CONCAT tree holding the parsed pieces.
00920 tree
00921 latex_parser::parse (string s, bool change) {
00922   command_type ->extend ();
00923   command_arity->extend ();
00924   command_def  ->extend ();
00925 
00926   // We first cut the string into pieces at strategic places
00927   // This reduces the risk that the parser gets confused
00928   array<string> a;
        // i scans s; start is where the piece currently being collected begins
00929   int i, start=0, n= N(s);
00930   for (i=0; i<n; i++)
          // only a newline or a "\nextbib" command can introduce a cut point
00931     if (s[i]=='\n' || (s[i] == '\\' && test (s, i, "\\nextbib"))) {
00932       while ((i<n) && is_space (s[i])) i++;
            // The text between the TeXmacs macro markers goes into no piece:
            // flush what precedes it, then jump behind the End marker.
00933       if (test (s, i, "%%%%%%%%%% Start TeXmacs macros\n")) {
00934        a << s (start, i);
00935        while ((i<n) && (!test (s, i, "%%%%%%%%%% End TeXmacs macros\n")))
00936          i++;
             // 30 is the length of the End marker line, its '\n' included.
             // NOTE(review): without an End marker i is n here and becomes
             // n+30 -- confirm that s (start, i) tolerates that below.
00937        i += 30;
00938        start= i;
00939        continue;
00940       }
            // commands that start a new piece when found at this position
00941       if (test (s, i, "\\begin{document}") ||
00942          test (s, i, "\\begin{abstract}") ||
00943          test (s, i, "\\chapter") ||
00944          test (s, i, "\\section") ||
00945          test (s, i, "\\subsection") ||
00946          test (s, i, "\\subsubsection") ||
00947          test (s, i, "\\paragraph") ||
00948          test (s, i, "\\subparagraph") ||
00949          test (s, i, "\\nextbib") ||
00950          test (s, i, "\\newcommand") ||
00951          test (s, i, "\\def") ||
00952          test (s, i, "\\input{") ||
00953          test (s, i, "\\include{"))
00954        {
00955          a << s (start, i);
00956          start= i;
00957           if (test (s, i, "\\input{") || test (s, i, "\\include{")) {
                  // extract the file name between the braces
00958            while (i<N(s) && s[i] != '{') i++;
00959            int start_name= i+1;
00960             while (i<N(s) && s[i] != '}') i++;
00961             string name= s (start_name, i);
00962             if (!ends (name, ".tex")) name= name * ".tex";
                  // the file is looked up relative to get_file_focus ()
00963             url incl= relative (get_file_focus (), name);
00964             string body;
                  // A missing file, or load_string returning true, makes the
                  // command be skipped: start ends up behind the '}'.
00965             if (!exists (incl) || load_string (incl, body, false)) i++;
00966             else {
00967               //cout << "Include " << name << " -> " << incl << "\n";
                    // Replace the command by the file contents wrapped in
                    // newlines and resume scanning inside the inserted text,
                    // so that the included source is cut into pieces as well.
00968               s= s (0, start) * "\n" * body * "\n" * s (i+1, N(s));
00969               n= N(s);
00970               i= start + 1;
00971             }
00972             start= i;
00973           }
                // each "\nextbib{}" (10 characters) ends a piece right after it
00974           while (i < n && test (s, i, "\\nextbib{}")) {
00975             i += 10;
00976             a << s (start, i);
00977             start= i;
00978           }
00979        }
            // leave before the loop increment, so the final flush sees i == n
00980       if (i == n) break;
00981     }
        // flush the last piece
00982   a << s (start, i);
00983 
00984   // We now parse each of the pieces
00985   tree t (CONCAT);
00986   for (i=0; i<N(a); i++) {
00987     int j=0;
          // a piece may need several parse calls before it is fully consumed
00988     while (j<N(a[i])) {
00989       int start= j;
            // every call starts in text mode with emphasis switched off
00990       command_type ("!mode") = "text";
00991       command_type ("!em") = "false";
00992       tree u= parse (a[i], j, "", true);
            // separate the output of consecutive pieces by a newline, unless
            // the result collected so far already ends with one
00993       if ((N(t)>0) && (t[N(t)-1]!='\n') && (start==0)) t << "\n";
            // splice the children of a CONCAT result instead of nesting it
00994       if (is_concat (u)) t << A(u);
00995       else t << u;
            // guarantee progress when the call consumed nothing
00996       if (j == start) j++;
00997     }
00998   }
00999 
01000   if (change) {
01001     command_type ->merge ();
01002     command_arity->merge ();
01003     command_def  ->merge ();
01004   }
01005   else {
01006     command_type ->shorten ();
01007     command_arity->shorten ();
01008     command_def  ->shorten ();
01009   }
01010   //cout << "Parsed " << t << "\n";
01011   return t;
01012 }
01013 
01014 static bool
01015 japanese_tex (string& s) {
01016   if (search_forwards ("\\documentclass{jarticle}", s) != -1) {
01017     s= replace (s, "\\documentclass{jarticle}", "\\documentclass{article}");
01018     s= convert (s, "ISO-2022-JP", "UTF-8");
01019     return true;
01020   }
01021   if (search_forwards ("\\documentclass{jbook}", s) != -1) {
01022     s= replace (s, "\\documentclass{jbook}", "\\documentclass{book}");
01023     s= convert (s, "ISO-2022-JP", "UTF-8");
01024     return true;
01025   }
01026   return false;
01027 }
01028 
01029 static bool
01030 korean_tex (string& s) {
01031   if (search_forwards ("\\usepackage{hangul}", s) != -1 ||
01032       search_forwards ("\\usepackage{hfont}", s) != -1 ||
01033       search_forwards ("]{hangul}", s) != -1 ||
01034       search_forwards ("]{hfont}", s) != -1)
01035     {
01036       s= replace (s, "\\usepackage{hangul}", "");
01037       s= replace (s, "\\usepackage{hfont}", "");
01038       s= convert (s, "EUC-KR", "UTF-8");
01039       return true;
01040     }
01041   if (search_forwards ("\\usepackage{dhucs}", s) != -1 ||
01042       search_forwards ("\\usepackage{memhangul-ucs}", s) != -1 ||
01043       search_forwards ("]{dhucs}", s) != -1 ||
01044       search_forwards ("]{memhangul-ucs}", s) != -1)
01045     {
01046       s= replace (s, "\\usepackage{dhucs}", "");
01047       s= replace (s, "\\usepackage{memhangul-ucs}", "");
01048       return true;
01049     }
01050   return false;
01051 }
01052 
01053 static bool
01054 chinese_tex (string& s) {
01055   if (search_forwards ("\\kaishu", s) != -1)
01056     s= replace (s, "\\kaishu", "");
01057   if (search_forwards ("\\begin{CJK}{GBK}{kai}", s) != -1)
01058     s= replace (s, "\\begin{CJK}{GBK}{kai}", "");
01059   if (search_forwards ("\\begin{CJK*}{GBK}{kai}", s) != -1)
01060     s= replace (s, "\\begin{CJK*}{GBK}{kai}", "");
01061   if (search_forwards ("\\end{CJK}", s) != -1)
01062     s= replace (s, "\\end{CJK}", "");
01063   if (search_forwards ("\\end{CJK*}", s) != -1)
01064     s= replace (s, "\\end{CJK*}", "");
01065   if (search_forwards ("\\CJKindent", s) != -1)
01066     s= replace (s, "\\CJKindent", "");
01067   if (search_forwards ("\\CJKcaption{GBk}", s) != -1)
01068     s= replace (s, "\\CJKcaption{GBK}", "");
01069   if (search_forwards ("\\usepackage{CJK}", s) != -1) {
01070     s= replace (s, "\\usepackage{CJK}", "");
01071     s= convert (s, "cp936", "UTF-8");
01072     return true;
01073   }
01074   if (search_forwards ("\\documentclass{cctart}", s) != -1) {
01075     s= replace (s, "\\documentclass{cctart}", "\\documentclass{article}");
01076     s= convert (s, "cp936", "UTF-8");
01077     return true;
01078   }
01079   if (search_forwards ("\\documentclass[CJK]{cctart}", s) != -1) {
01080     s= replace (s, "\\documentclass[CJK]{cctart}", "\\documentclass{article}");
01081     s= convert (s, "cp936", "UTF-8");
01082     return true;
01083   }
01084   return false;
01085 }
01086 
01087 static bool
01088 taiwanese_tex (string& s) {
01089   if (search_forwards ("\\usepackage{CJKvert,type1cm}", s) != -1)
01090     s= replace (s, "\\usepackage{CJKvert,type1cm}", "");
01091   if (search_forwards ("\\begin{CJK}{Bg5}{aming}", s) != -1)
01092     s= replace (s, "\\begin{CJK}{Bg5}{aming}", "");
01093   if (search_forwards ("\\begin{CJK}{Bg5}{kai}", s) != -1)
01094     s= replace (s, "\\begin{CJK}{Bg5}{kai}", "");
01095   if (search_forwards ("\\end{CJK}", s) != -1)
01096     s= replace (s, "\\end{CJK}", "");
01097   if (search_forwards ("\\CJKcaption{Bg5}", s) != -1)
01098     s= replace (s, "\\CJKcaption{Bg5}", "");
01099   if (search_forwards ("\\CJKindent", s) != -1)
01100     s= replace (s, "\\CJKindent", "");
01101   if (search_forwards ("\\usepackage{CJK}", s) != -1) {
01102     s= replace (s, "\\usepackage{CJK}", "");
01103     s= convert (s, "cp950", "UTF-8");
01104     return true;
01105   }
01106   if (search_forwards ("\\usepackage{CJK*}", s) != -1) {
01107     s= replace (s, "\\usepackage{CJK*}", "");
01108     s= convert (s, "cp950", "UTF-8");
01109     return true;
01110   }
01111   return false;
01112 }
01113 
01114 tree
01115 parse_latex (string s, bool change) {
01116   s= dos_to_better (s);
01117   string lan= "";
01118   if (japanese_tex (s)) lan= "japanese";
01119   else if (korean_tex (s)) lan= "korean";
01120   else if (taiwanese_tex (s)) lan= "taiwanese";
01121   else if (chinese_tex (s)) lan= "chinese";
01122   bool unicode= (lan == "chinese" || lan == "japanese" ||
01123                lan == "korean" || lan == "taiwanese");
01124   latex_parser ltx (unicode);
01125   tree r= accented_to_Cork (ltx.parse (s, change));
01126   if (lan == "") return r;
01127   return compound ("!language", r, lan);
01128 }
01129 
01130 tree
01131 parse_latex_document (string s, bool change) {
01132   return compound ("!file", parse_latex (s, change));
01133 }