Back to index

tetex-bin  3.0
Lexer.cc
Go to the documentation of this file.
00001 //========================================================================
00002 //
00003 // Lexer.cc
00004 //
00005 // Copyright 1996-2003 Glyph & Cog, LLC
00006 //
00007 //========================================================================
00008 
00009 #include <aconf.h>
00010 
00011 #ifdef USE_GCC_PRAGMAS
00012 #pragma implementation
00013 #endif
00014 
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include "Lexer.h"
#include "Error.h"
00021 
00022 //------------------------------------------------------------------------
00023 
00024 // A '1' in this array means the character is white space.  A '1' or
00025 // '2' means the character ends a name or command.
00026 static char specialChars[256] = {
00027   1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
00028   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
00029   1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
00030   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
00031   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
00032   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
00033   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
00034   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
00035   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
00036   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
00037   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
00038   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
00039   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
00040   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
00041   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
00042   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
00043 };
00044 
00045 //------------------------------------------------------------------------
00046 // Lexer
00047 //------------------------------------------------------------------------
00048 
00049 Lexer::Lexer(XRef *xref, Stream *str) {
00050   Object obj;
00051 
00052   curStr.initStream(str);
00053   streams = new Array(xref);
00054   streams->add(curStr.copy(&obj));
00055   strPtr = 0;
00056   freeArray = gTrue;
00057   curStr.streamReset();
00058 }
00059 
00060 Lexer::Lexer(XRef *xref, Object *obj) {
00061   Object obj2;
00062 
00063   if (obj->isStream()) {
00064     streams = new Array(xref);
00065     freeArray = gTrue;
00066     streams->add(obj->copy(&obj2));
00067   } else {
00068     streams = obj->getArray();
00069     freeArray = gFalse;
00070   }
00071   strPtr = 0;
00072   if (streams->getLength() > 0) {
00073     streams->get(strPtr, &curStr);
00074     curStr.streamReset();
00075   }
00076 }
00077 
00078 Lexer::~Lexer() {
00079   if (!curStr.isNone()) {
00080     curStr.streamClose();
00081     curStr.free();
00082   }
00083   if (freeArray) {
00084     delete streams;
00085   }
00086 }
00087 
00088 int Lexer::getChar() {
00089   int c;
00090 
00091   c = EOF;
00092   while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
00093     curStr.streamClose();
00094     curStr.free();
00095     ++strPtr;
00096     if (strPtr < streams->getLength()) {
00097       streams->get(strPtr, &curStr);
00098       curStr.streamReset();
00099     }
00100   }
00101   return c;
00102 }
00103 
00104 int Lexer::lookChar() {
00105   if (curStr.isNone()) {
00106     return EOF;
00107   }
00108   return curStr.streamLookChar();
00109 }
00110 
00111 Object *Lexer::getObj(Object *obj) {
00112   char *p;
00113   int c, c2;
00114   GBool comment, neg, done;
00115   int numParen;
00116   int xi;
00117   double xf, scale;
00118   GString *s;
00119   int n, m;
00120 
00121   // skip whitespace and comments
00122   comment = gFalse;
00123   while (1) {
00124     if ((c = getChar()) == EOF) {
00125       return obj->initEOF();
00126     }
00127     if (comment) {
00128       if (c == '\r' || c == '\n')
00129        comment = gFalse;
00130     } else if (c == '%') {
00131       comment = gTrue;
00132     } else if (specialChars[c] != 1) {
00133       break;
00134     }
00135   }
00136 
00137   // start reading token
00138   switch (c) {
00139 
00140   // number
00141   case '0': case '1': case '2': case '3': case '4':
00142   case '5': case '6': case '7': case '8': case '9':
00143   case '-': case '.':
00144     neg = gFalse;
00145     xi = 0;
00146     if (c == '-') {
00147       neg = gTrue;
00148     } else if (c == '.') {
00149       goto doReal;
00150     } else {
00151       xi = c - '0';
00152     }
00153     while (1) {
00154       c = lookChar();
00155       if (isdigit(c)) {
00156        getChar();
00157        xi = xi * 10 + (c - '0');
00158       } else if (c == '.') {
00159        getChar();
00160        goto doReal;
00161       } else {
00162        break;
00163       }
00164     }
00165     if (neg)
00166       xi = -xi;
00167     obj->initInt(xi);
00168     break;
00169   doReal:
00170     xf = xi;
00171     scale = 0.1;
00172     while (1) {
00173       c = lookChar();
00174       if (!isdigit(c)) {
00175        break;
00176       }
00177       getChar();
00178       xf = xf + scale * (c - '0');
00179       scale *= 0.1;
00180     }
00181     if (neg)
00182       xf = -xf;
00183     obj->initReal(xf);
00184     break;
00185 
00186   // string
00187   case '(':
00188     p = tokBuf;
00189     n = 0;
00190     numParen = 1;
00191     done = gFalse;
00192     s = NULL;
00193     do {
00194       c2 = EOF;
00195       switch (c = getChar()) {
00196 
00197       case EOF:
00198 #if 0
00199       // This breaks some PDF files, e.g., ones from Photoshop.
00200       case '\r':
00201       case '\n':
00202 #endif
00203        error(getPos(), "Unterminated string");
00204        done = gTrue;
00205        break;
00206 
00207       case '(':
00208        ++numParen;
00209        c2 = c;
00210        break;
00211 
00212       case ')':
00213        if (--numParen == 0) {
00214          done = gTrue;
00215        } else {
00216          c2 = c;
00217        }
00218        break;
00219 
00220       case '\\':
00221        switch (c = getChar()) {
00222        case 'n':
00223          c2 = '\n';
00224          break;
00225        case 'r':
00226          c2 = '\r';
00227          break;
00228        case 't':
00229          c2 = '\t';
00230          break;
00231        case 'b':
00232          c2 = '\b';
00233          break;
00234        case 'f':
00235          c2 = '\f';
00236          break;
00237        case '\\':
00238        case '(':
00239        case ')':
00240          c2 = c;
00241          break;
00242        case '0': case '1': case '2': case '3':
00243        case '4': case '5': case '6': case '7':
00244          c2 = c - '0';
00245          c = lookChar();
00246          if (c >= '0' && c <= '7') {
00247            getChar();
00248            c2 = (c2 << 3) + (c - '0');
00249            c = lookChar();
00250            if (c >= '0' && c <= '7') {
00251              getChar();
00252              c2 = (c2 << 3) + (c - '0');
00253            }
00254          }
00255          break;
00256        case '\r':
00257          c = lookChar();
00258          if (c == '\n') {
00259            getChar();
00260          }
00261          break;
00262        case '\n':
00263          break;
00264        case EOF:
00265          error(getPos(), "Unterminated string");
00266          done = gTrue;
00267          break;
00268        default:
00269          c2 = c;
00270          break;
00271        }
00272        break;
00273 
00274       default:
00275        c2 = c;
00276        break;
00277       }
00278 
00279       if (c2 != EOF) {
00280        if (n == tokBufSize) {
00281          if (!s)
00282            s = new GString(tokBuf, tokBufSize);
00283          else
00284            s->append(tokBuf, tokBufSize);
00285          p = tokBuf;
00286          n = 0;
00287        }
00288        *p++ = (char)c2;
00289        ++n;
00290       }
00291     } while (!done);
00292     if (!s)
00293       s = new GString(tokBuf, n);
00294     else
00295       s->append(tokBuf, n);
00296     obj->initString(s);
00297     break;
00298 
00299   // name
00300   case '/':
00301     p = tokBuf;
00302     n = 0;
00303     while ((c = lookChar()) != EOF && !specialChars[c]) {
00304       getChar();
00305       if (c == '#') {
00306        c2 = lookChar();
00307        if (c2 >= '0' && c2 <= '9') {
00308          c = c2 - '0';
00309        } else if (c2 >= 'A' && c2 <= 'F') {
00310          c = c2 - 'A' + 10;
00311        } else if (c2 >= 'a' && c2 <= 'f') {
00312          c = c2 - 'a' + 10;
00313        } else {
00314          goto notEscChar;
00315        }
00316        getChar();
00317        c <<= 4;
00318        c2 = getChar();
00319        if (c2 >= '0' && c2 <= '9') {
00320          c += c2 - '0';
00321        } else if (c2 >= 'A' && c2 <= 'F') {
00322          c += c2 - 'A' + 10;
00323        } else if (c2 >= 'a' && c2 <= 'f') {
00324          c += c2 - 'a' + 10;
00325        } else {
00326          error(getPos(), "Illegal digit in hex char in name");
00327        }
00328       }
00329      notEscChar:
00330       if (++n == tokBufSize) {
00331        error(getPos(), "Name token too long");
00332        break;
00333       }
00334       *p++ = c;
00335     }
00336     *p = '\0';
00337     obj->initName(tokBuf);
00338     break;
00339 
00340   // array punctuation
00341   case '[':
00342   case ']':
00343     tokBuf[0] = c;
00344     tokBuf[1] = '\0';
00345     obj->initCmd(tokBuf);
00346     break;
00347 
00348   // hex string or dict punctuation
00349   case '<':
00350     c = lookChar();
00351 
00352     // dict punctuation
00353     if (c == '<') {
00354       getChar();
00355       tokBuf[0] = tokBuf[1] = '<';
00356       tokBuf[2] = '\0';
00357       obj->initCmd(tokBuf);
00358 
00359     // hex string
00360     } else {
00361       p = tokBuf;
00362       m = n = 0;
00363       c2 = 0;
00364       s = NULL;
00365       while (1) {
00366        c = getChar();
00367        if (c == '>') {
00368          break;
00369        } else if (c == EOF) {
00370          error(getPos(), "Unterminated hex string");
00371          break;
00372        } else if (specialChars[c] != 1) {
00373          c2 = c2 << 4;
00374          if (c >= '0' && c <= '9')
00375            c2 += c - '0';
00376          else if (c >= 'A' && c <= 'F')
00377            c2 += c - 'A' + 10;
00378          else if (c >= 'a' && c <= 'f')
00379            c2 += c - 'a' + 10;
00380          else
00381            error(getPos(), "Illegal character <%02x> in hex string", c);
00382          if (++m == 2) {
00383            if (n == tokBufSize) {
00384              if (!s)
00385               s = new GString(tokBuf, tokBufSize);
00386              else
00387               s->append(tokBuf, tokBufSize);
00388              p = tokBuf;
00389              n = 0;
00390            }
00391            *p++ = (char)c2;
00392            ++n;
00393            c2 = 0;
00394            m = 0;
00395          }
00396        }
00397       }
00398       if (!s)
00399        s = new GString(tokBuf, n);
00400       else
00401        s->append(tokBuf, n);
00402       if (m == 1)
00403        s->append((char)(c2 << 4));
00404       obj->initString(s);
00405     }
00406     break;
00407 
00408   // dict punctuation
00409   case '>':
00410     c = lookChar();
00411     if (c == '>') {
00412       getChar();
00413       tokBuf[0] = tokBuf[1] = '>';
00414       tokBuf[2] = '\0';
00415       obj->initCmd(tokBuf);
00416     } else {
00417       error(getPos(), "Illegal character '>'");
00418       obj->initError();
00419     }
00420     break;
00421 
00422   // error
00423   case ')':
00424   case '{':
00425   case '}':
00426     error(getPos(), "Illegal character '%c'", c);
00427     obj->initError();
00428     break;
00429 
00430   // command
00431   default:
00432     p = tokBuf;
00433     *p++ = c;
00434     n = 1;
00435     while ((c = lookChar()) != EOF && !specialChars[c]) {
00436       getChar();
00437       if (++n == tokBufSize) {
00438        error(getPos(), "Command token too long");
00439        break;
00440       }
00441       *p++ = c;
00442     }
00443     *p = '\0';
00444     if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
00445       obj->initBool(gTrue);
00446     } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
00447       obj->initBool(gFalse);
00448     } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
00449       obj->initNull();
00450     } else {
00451       obj->initCmd(tokBuf);
00452     }
00453     break;
00454   }
00455 
00456   return obj;
00457 }
00458 
00459 void Lexer::skipToNextLine() {
00460   int c;
00461 
00462   while (1) {
00463     c = getChar();
00464     if (c == EOF || c == '\n') {
00465       return;
00466     }
00467     if (c == '\r') {
00468       if ((c = lookChar()) == '\n') {
00469        getChar();
00470       }
00471       return;
00472     }
00473   }
00474 }