Back to index

tetex-bin  3.0
pdfparse.c
Go to the documentation of this file.
00001 /*  $Header$
00002     This is dvipdfm, a DVI to PDF translator.
00003     Copyright (C) 1998, 1999 by Mark A. Wicks
00004 
00005     This program is free software; you can redistribute it and/or modify
00006     it under the terms of the GNU General Public License as published by
00007     the Free Software Foundation; either version 2 of the License, or
00008     (at your option) any later version.
00009 
00010     This program is distributed in the hope that it will be useful,
00011     but WITHOUT ANY WARRANTY; without even the implied warranty of
00012     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013     GNU General Public License for more details.
00014 
00015     You should have received a copy of the GNU General Public License
00016     along with this program; if not, write to the Free Software
00017     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00018     
00019     The author may be contacted via the e-mail address
00020 
00021        mwicks@kettering.edu
00022 */
00023 
00024        
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 #include <ctype.h>
00028 #include <string.h>
00029 #include "system.h"
00030 #include "mem.h"
00031 #include "mfileio.h"
00032 #include "numbers.h"
00033 #include "dvi.h"
00034 #include "pdfparse.h"
00035 #include "pdfspecial.h"
00036 #include "pdfobj.h"
00037 #include "pdfdoc.h"
00038 #include "pdfdev.h"
00039 
00040 #define verbose 0
00041 #define debug 0
00042 
#define DUMP_LIMIT 50
/* Print a diagnostic excerpt of the input buffer [start, end) on stderr.
 *
 * At most DUMP_LIMIT bytes are shown between "-->" and "<--".  An
 * ellipsis is printed only when the buffer really holds more bytes than
 * were shown.  The buffer itself is never modified.
 */
void dump(char *start, char *end)
{
  size_t remaining, shown;

  /* Work with byte counts so that no pointer beyond `end' is ever formed,
     even when the buffer is shorter than DUMP_LIMIT. */
  remaining = (end > start) ? (size_t) (end - start) : 0;
  shown = (remaining < DUMP_LIMIT) ? remaining : DUMP_LIMIT;
  fprintf (stderr, "\nCurrent input buffer is ");
  fprintf (stderr, "-->");
  if (shown > 0)
    fwrite (start, 1, shown, stderr);
  if (remaining > shown)
    fprintf (stderr, "...\n");
  fprintf (stderr, "<--\n");
}
00055 
/* Advance *start past white space and '%' comments, never beyond `end'.
 *
 * A '%' introduces a comment; the remainder of that line is discarded
 * with skip_line().  On return *start is either equal to `end' or points
 * at the first character that is neither white space nor '%'.
 */
void skip_white (char **start, char *end)
{
  while (*start < end) {
    if (**start == '%')
      skip_line (start, end);
    /* <ctype.h> functions require a value representable as unsigned char;
       passing a negative plain char is undefined behaviour. */
    else if (isspace ((unsigned char) **start))
      (*start)++;
    else
      break;
  }
  return;
}
00066 
/* Move *start to the beginning of the next line, stopping at `end'.
 *
 * Everything up to the first '\r' or '\n' is skipped, then one line
 * terminator is consumed: a lone '\r', a lone '\n', or the pair "\r\n".
 */
void skip_line (char **start, char *end)
{
  char *cursor = *start;

  /* Note: PDF spec says that all platforms must end line with '\n'
     after a "stream" keyword */
  for (; cursor < end; cursor++) {
    if (*cursor == '\n' || *cursor == '\r')
      break;
  }
  if (cursor < end && *cursor == '\r')
    cursor++;
  if (cursor < end && *cursor == '\n')
    cursor++;
  *start = cursor;
  return;
}
00079 
/* Complain when anything other than white space or comments remains in
 * [*start, end).  The leftover input is reported on stderr via dump().
 */
void parse_crap (char **start, char *end)
{
  skip_white(start, end);
  if (*start == end)
    return;   /* nothing left over */
  fprintf (stderr, "\nCrap left over after object!!\n");
  dump(*start, end);
}
00088 
/* Return 1 when the NUL-terminated string `s' is a decimal integer:
 * an optional leading '-' followed by one or more digits.  Return 0
 * otherwise, including for the empty string and for a lone "-".
 */
int is_an_int(const char *s)
{
  const char *p = s;

  if (*p == '-')
    p++;
  if (*p == '\0')
    return 0;   /* no digits at all */
  for (; *p != '\0'; p++) {
    /* Cast: <ctype.h> functions must not be given a negative char. */
    if (!isdigit ((unsigned char) *p))
      return 0;
  }
  return 1;
}
00100 
/* Return 1 when the NUL-terminated string `s' is a decimal number:
 * an optional leading '-', digits, and at most one '.', with at least
 * one digit overall (e.g. "3", "-0.5", ".5", "5.").  Return 0 otherwise,
 * including for "", "-", "." and "-.".
 */
int is_a_number(const char *s)
{
  const char *p = s;
  int period = 0, seen_digit = 0;

  if (*p == '-')
    p++;
  for (; *p != '\0'; p++) {
    if (*p == '.' && !period) {
      period = 1;
      continue;
    }
    /* Cast: <ctype.h> functions must not be given a negative char. */
    if (!isdigit ((unsigned char) *p))
      return 0;
    seen_digit = 1;
  }
  return seen_digit;
}
00116 
00117 
00118 pdf_obj *parse_pdf_dict (char **start, char *end)
00119 {
00120   pdf_obj *result, *tmp1, *tmp2;
00121   char *save = *start;
00122   skip_white(start, end);
00123   if (*((*start)++) != '<' ||
00124       *((*start)++) != '<') {
00125     *start = save;
00126     dump (*start, end);
00127     return NULL;
00128   }
00129   result = pdf_new_dict ();
00130     skip_white(start, end);
00131   while (*start < end &&
00132         **start != '>') {
00133     if ((tmp1 = parse_pdf_name (start, end)) == NULL) {
00134       pdf_release_obj (result); 
00135       {
00136        *start = save;
00137        dump (*start, end);
00138        return NULL;
00139       }
00140     };
00141     if ((tmp2 = parse_pdf_object (start, end)) == NULL) {
00142       pdf_release_obj (result);
00143       pdf_release_obj (tmp1); 
00144       {
00145        *start = save;
00146        dump (*start, end);
00147        return NULL;
00148       }
00149     }
00150     pdf_add_dict (result, tmp1, tmp2);
00151     skip_white(start, end);
00152   }
00153   if (*start >= end) {
00154     pdf_release_obj (result);
00155     *start = save;
00156     dump (*start, end);
00157     return NULL;
00158   }
00159   if (*((*start)++) == '>' &&
00160       *((*start)++) == '>') {
00161     return result;
00162   } else {
00163     pdf_release_obj (result);
00164     fprintf (stderr, "\nDictionary object ended prematurely\n");
00165     *start = save;
00166     dump (*start, end);
00167     return NULL;
00168   }
00169 }
00170 
00171 pdf_obj *parse_pdf_array (char **start, char *end)
00172 {
00173   pdf_obj *result, *tmp1;
00174 #ifdef MEM_DEBUG
00175 MEM_START
00176 #endif
00177   skip_white(start, end);
00178   if (*((*start)++) != '[')
00179     return NULL;
00180   result = pdf_new_array ();
00181   skip_white(start, end);
00182   while (*start < end &&
00183         **start != ']') {
00184     if ((tmp1 = parse_pdf_object (start, end)) == NULL) {
00185       pdf_release_obj (result);
00186       return NULL;
00187     };
00188     pdf_add_array (result, tmp1);
00189     skip_white(start, end);
00190   }
00191   if (*start >= end) {
00192     pdf_release_obj (result);
00193     fprintf (stderr, "\nArray ended prematurely\n");
00194     return NULL;
00195   }
00196   (*start)++;
00197 #ifdef MEM_DEBUG
00198 MEM_END
00199 #endif
00200   return result;
00201 }
00202 
00203 char *parse_number (char **start, char *end)
00204 {
00205   char *number, *save;
00206 #ifdef MEM_DEBUG
00207 MEM_START
00208 #endif
00209   skip_white(start, end);
00210   save = *start;
00211   if (*start < end && (**start == '+' || **start == '-')) {
00212     *start += 1;
00213   }
00214   while (*start < end &&
00215         isdigit(**start))
00216     (*start)++;
00217   if (*start < end && **start == '.') {
00218     (*start)++;
00219     while (*start < end &&
00220           isdigit(**start))
00221       (*start)++;
00222   }
00223   if (*start > save) {
00224     number = NEW ((*start-save)+1, char);
00225     memcpy (number, save, (*start-save));
00226     number[*start-save] = 0;
00227     return number;
00228   }
00229   *start = save;
00230 #ifdef MEM_DEBUG
00231 MEM_END
00232 #endif
00233   return NULL;
00234 }
00235 
00236 char *parse_unsigned (char **start, char *end)
00237 {
00238   char *number, *save;
00239 #ifdef MEM_DEBUG
00240 MEM_START
00241 #endif
00242   skip_white(start, end);
00243   save = *start;
00244   while (*start < end &&
00245         isdigit(**start))
00246     (*start)++;
00247   if (*start > save) {
00248     number = NEW ((*start-save)+1, char);
00249     memcpy (number, save, (*start-save));
00250     number[*start-save] = 0;
00251     return number;
00252   }
00253   *start = save;
00254 #ifdef MEM_DEBUG
00255 MEM_END
00256 #endif
00257   return NULL;
00258 }
00259 
00260 static char *parse_gen_ident (char **start, char *end, char *valid_chars)
00261 {
00262   char *ident, *save;
00263   save = *start;
00264   skip_white(start, end);
00265   while (*start < end && strchr (valid_chars, **start))
00266     (*start)++;
00267   if (save == *start)
00268     return NULL;
00269   ident = NEW (*start-save+1, char);
00270   memcpy (ident, save, *start-save);
00271   ident[*start-save] = 0;
00272   return ident;
00273 }
00274 
/* Scan a general identifier; see parse_gen_ident() for the result and
 * the treatment of *start.  The accepted character set is given below.
 */
char *parse_ident (char **start, char *end)
{
  static char *ident_chars =
    "!\"#$&'*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";
  char *ident;

  ident = parse_gen_ident (start, end, ident_chars);
  return ident;
}
00281 
/* Scan an identifier used as the value of a key=value pair.  Compared
 * with parse_ident(), the accepted set additionally contains '/' and
 * does not contain '='.
 */
char *parse_val_ident (char **start, char *end)
{
  static char *value_chars =
    "!\"#$&'*+,-./0123456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";
  char *ident;

  ident = parse_gen_ident (start, end, value_chars);
  return ident;
}
00288 
/* Scan a C-like identifier: digits, ASCII letters, '_' and '@' only. */
char *parse_c_ident (char **start, char *end)
{
  static char *c_ident_chars =
    "0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz";
  char *ident;

  ident = parse_gen_ident (start, end, c_ident_chars);
  return ident;
}
00295 
/* Parse an optional "@identifier".  When the next character is '@' it
 * is consumed and the result of parse_ident() on what follows is
 * returned; otherwise NULL is returned and nothing is consumed.
 */
char *parse_opt_ident(char **start, char *end)
{
  if (*start < end && **start == '@') {
    (*start)++;
    return parse_ident(start, end);
  }
  return NULL;
}
00303 
00304 
00305 pdf_obj *parse_pdf_name (char **start, char *end)
00306 {
00307   pdf_obj *result;
00308   char *name;
00309   skip_white(start, end);
00310   if (**start != '/') {
00311     fprintf (stderr, "\nPDF Name expected and not found.\n");
00312     dump(*start, end);
00313     return NULL;
00314   }
00315   (*start)++;
00316   if ((name = parse_ident(start, end)) != NULL) {
00317     result = pdf_new_name (name);
00318     RELEASE (name);
00319     return result;
00320   }
00321   return NULL;
00322 }
00323 
/* Parse a named reference "@name" and return the name (without '@') as
 * scanned by parse_ident().
 *
 * If the next non-white character is not '@' (or the input is
 * exhausted) an error is reported on stderr and NULL is returned.
 */
char *parse_pdf_reference(char **start, char *end)
{
  skip_white (start, end);
  /* Do not look at **start once the end of input has been reached. */
  if (*start >= end || **start != '@') {
    fprintf (stderr, "\nPDF reference expected and not found.\n");
    dump(*start, end);
    return NULL;
  }
  (*start)++;
  return parse_ident(start, end);
}
00335 
00336 pdf_obj *parse_pdf_boolean (char **start, char *end)
00337 {
00338   skip_white (start, end);
00339   if (end-*start > strlen ("true") &&
00340       !strncmp (*start, "true", strlen("true"))) {
00341     *start += strlen("true");
00342     return pdf_new_boolean (1);
00343   }
00344   if (end - *start > strlen ("false") &&
00345       !strncmp (*start, "false", strlen("false"))) {
00346     *start += strlen("false");
00347     return pdf_new_boolean (0);
00348   }
00349   return NULL;
00350 }
00351 
00352 pdf_obj *parse_pdf_null (char **start, char *end)
00353 {
00354   char *save = *start;
00355   char *ident;
00356   skip_white (start, end);
00357   ident = parse_ident(start, end);
00358   if (!strcmp (ident, "null")) {
00359     RELEASE(ident);
00360     return pdf_new_null();
00361   }
00362   *start = save;
00363   fprintf (stderr, "\nNot a valid object\n");
00364   dump(*start, end);
00365   return NULL;
00366 }
00367 
00368 static pdf_obj *parse_pdf_number (char **start, char *end)
00369 {
00370   char *number;
00371   pdf_obj *result;
00372   skip_white(start, end);
00373   if ((number = parse_number(start, end)) != NULL) {
00374     result = pdf_new_number (atof(number));
00375     RELEASE (number);
00376     return result;
00377   }
00378   return NULL;
00379 }
00380 
/* Convert one hexadecimal digit character to its value (0..15).
 * Upper- and lower-case letters are both accepted.  Any character that
 * is not a hexadecimal digit yields 0.
 */
int xtod (char c) 
{
  int value = 0;   /* result for characters that are not hex digits */

  if (c >= 'a' && c <= 'f')
    value = 10 + (c - 'a');
  else if (c >= 'A' && c <= 'F')
    value = 10 + (c - 'A');
  else if (c >= '0' && c <= '9')
    value = c - '0';
  return value;
}
00391 
00392 pdf_obj *parse_pdf_hex_string (char **start, char *end)
00393 {
00394   pdf_obj *result;
00395   char *save;
00396   unsigned char *string = NULL;
00397   int strlength;
00398   skip_white (start, end);
00399   if (*start == end || *((*start)++) != '<')
00400     return NULL;
00401   save = *start;
00402   string = NEW ((end - *start)/2+2, unsigned char); /* A little excess here */
00403   strlength = 0;
00404   while (*start < end && **start != '>') {
00405     string[strlength] = xtod(**start) * 16;
00406     (*start) += 1;
00407     if (*start < end && **start != '>') {
00408       string[strlength] += xtod(**start);
00409       (*start) += 1;
00410     }
00411     skip_white (start, end);
00412     strlength += 1;
00413   }
00414   if (*start < end) {
00415      *start += 1;
00416      result = pdf_new_string (string, strlength);
00417   } else {
00418      result = NULL;
00419   }
00420   if (string)
00421      RELEASE(string);
00422   return result;
00423 }
00424 
00425 pdf_obj *parse_pdf_string (char **start, char *end)
00426 {
00427   pdf_obj *result;
00428   int balance = 0;
00429   char *save;
00430   unsigned char *string;
00431   int strlength;
00432   skip_white(start, end);
00433   save = *start;
00434   if (*start == end || **start != '(') {
00435     return NULL;
00436   }
00437   ++(*start);
00438   string = NEW (end - *start, unsigned char);
00439   strlength = 0;
00440   balance = 0;
00441   while (*start < end &&
00442         (**start != ')' || balance > 0)) {
00443     if (**start == '\\')
00444       switch (*(++(*start))) {
00445       case 'n':
00446        string[strlength++] = '\n';
00447        (*start)++;
00448        break;
00449       case 'r':
00450        string[strlength++] = '\r';
00451        (*start)++;
00452        break;
00453       case 't':
00454        string[strlength++] = '\t';
00455        (*start)++;
00456        break;
00457       case 'b':
00458        string[strlength++] = '\b';
00459        (*start)++;
00460        break;
00461       default:
00462        if (isdigit(**start)) {
00463          int i;
00464          string[strlength] = 0;
00465          for (i=0; i<3; i++) 
00466            string[strlength] = string[strlength]*8 + (*((*start)++)-'0');
00467          strlength+= 1;
00468        } else {
00469          string[strlength++] = *((*start)++);
00470        }
00471       }
00472     else {
00473       if (**start == '(')
00474        balance += 1;
00475       if (**start == ')')
00476        balance -= 1;
00477       string[strlength++] = *((*start)++);
00478     }
00479   }
00480   if (*start >= end) {
00481     fprintf (stderr, "\nString object ended prematurely\n");
00482     dump (save, *start);
00483     return NULL;
00484   }
00485   (*start)++;
00486   result = pdf_new_string (string, strlength);
00487   RELEASE (string);
00488   return result;
00489 }
00490 
00491 char *parse_c_string (char **start, char *end)
00492 {
00493   char *string, *save;
00494   int strlength;
00495   skip_white(start, end);
00496   save = *start;
00497   if (*start == end || **start != '"') {
00498     return NULL;
00499   }
00500   ++(*start);
00501   string = NEW (end - *start, char);
00502   strlength = 0;
00503   while (*start < end && (**start != '"')) {
00504     if (**start == '\\')
00505       switch (*(++(*start))) {
00506       case '"':
00507        string[strlength++] = '"';
00508        (*start)++;
00509        break;
00510       case 'n':
00511        string[strlength++] = '\n';
00512        (*start)++;
00513        break;
00514       case 'r':
00515        string[strlength++] = '\r';
00516        (*start)++;
00517        break;
00518       case 't':
00519        string[strlength++] = '\t';
00520        (*start)++;
00521        break;
00522       case 'b':
00523        string[strlength++] = '\b';
00524        (*start)++;
00525        break;
00526       default:
00527        if (isdigit(**start)) {
00528          int i;
00529          string[strlength] = 0;
00530          for (i=0; i<3; i++) 
00531            string[strlength] = string[strlength]*8 + (*((*start)++)-'0');
00532          strlength+= 1;
00533        } else {
00534          string[strlength++] = *((*start)++);
00535        }
00536       }
00537     else {
00538       string[strlength++] = *((*start)++);
00539     }
00540     string[strlength]=0;
00541   }
00542   if (*start >= end) {
00543     fprintf (stderr, "\nString ended prematurely\n");
00544     dump (save, *start);
00545     return NULL;
00546   }
00547   string[strlength] = 0;
00548   (*start)++;
00549   return string;
00550 }
00551 
00552 static pdf_obj *parse_pdf_stream (char **start, char *end, pdf_obj
00553                               *dict)
00554 {
00555   pdf_obj *result, *new_dict, *tmp1, *length_obj;
00556   unsigned long length;
00557   if (pdf_lookup_dict(dict, "F")) {
00558     fprintf (stderr, "File streams not implemented (yet)");
00559     return NULL;
00560   }
00561   if ((tmp1 = pdf_lookup_dict(dict, "Length")) == NULL) {
00562     fprintf (stderr, "No length specified");
00563     return NULL;
00564   }
00565   length = pdf_number_value (length_obj = pdf_deref_obj (tmp1));
00566   pdf_release_obj (length_obj);
00567   skip_white(start, end);
00568   skip_line(start, end);
00569   result = pdf_new_stream(0);
00570   new_dict = pdf_stream_dict(result);
00571   pdf_merge_dict (new_dict, dict);
00572   pdf_release_obj (dict);
00573   pdf_add_stream (result, *start, length);
00574   *start += length;
00575   skip_white(start, end);
00576   if (*start+strlen("endstream") > end ||
00577       strncmp(*start, "endstream", strlen("endstream"))) {
00578     fprintf (stderr, "\nendstream not found\n");
00579     return NULL;
00580   }
00581   *start += strlen("endstream");
00582   return result;
00583 }
00584 
00585 pdf_obj *parse_pdf_object (char **start, char *end)
00586 {
00587   pdf_obj *result, *tmp1=NULL, *tmp2=NULL;
00588   char *save = *start;
00589   char *position2;
00590   skip_white(start, end);
00591   if (*start >= end)
00592     return NULL;
00593   switch (**start) {
00594   case '<': 
00595     /* Check for those troublesome strings starting with '<' */
00596     if (*start+1 < end && *(*start+1) != '<') {
00597       result = parse_pdf_hex_string (start, end);
00598       break;
00599     }
00600     result = parse_pdf_dict (start, end);
00601     skip_white(start, end);
00602     if (end - *start > strlen("stream") &&
00603        !strncmp(*start, "stream", strlen("stream"))) {
00604       result = parse_pdf_stream (start, end, result);
00605     }
00606     /* Check for stream */
00607     break;
00608   case '(':
00609     result = parse_pdf_string(start, end);
00610     break;
00611   case '[':
00612     result = parse_pdf_array(start, end);
00613     break;
00614   case '/':
00615     result = parse_pdf_name(start, end);
00616     break;
00617   case '@':
00618     result = get_reference(start, end);
00619     break;
00620   case 't':
00621   case 'f':
00622     result = parse_pdf_boolean(start, end);
00623     break;
00624   default:
00625     /* This is a bit of a hack, but PDF doesn't easily allow you to
00626        tell a number from an indirect object reference with some
00627        serious looking ahead */
00628     
00629     if (*start < end && 
00630        (isdigit(**start) || **start == '+' || **start == '-' || **start == '.')) {
00631       tmp1 = parse_pdf_number(start, end);
00632       tmp2 = NULL;
00633       /* This could be a # # R type reference.  We can't be sure unless
00634         we look ahead for the second number and the 'R' */
00635       skip_white(start, end);
00636       position2 = *start;
00637       if (*start < end && isdigit(**start)) {
00638        tmp2 = parse_pdf_number(start, end);
00639       } else
00640        tmp2 = NULL;
00641       skip_white(start, end);
00642       if (tmp1 != NULL && tmp2 != NULL && *start < end && *((*start)++) == 'R') {
00643        result = pdf_new_ref ((unsigned long) pdf_number_value (tmp1), 
00644                            (int) pdf_number_value (tmp2));
00645        pdf_release_obj (tmp1);
00646        pdf_release_obj (tmp2);
00647        break;
00648       }
00649       /* Following checks if we got two numbers, but not 'r' */
00650       if (tmp1 != NULL && tmp2 != NULL) {
00651        pdf_release_obj (tmp2);
00652        *start = position2;
00653       }
00654       result = tmp1;
00655       break;
00656     }
00657     if (*start < end && **start == 'n') {
00658       result = parse_pdf_null(start, end);
00659       break;
00660     }
00661     result = NULL;
00662     break;
00663   }
00664   if (result == NULL) {
00665     fprintf (stderr, "\nExpecting an object, but didn't find one");
00666     *start = save;
00667     dump(*start, end);
00668   }
00669   return result;
00670 }
00671 
/* Parse one "key=value" pair from [*start, end).
 *
 * *key receives the identifier found by parse_c_ident(), or NULL when
 * there is none.  *val receives the value when the key is followed by
 * '=': a double-quoted value is read with parse_c_string(), anything
 * else with parse_val_ident().  *val stays NULL when there is no '=' or
 * nothing follows it.
 */
void parse_key_val (char **start, char *end, char **key, char **val) 
{
  *key = NULL;
  *val = NULL;
  skip_white (start, end);
  *key = parse_c_ident (start, end);
  if (*key == NULL)
    return;
  skip_white (start, end);
  if (*start >= end || **start != '=')
    return;   /* key without a value */
  (*start) += 1;
  skip_white (start, end);
  if (*start >= end)
    return;
  if (**start == '"')
    *val = parse_c_string (start, end);
  else
    *val = parse_val_ident (start, end);
}
00692 }