Back to index

texmacs  1.0.7.15
pdfparse.c
Go to the documentation of this file.
00001 /*  $Header: /home/cvsroot/dvipdfmx/src/pdfparse.c,v 1.44 2009/04/29 11:20:21 chofchof Exp $
00002 
00003     This is dvipdfmx, an eXtended version of dvipdfm by Mark A. Wicks.
00004 
00005     Copyright (C) 2007 by Jin-Hwan Cho and Shunsaku Hirata,
00006     the dvipdfmx project team <dvipdfmx@project.ktug.or.kr>
00007     
00008     Copyright (C) 1998, 1999 by Mark A. Wicks <mwicks@kettering.edu>
00009 
00010     This program is free software; you can redistribute it and/or modify
00011     it under the terms of the GNU General Public License as published by
00012     the Free Software Foundation; either version 2 of the License, or
00013     (at your option) any later version.
00014     
00015     This program is distributed in the hope that it will be useful,
00016     but WITHOUT ANY WARRANTY; without even the implied warranty of
00017     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018     GNU General Public License for more details.
00019     
00020     You should have received a copy of the GNU General Public License
00021     along with this program; if not, write to the Free Software
00022     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
00023 */
00024 
00025 #if HAVE_CONFIG_H
00026 #include "config.h"
00027 #endif
00028 
00029 #include <ctype.h>
00030 #include <string.h>
00031 
00032 #include "system.h"
00033 #include "mem.h"
00034 #include "error.h"
00035 
00036 #include "numbers.h"
00037 
00038 #include "mfileio.h"
00039 
00040 #include "pdfobj.h"
00041 #include "pdfdoc.h"
00042 #include "pdfdev.h"
00043 
00044 #include "pdfparse.h"
00045 
00046 
/* Strict mode switch: the sections of this file wrapped in
 * "#ifndef PDF_PARSE_STRICT" are compiled out while this is defined. */
#define PDF_PARSE_STRICT 1

/* PDF lexical character classes.  Any earlier definitions of these two
 * macro names are dropped so that the PDF-specific ones below apply. */
#ifdef  is_space
#undef  is_space
#endif
#ifdef  is_delim
#undef  is_delim
#endif

/* White-space characters: SP, HT, FF, CR, LF and NUL. */
#define is_space(c) ((c) == ' '  || (c) == '\t' || (c) == '\f' || \
                     (c) == '\r' || (c) == '\n' || (c) == '\0')
/* Delimiter characters: ( ) < > [ ] { } / %
 * The closing paren and the two braces were missing from the earlier
 * definition although the PDF specification lists them as delimiters. */
#define is_delim(c) ((c) == '(' || (c) == ')' || \
                     (c) == '/' || \
                     (c) == '<' || (c) == '>' || \
                     (c) == '[' || (c) == ']' || \
                     (c) == '{' || (c) == '}' || \
                     (c) == '%')
/* True when p has reached e or points at a white-space/delimiter byte. */
#define PDF_TOKEN_END(p,e) ((p) >= (e) || is_space(*(p)) || is_delim(*(p)))

/* True when c terminates a regular token. */
#define istokensep(c) (is_space((c)) || is_delim((c)))
00065 
/* File-local parser state.  The only writer of 'tainted' in this file is
 * parse_pdf_tainted_dict() in builds without PDF_PARSE_STRICT. */
static struct {
  int tainted;
} parser_state = { 0 };

/* Hex digit to value; the definition sits next to the hex-string parser. */
static int xtoi (char ch);

/* Input position remembered by SAVE() and reused by DUMP_RESTORE(). */
static char *save = NULL;
00075 
/* Maximum number of input bytes echoed by dump(). */
#define DUMP_LIMIT 50

/*
 * Print the input range [start, end) through MESG(), framed by
 * "-->" and "<--".  At most DUMP_LIMIT bytes are shown; "..." is appended
 * when exactly DUMP_LIMIT bytes were printed.
 */
void
dump (const char *start, const char *end)
{
  const char *limit = start + DUMP_LIMIT;
  const char *cur;

  MESG("\nCurrent input buffer is -->");
  for (cur = start; cur < end && cur < limit; cur++) {
    MESG("%c", *cur);
  }
  if (cur == limit) {
    MESG("...");
  }
  MESG("<--\n");
}
00089 
/* SAVE(s,e): remember position s in the file-scope 'save' pointer.
 * The e argument is accepted for symmetry and is not used. */
#define SAVE(s,e) do {\
   save = (s);\
 } while (0)
/* DUMP_RESTORE(s,e): dump the input from the saved position up to e, then
 * rewind s to the saved position.  The macro now uses its own e argument;
 * it used to pick up whatever variable named 'end' the caller had. */
#define DUMP_RESTORE(s,e) do {\
   dump(save, (e));\
   (s) = save;\
 } while (0)
00097 
/*
 * Advance *start to the beginning of the next line, never going past end.
 *
 * CR, LF and the pair CR LF each count as a single end-of-line marker and
 * are consumed; if no marker is found *start ends up equal to end.
 */
void
skip_line (char **start, char *end)
{
  char *cur = *start;

  /* Run forward to the first CR or LF (or to the end of the buffer). */
  while (cur < end) {
    if (*cur == '\n' || *cur == '\r')
      break;
    cur++;
  }

  /* Swallow exactly one EOL marker: CR, LF, or CR immediately followed
   * by LF. */
  if (cur < end && *cur == '\r')
    cur++;
  if (cur < end && *cur == '\n')
    cur++;

  *start = cur;
}
00113 
/*
 * Advance *start past PDF white-space and '%' comments, stopping at end.
 *
 * The is_space() macro is used instead of isspace() because the two
 * disagree: PDF counts NUL (0x00) as white-space and does not count the
 * vertical tab (0x0B), whereas isspace() does the opposite for both.
 * A '%' starts a comment that runs to the end of the line.
 */
void
skip_white (char **start, char *end)
{
  for (;;) {
    if (*start >= end)
      return;

    if (**start == '%')
      skip_line(start, end);
    else if (is_space(**start))
      (*start)++;
    else
      return;
  }
}
00130 
00131 
00132 static char *
00133 parsed_string (const char *start, const char *end)
00134 {
00135   char *result = NULL;
00136   int   len;
00137 
00138   len = end - start;
00139   if (len > 0) {
00140     result = NEW(len + 1, char);
00141     memcpy(result, start, len);
00142     result[len] = '\0';
00143   }
00144 
00145   return result;
00146 }
00147 
/*
 * Scan an optionally signed decimal number: [+-] digits [ '.' digits ].
 *
 * Leading white-space and comments are skipped first.  The matched text is
 * returned as a newly allocated string and *start is moved just past it.
 * NULL is returned when nothing at all was matched; *start has still been
 * moved past the leading white-space in that case.
 *
 * NOTE: the scan does not require any digit, so a lone "+", "-" or "." is
 * returned as-is.
 */
char *
parse_number (char **start, char *end)
{
  char *number, *p;

  skip_white(start, end);
  p = *start;
  if (p < end && (*p == '+' || *p == '-'))
    p++;
  /* <ctype.h> functions need an unsigned char value; plain char may be
   * negative for bytes >= 0x80. */
  while (p < end && isdigit((unsigned char) *p))
    p++;
  if (p < end && *p == '.') {
    p++;
    while (p < end && isdigit((unsigned char) *p))
      p++;
  }
  number = parsed_string(*start, p);

  *start = p;
  return number;
}
00169 
/*
 * Scan a run of decimal digits after skipping leading white-space.
 *
 * The digits are returned as a newly allocated string and *start is moved
 * just past them.  NULL is returned when no digit was found; *start has
 * still been moved past the leading white-space in that case.
 */
char *
parse_unsigned (char **start, char *end)
{
  char *number, *p;

  skip_white(start, end);
  for (p = *start; p < end; p++) {
    /* Cast: isdigit() is undefined for negative plain-char values. */
    if (!isdigit((unsigned char) *p))
      break;
  }
  number = parsed_string(*start, p);

  *start = p;
  return number;
}
00185 
/*
 * Scan the longest prefix of [*start, end) made only of characters listed
 * in valid_chars.
 *
 * The prefix is returned as a newly allocated string and *start is moved
 * just past it; NULL is returned when the prefix is empty.  Leading
 * white-space is NOT skipped here.
 */
static char *
parse_gen_ident (char **start, char *end, const char *valid_chars)
{
  char *ident, *p;

  /* No skip_white(start, end)? */
  for (p = *start; p < end; p++) {
    /* strchr() also matches the terminating NUL of valid_chars, so a NUL
     * byte in the input has to be rejected explicitly. */
    if (*p == '\0' || !strchr(valid_chars, *p))
      break;
  }
  ident = parsed_string(*start, p);

  *start = p;
  return ident;
}
00201 
/*
 * Scan an identifier at *start (no white-space skipping).
 *
 * Letters, digits and the punctuation listed below are accepted.  Returns
 * a newly allocated copy of the identifier, or NULL if none was found;
 * *start is moved past the scanned text.
 */
char *
parse_ident (char **start, char *end)
{
  static const char allowed[] =
    "!\"#$&'*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";

  return parse_gen_ident(start, end, allowed);
}
00210 
/*
 * Scan a "value" identifier at *start (no white-space skipping).
 *
 * Same as parse_ident() except for the character set: '/' is accepted and
 * '=' is not.  Returns a newly allocated copy, or NULL if nothing matched;
 * *start is moved past the scanned text.
 */
char *
parse_val_ident (char **start, char *end)
{
  static const char allowed[] =
    "!\"#$&'*+,-./0123456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";

  return parse_gen_ident(start, end, allowed);
}
00219 
/*
 * Scan an optional "@identifier".
 *
 * When *start points at '@', the '@' is consumed and the identifier that
 * follows is returned via parse_ident() (which may itself return NULL).
 * Otherwise nothing is consumed and NULL is returned.
 */
char *
parse_opt_ident (char **start, char *end)
{
  if (*start >= end || **start != '@')
    return NULL;

  (*start)++;
  return parse_ident(start, end);
}
00230 
00231 #define DDIGITS_MAX 10
00232 pdf_obj *
00233 parse_pdf_number (char **pp, char *endptr)
00234 {
00235   char    *p;
00236   unsigned long ipart = 0, dpart = 0;
00237   int      nddigits = 0, sign = 1;
00238   int      has_dot = 0;
00239   static double ipot[DDIGITS_MAX+1] = {
00240     1.0,
00241     0.1,
00242     0.01,
00243     0.001,
00244     0.0001,
00245     0.00001,
00246     0.000001,
00247     0.0000001,
00248     0.00000001,
00249     0.000000001,
00250     0.0000000001
00251   };
00252 
00253   p = *pp;
00254   skip_white(&p, endptr);
00255   if (p >= endptr ||
00256       (!isdigit(p[0]) && p[0] != '.' &&
00257        p[0] != '+' && p[0] != '-')) {
00258     WARN("Could not find a numeric object.");
00259     return NULL;
00260   }
00261 
00262   if (p[0] == '-') {
00263     if (p + 1 >= endptr) {
00264       WARN("Could not find a numeric object.");
00265       return NULL;
00266     }
00267     sign = -1;
00268     p++;
00269   } else if (p[0] == '+') {
00270     if (p + 1 >= endptr) {
00271       WARN("Could not find a numeric object.");
00272       return NULL;
00273     }
00274     sign =  1;
00275     p++;
00276   }
00277 
00278   while (p < endptr && !istokensep(p[0])) {
00279     if (p[0] == '.') {
00280       if (has_dot) { /* Two dots */
00281        WARN("Could not find a numeric object.");
00282        return NULL;
00283       } else {
00284        has_dot = 1;
00285       }
00286     } else if (isdigit(p[0])) {
00287       if (has_dot) {
00288        if (nddigits == DDIGITS_MAX && pdf_obj_get_verbose() > 1) {
00289          WARN("Number with more than %d fractional digits.", DDIGITS_MAX);
00290        } else if (nddigits < DDIGITS_MAX) {
00291          dpart = dpart * 10 + p[0] - '0';
00292          nddigits++;
00293        } /* Ignore decimal digits more than DDIGITS_MAX */
00294       } else {
00295        ipart = ipart * 10 + p[0] - '0';
00296       }
00297     } else {
00298       WARN("Could not find a numeric object.");
00299       return NULL;
00300     }
00301     p++;
00302   }
00303 
00304   *pp = p;
00305   return pdf_new_number((double) sign * (((double ) ipart) + dpart * ipot[nddigits]));
00306 }
00307 
/*
 * PDF Name:
 *
 *  PDF-1.2+: Two hexadecimal digits preceded by a number sign.
 *
 * pn_getc() fetches the next character of a name at *pp and advances *pp.
 * A "#xx" escape is decoded to its byte value.  -1 is returned for a
 * broken escape: if fewer than two bytes follow the '#', *pp jumps to
 * endptr; if the two bytes are not hex digits, all three bytes are
 * skipped.  The caller must guarantee *pp < endptr.
 *
 * NOTE(review): an unescaped byte is returned as a plain char, so where
 * char is signed a byte >= 0x80 comes back negative and parse_pdf_name()
 * reports it as invalid.
 */
static int
pn_getc (char **pp, char *endptr)
{
  int   ch = 0;
  char *p;

  p  = *pp;
  if (p[0] == '#') {
    if (p + 2 >= endptr) {
      *pp = endptr;
      return -1;
    }
    /* isxdigit() is undefined for negative plain-char values. */
    if (!isxdigit((unsigned char) p[1]) || !isxdigit((unsigned char) p[2])) {
      *pp += 3;
      return -1;
    }
    ch   = (xtoi(p[1]) << 4);
    ch  += xtoi(p[2]);
    *pp += 3;
  } else {
    ch = p[0];
    *pp += 1;
  }

  return ch;
}
00339 
/* Maximum number of bytes kept for a name object. */
#ifndef PDF_NAME_LEN_MAX
#define PDF_NAME_LEN_MAX 128
#endif

/* Maximum number of bytes kept for a string object. */
#ifndef PDF_STRING_LEN_MAX
#define PDF_STRING_LEN_MAX 65535
#endif

/* Parenthesized so the macro is safe inside larger expressions. */
#define STRING_BUFFER_SIZE (PDF_STRING_LEN_MAX+1)
/* Scratch buffer shared by the literal-string and hex-string parsers. */
static char sbuf[STRING_BUFFER_SIZE];
00350 
00351 
00352 pdf_obj *
00353 parse_pdf_name (char **pp, char *endptr)
00354 {
00355   char  name[PDF_NAME_LEN_MAX+1];
00356   int   ch, len = 0;
00357 
00358   skip_white(pp, endptr);
00359   if (*pp >= endptr || **pp != '/') {
00360     WARN("Could not find a name object.");
00361     return NULL;
00362   }
00363 
00364   (*pp)++;
00365   while (*pp < endptr && !istokensep(**pp)) {
00366     ch = pn_getc(pp, endptr);
00367     if (ch < 0 || ch > 0xff) {
00368       WARN("Invalid char in PDF name object. (ignored)");
00369     } else if (ch == 0) {
00370       WARN("Null char not allowed in PDF name object. (ignored)");
00371     } else if (len < STRING_BUFFER_SIZE) {
00372       if (len == PDF_NAME_LEN_MAX) {
00373        WARN("PDF name length too long. (>= %d bytes)", PDF_NAME_LEN_MAX);
00374       }
00375       name[len++] = ch;
00376     } else {
00377       WARN("PDF name length too long. (>= %d bytes, truncated)",
00378           STRING_BUFFER_SIZE);
00379     }
00380   }
00381   if (len < 1) {
00382     WARN("No valid name object found.");
00383     return NULL;
00384   }
00385   name[len] = '\0';
00386 
00387   return pdf_new_name(name);
00388 }
00389 
00390 pdf_obj *
00391 parse_pdf_boolean (char **pp, char *endptr)
00392 {
00393   skip_white(pp, endptr);
00394   if (*pp + 4 <= endptr &&
00395       !strncmp(*pp, "true", 4)) {
00396     if (*pp + 4 == endptr ||
00397        istokensep(*(*pp + 4))) {
00398       *pp += 4;
00399       return pdf_new_boolean(1);
00400     }
00401   } else if (*pp + 5 <= endptr &&
00402             !strncmp(*pp, "false", 5)) {
00403     if (*pp + 5 == endptr ||
00404        istokensep(*(*pp + 5))) {
00405       *pp += 5;
00406       return pdf_new_boolean(0);
00407     }
00408   }
00409 
00410   WARN("Not a boolean object.");
00411 
00412   return NULL;
00413 }
00414 
00415 pdf_obj *
00416 parse_pdf_null (char **pp, char *endptr)
00417 {
00418   skip_white(pp, endptr);
00419   if (*pp + 4 > endptr) {
00420     WARN("Not a null object.");
00421     return NULL;
00422   } else if (*pp + 4 < endptr &&
00423             !istokensep(*(*pp+4))) {
00424     WARN("Not a null object.");
00425     return NULL;
00426   } else if (!strncmp(*pp, "null", 4)) {
00427     *pp += 4;
00428     return pdf_new_null();
00429   }
00430 
00431   WARN("Not a null object.");
00432 
00433   return NULL;
00434 }
00435 
00436 /*
00437  * PDF Literal String
00438  */
#ifndef isodigit
#define isodigit(c) ((c) >= '0' && (c) <= '7')
#endif
/*
 * Decode one backslash escape inside a literal string.
 *
 * *pp must point at the backslash.  On return *pp points just past the
 * escape sequence.  The decoded byte value is returned, or -1 when the
 * escape produces no character: a backslash followed by an end-of-line
 * marker, or a backslash that is the last byte before endptr.
 *
 * Up to three octal digits are accepted; the result is reduced to 8 bits.
 * A backslash before any other character yields that character.
 */
static int
ps_getescc (char **pp, char *endptr)
{
  int   ch, i;
  char  *p;

  p = *pp + 1; /* backslash assumed. */
  if (p >= endptr) {
    /* Dangling backslash at the very end of the input: there is nothing
     * to decode and p[0] must not be read. */
    *pp = p;
    return -1;
  }

  switch (p[0]) {
  case 'n': ch = '\n'; p++; break;
  case 'r': ch = '\r'; p++; break;
  case 't': ch = '\t'; p++; break;
  case 'b': ch = '\b'; p++; break;
  case 'f': ch = '\f'; p++; break;

    /*
     * An end-of-line marker preceded by a backslash must be ignored.
     */
  case '\n':
    ch = -1;
    p++;
    break;
  case '\r':
    ch = -1;
    p++;
    if (p < endptr && p[0] == '\n')
      p++;
    break;
  default:
    if (p[0] == '\\' ||
        p[0] == '('  || p[0] == ')') {
      ch = p[0];
      p++;
    } else if (isodigit(p[0])) {
      ch = 0;
      /* Don't forget isodigit() is a macro. */
      for (i = 0; i < 3 &&
             p < endptr && isodigit(p[0]); i++) {
        ch = (ch << 3) + (p[0] - '0');
        p++;
      }
      ch = (ch & 0xff); /* Ignore overflow. */
    } else {
      ch = ((unsigned char) p[0]); /* Ignore only backslash. */
      p++;
    }
  }

  *pp = p;
  return ch;
}
00492 
00493 static pdf_obj *
00494 parse_pdf_literal_string (char **pp, char *endptr)
00495 {
00496   int    ch, op_count = 0, len = 0;
00497   char  *p;
00498 
00499   p = *pp;
00500 
00501   skip_white(&p, endptr);
00502 
00503   if (p >= endptr || p[0] != '(')
00504     return NULL;
00505 
00506   p++;
00507 
00508   /* The carriage return (CR, 0x0d) and line feed (LF, 0x0a) characters,
00509    * also called newline characters, are treated as end-of-line (EOL)
00510    * markers. The combination of a carriage return followed immediately
00511    * by a line feed is treated as one EOL marker.
00512    * [PDF Reference, 6th ed., version 1.7, p. 50] */
00513 
00514   /* If an end-of-line marker appears within a literal string
00515    * without a preceding backslash, the result is equivalent to
00516    * \n (regardless of whether the end-of-line marker was
00517    * a carriage return, a line feed, or both).
00518    * [PDF Reference, 6th ed., version 1.7, p. 55] */
00519 
00520   while (p < endptr) {
00521 
00522     ch = p[0];
00523 
00524     if (ch == ')' && op_count < 1)
00525       break;
00526 
00527 #ifndef PDF_PARSE_STRICT
00528     if (parser_state.tainted) {
00529       if (p + 1 < endptr && (ch & 0x80)) {
00530        if (len + 2 >= PDF_STRING_LEN_MAX) {
00531          WARN("PDF string length too long. (limit: %ld)",
00532               PDF_STRING_LEN_MAX);
00533          return NULL;
00534        }
00535        sbuf[len++] = p[0];
00536        sbuf[len++] = p[1];
00537        p += 2;
00538        continue;
00539       }
00540     }
00541 #endif /* !PDF_PARSE_STRICT */
00542 
00543     if (len + 1 >= PDF_STRING_LEN_MAX) {
00544       WARN("PDF string length too long. (limit: %ld)",
00545           PDF_STRING_LEN_MAX);
00546       return NULL;
00547     }
00548 
00549     switch (ch) {
00550     case '\\':
00551       ch = ps_getescc(&p, endptr);
00552       if (ch >= 0)
00553        sbuf[len++] = (ch & 0xff);
00554       break;
00555     case '\r':
00556       p++;
00557       if (p < endptr && p[0] == '\n')
00558        p++;
00559       sbuf[len++] = '\n';
00560       break;
00561     default:
00562       if (ch == '(')
00563        op_count++;
00564       else if (ch == ')')
00565        op_count--;
00566       sbuf[len++] = ch;
00567       p++;
00568       break;
00569     }
00570   }
00571 
00572   if (op_count > 0 ||
00573       p >= endptr  || p[0] != ')') {
00574     WARN("Unbalanced parens/truncated PDF literal string.");
00575     return NULL;
00576   }
00577 
00578   *pp = p + 1;
00579   return pdf_new_string(sbuf, len);
00580 }
00581 
00582 /*
00583  * PDF Hex String
00584  */
/*
 * Convert one hexadecimal digit character to its value (0..15).
 * Returns -1 when ch is not one of 0-9, A-F or a-f.
 */
static int
xtoi (char ch)
{
  int value = -1;

  if ('0' <= ch && ch <= '9')
    value = ch - '0';
  else if ('A' <= ch && ch <= 'F')
    value = ch - 'A' + 10;
  else if ('a' <= ch && ch <= 'f')
    value = ch - 'a' + 10;

  return value;
}
00597 
00598 static pdf_obj *
00599 parse_pdf_hex_string (char **pp, char *endptr)
00600 {
00601   char  *p;
00602   long   len;
00603 
00604   p = *pp;
00605 
00606   skip_white(&p, endptr);
00607   if (p >= endptr || p[0] != '<')
00608     return NULL;
00609 
00610   p++;
00611 
00612   len = 0;
00613   /*
00614    * PDF Reference does not describe how to treat invalid char.
00615    * Zero is appended if final hex digit is missing.
00616    */
00617   while (p < endptr && p[0] != '>' && len < PDF_STRING_LEN_MAX) {
00618     int  ch;
00619 
00620     skip_white(&p, endptr);
00621     if (p >= endptr || p[0] == '>')
00622       break;
00623 
00624     ch = (xtoi(p[0]) << 4);
00625     p++;
00626 
00627     skip_white(&p, endptr);
00628     if (p < endptr && p[0] != '>') {
00629       ch += xtoi(p[0]);
00630       p++;
00631     }
00632     sbuf[len++] = (ch & 0xff);
00633   }
00634 
00635   if (p >= endptr) {
00636     WARN("Premature end of input hex string.");
00637     return NULL;
00638   } else if (p[0] != '>') {
00639     WARN("PDF string length too long. (limit: %ld)", PDF_STRING_LEN_MAX);
00640     return NULL;
00641   }
00642 
00643   *pp = p + 1;
00644   return pdf_new_string(sbuf, len);
00645 }
00646 
00647 pdf_obj *
00648 parse_pdf_string (char **pp, char *endptr)
00649 {
00650   skip_white(pp, endptr);
00651   if (*pp + 2 <= endptr) {
00652     if (**pp == '(')
00653       return parse_pdf_literal_string(pp, endptr);
00654     else if (**pp == '<' &&
00655             (*(*pp + 1) == '>' || isxdigit(*(*pp + 1)))) {
00656       return parse_pdf_hex_string(pp, endptr);
00657     }
00658   }
00659 
00660   WARN("Could not find a string object.");
00661 
00662   return NULL;
00663 }
00664 
00665 #ifndef PDF_PARSE_STRICT
00666 pdf_obj *
00667 parse_pdf_tainted_dict (char **pp, char *endptr)
00668 {
00669   pdf_obj *result;
00670 
00671   parser_state.tainted = 1;
00672   result  = parse_pdf_dict(pp, endptr, NULL);
00673   parser_state.tainted = 0;
00674 
00675   return result;
00676 }
00677 #else /* PDF_PARSE_STRICT */
00678 pdf_obj *
00679 parse_pdf_tainted_dict (char **pp, char *endptr)
00680 {
00681   return parse_pdf_dict(pp, endptr, NULL);
00682 }
00683 #endif /* !PDF_PARSE_STRICT */
00684 
00685 pdf_obj *
00686 parse_pdf_dict (char **pp, char *endptr, pdf_file *pf)
00687 {
00688   pdf_obj *result = NULL;
00689   char    *p;
00690 
00691   p = *pp;
00692 
00693   skip_white(&p, endptr);
00694 
00695   /* At least four letter <<>>. */
00696   if (p + 4 > endptr ||
00697       p[0] != '<'    || p[1] != '<') {
00698     return NULL;
00699   }
00700   p += 2;
00701 
00702   result = pdf_new_dict();
00703 
00704   skip_white(&p, endptr);
00705   while (p < endptr && p[0] != '>') {
00706     pdf_obj *key, *value;
00707 
00708     skip_white(&p, endptr);
00709     key = parse_pdf_name(&p, endptr);
00710     if (!key) {
00711       WARN("Could not find a key in dictionary object.");
00712       pdf_release_obj(result);
00713       return NULL;
00714     }
00715 
00716     skip_white(&p, endptr);
00717 
00718     value = parse_pdf_object(&p, endptr, pf);
00719     if (!value) {
00720       pdf_release_obj(key); 
00721       pdf_release_obj(value);
00722       pdf_release_obj(result);
00723       WARN("Could not find a value in dictionary object.");
00724       return NULL;
00725     }
00726     pdf_add_dict(result, key, value);
00727 
00728     skip_white(&p, endptr);
00729   }
00730 
00731   if (p + 2 > endptr ||
00732       p[0] != '>'    || p[1] != '>') {
00733     WARN("Syntax error: Dictionary object ended prematurely.");
00734     pdf_release_obj(result);
00735     return NULL;
00736   }
00737 
00738   *pp = p + 2; /* skip >> */
00739   return result;
00740 }
00741 
00742 pdf_obj *
00743 parse_pdf_array (char **pp, char *endptr, pdf_file *pf)
00744 {
00745   pdf_obj *result;
00746   char    *p;
00747 
00748   p = *pp;
00749 
00750   skip_white(&p, endptr);
00751   if (p + 2 > endptr || p[0] != '[') {
00752     WARN("Could not find an array object.");
00753     return NULL;
00754   }
00755 
00756   result = pdf_new_array();
00757 
00758   p++;
00759   skip_white(&p, endptr);
00760 
00761   while (p < endptr && p[0] != ']') {
00762     pdf_obj *elem;
00763 
00764     elem = parse_pdf_object(&p, endptr, pf);
00765     if (!elem) {
00766       pdf_release_obj(result); 
00767       WARN("Could not find a valid object in array object.");
00768       return NULL;
00769     }
00770     pdf_add_array(result, elem);
00771 
00772     skip_white(&p, endptr);
00773   }
00774 
00775   if (p >= endptr || p[0] != ']') {
00776     WARN("Array object ended prematurely.");
00777     pdf_release_obj(result);
00778     return NULL;
00779   }
00780 
00781   *pp = p + 1; /* skip ] */
00782   return result;
00783 }
00784 
00785 static pdf_obj *
00786 parse_pdf_stream (char **pp, char *endptr, pdf_obj *dict, pdf_file *pf)
00787 {
00788   pdf_obj *result = NULL;
00789   char    *p;
00790   pdf_obj *stream_dict;
00791   long     stream_length;
00792 
00793   p = *pp;
00794   skip_white(&p, endptr);
00795   if (p + 6 > endptr ||
00796       strncmp(p, "stream", 6)) {
00797     return NULL;
00798   }
00799   p += 6;
00800 
00801   /* The keyword stream that follows the stream dictionary
00802    * should be followed by an end-of-line marker consisting of
00803    * either a carriage return (0x0D;\r) and a line feed (0x0A;\n)
00804    * or just a line feed, and not by a carriage return alone.
00805    * [PDF Reference, 6th ed., version 1.7, pp. 60-61] */
00806 
00807   /* Notice that TeX translates an end-of-line marker to a single space. */
00808   if (p < endptr && p[0] == '\n') {
00809     p++;
00810   } else if (p + 1 < endptr &&
00811              (p[0] == '\r' && p[1] == '\n')) {
00812     p += 2;
00813   }
00814 
00815   /* Stream length */
00816   {
00817     pdf_obj *tmp, *tmp2;
00818 
00819     tmp = pdf_lookup_dict(dict, "Length");
00820  
00821     if (tmp != NULL) {
00822       tmp2 = pdf_deref_obj(tmp);
00823       if (pdf_obj_typeof(tmp2) != PDF_NUMBER)
00824         stream_length = -1;
00825       else {
00826         stream_length = (long) pdf_number_value(tmp2);
00827       }
00828       pdf_release_obj(tmp2);
00829     }
00830     else {
00831       return NULL;
00832     }
00833   }
00834 
00835   
00836   if (stream_length < 0 ||
00837       p + stream_length > endptr)
00838     return NULL;
00839 
00840   /*
00841    * If Filter is not applied, set STREAM_COMPRESS flag.
00842    * Should we use filter for ASCIIHexEncode/ASCII85Encode-ed streams?
00843    */
00844   {
00845     pdf_obj *filters;
00846 
00847     filters = pdf_lookup_dict(dict, "Filter");
00848     if (!filters && stream_length > 10) {
00849       result = pdf_new_stream(STREAM_COMPRESS);
00850     } else {
00851       result = pdf_new_stream(0);
00852     }
00853   }
00854 
00855   stream_dict = pdf_stream_dict(result);
00856   pdf_merge_dict(stream_dict, dict);
00857 
00858   pdf_add_stream(result, p, stream_length);
00859   p += stream_length;
00860 
00861   /* Check "endsteam" */
00862   {
00863     /* It is recommended that there be an end-of-line marker
00864      * after the data and before endstream; this marker is not included
00865      * in the stream length. 
00866      * [PDF Reference, 6th ed., version 1.7, pp. 61] */
00867     if (p < endptr && p[0] == '\r')
00868       p++;
00869     if (p < endptr && p[0] == '\n')
00870       p++;
00871 
00872     if (p + 9 > endptr ||
00873         memcmp(p, "endstream", 9)) {
00874       pdf_release_obj(result);
00875       return NULL;
00876     }
00877     p += 9;
00878   }
00879 
00880   *pp = p;
00881   return  result;
00882 }
00883 
00884 #ifndef PDF_PARSE_STRICT
00885 
00886 /* PLEASE REMOVE THIS */
00887 #include "specials.h"
00888 
00889 /* This is not PDF indirect reference. */
00890 static pdf_obj *
00891 parse_pdf_reference (char **start, char *end)
00892 {
00893   pdf_obj *result = NULL;
00894   char    *name;
00895 
00896   SAVE(*start, end);
00897 
00898   skip_white(start, end);
00899   name = parse_opt_ident(start, end);
00900   if (name) {
00901     result = spc_lookup_reference(name);
00902     if (!result) {
00903       WARN("Could not find the named reference (@%s).", name);
00904       DUMP_RESTORE(*start, end);
00905     }
00906     RELEASE(name);
00907   } else {
00908     WARN("Could not find a reference name.");
00909     DUMP_RESTORE(*start, end);
00910     result = NULL;
00911   }
00912 
00913   return result;
00914 }
00915 #endif /* !PDF_PARSE_STRICT */
00916 
00917 static pdf_obj *
00918 try_pdf_reference (char *start, char *end, char **endptr, pdf_file *pf)
00919 {
00920   unsigned long id = 0;
00921   unsigned short gen = 0;
00922 
00923   ASSERT(pf);
00924 
00925   if (endptr)
00926     *endptr = start;
00927 
00928   skip_white(&start, end);
00929   if (start > end - 5 || !isdigit(*start)) {
00930     return NULL;
00931   }
00932   while (!is_space(*start)) {
00933     if (start >= end || !isdigit(*start)) {
00934       return NULL;
00935     }
00936     id = id * 10 + (*start - '0');
00937     start++;
00938   }
00939 
00940   skip_white(&start, end);
00941   if (start >= end || !isdigit(*start))
00942     return NULL;
00943   while (!is_space(*start)) {
00944     if (start >= end || !isdigit(*start))
00945       return NULL;
00946     gen = gen * 10 + (*start - '0');
00947     start++;
00948   }
00949 
00950   skip_white(&start, end);
00951   if (start >= end  || *start != 'R')
00952     return NULL;
00953   start++;
00954   if (!PDF_TOKEN_END(start, end))
00955     return NULL;
00956     
00957   if (endptr)
00958     *endptr = start;
00959 
00960   return pdf_new_indirect(pf, id, gen);
00961 }
00962 
00963 pdf_obj *
00964 parse_pdf_object (char **pp, char *endptr, pdf_file *pf)
00965 /* If pf is NULL, then indirect references are not allowed */
00966 {
00967   pdf_obj *result = NULL;
00968   char    *nextptr;
00969 
00970   skip_white(pp, endptr);
00971   if (*pp >= endptr) {
00972     WARN("Could not find any valid object.");
00973     return NULL;
00974   }
00975 
00976   switch (**pp) {
00977 
00978   case '<': 
00979 
00980     if (*(*pp + 1) != '<') {
00981       result = parse_pdf_hex_string(pp, endptr);
00982     } else {
00983       pdf_obj *dict;
00984 
00985       result = parse_pdf_dict(pp, endptr, pf);
00986       skip_white(pp, endptr);
00987       if ( result &&
00988           *pp <= endptr - 15 &&
00989           !memcmp(*pp, "stream", 6)) {
00990         dict   = result;
00991         result = parse_pdf_stream(pp, endptr, dict, pf);
00992         pdf_release_obj(dict);
00993       }
00994     }
00995 
00996     break;
00997   case '(':
00998     result = parse_pdf_string(pp, endptr);
00999     break;
01000   case '[':
01001     result = parse_pdf_array(pp, endptr, pf);
01002     break;
01003   case '/':
01004     result = parse_pdf_name(pp, endptr);
01005     break;
01006   case 'n':
01007     result = parse_pdf_null(pp, endptr);
01008     break;
01009   case 't': case 'f':
01010     result = parse_pdf_boolean(pp, endptr);
01011     break;
01012   case '+': case '-': case '.':
01013     result = parse_pdf_number(pp, endptr);
01014     break;
01015   case '0': case '1': case '2': case '3': case '4':
01016   case '5': case '6': case '7': case '8': case '9':
01017 
01018     /*
01019      * If pf != NULL, then we are parsing a PDF file,
01020      * and indirect references are allowed.
01021      */
01022     if (pf && (result = try_pdf_reference(*pp, endptr, &nextptr, pf))) {
01023       *pp = nextptr;
01024     } else {
01025       result = parse_pdf_number(pp, endptr);
01026     }
01027     break;
01028 
01029   case '@':
01030 
01031 #ifndef PDF_PARSE_STRICT
01032     result = parse_pdf_reference(pp, endptr);
01033 #endif /* !PDF_PARSE_STRICT */
01034     break;
01035 
01036   default:
01037     WARN("Unknown PDF object type.");
01038     result = NULL;
01039   }
01040 
01041   return result;
01042 }
01043