Back to index

php5  5.3.10
ascmagic.c
Go to the documentation of this file.
00001 /*
00002  * Copyright (c) Ian F. Darwin 1986-1995.
00003  * Software written by Ian F. Darwin and others;
00004  * maintained 1995-present by Christos Zoulas and others.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  * 1. Redistributions of source code must retain the above copyright
00010  *    notice immediately at the beginning of the file, without modification,
00011  *    this list of conditions, and the following disclaimer.
00012  * 2. Redistributions in binary form must reproduce the above copyright
00013  *    notice, this list of conditions and the following disclaimer in the
00014  *    documentation and/or other materials provided with the distribution.
00015  *
00016  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00017  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00018  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00019  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
00020  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00021  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00022  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00023  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00024  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00025  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00026  * SUCH DAMAGE.
00027  */
00028 /*
00029  * ASCII magic -- file types that we know based on keywords
00030  * that can appear anywhere in the file.
00031  *
00032  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
00033  * to handle character codes other than ASCII on a unified basis.
00034  */
00035 
00036 #include "file.h"
00037 
00038 #ifndef       lint
00039 FILE_RCSID("@(#)$File: ascmagic.c,v 1.75 2009/02/03 20:27:51 christos Exp $")
00040 #endif /* lint */
00041 
00042 #include "magic.h"
00043 #include <string.h>
00044 #include <memory.h>
00045 #include <ctype.h>
00046 #include <stdlib.h>
00047 #ifdef HAVE_UNISTD_H
00048 #include <unistd.h>
00049 #endif
00050 #include "names.h"
00051 
00052 #define MAXLINELEN 300      /* longest sane line length */
00053 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
00054                 || (x) == 0x85 || (x) == '\f')
00055 
00056 private int ascmatch(const unsigned char *, const unichar *, size_t);
00057 private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
00058 private size_t trim_nuls(const unsigned char *, size_t);
00059 
00060 /*
00061  * Undo the NUL-termination kindly provided by process()
00062  * but leave at least one byte to look at
00063  */
00064 private size_t
00065 trim_nuls(const unsigned char *buf, size_t nbytes)
00066 {
00067        while (nbytes > 1 && buf[nbytes - 1] == '\0')
00068               nbytes--;
00069 
00070        return nbytes;
00071 }
00072 
00073 protected int
00074 file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
00075 {
00076        unichar *ubuf = NULL;
00077        size_t ulen;
00078        int rv = 1;
00079 
00080        const char *code = NULL;
00081        const char *code_mime = NULL;
00082        const char *type = NULL;
00083 
00084        if (ms->flags & MAGIC_APPLE)
00085               return 0;
00086 
00087        nbytes = trim_nuls(buf, nbytes);
00088 
00089        /* If file doesn't look like any sort of text, give up. */
00090        if (file_encoding(ms, buf, nbytes, &ubuf, &ulen, &code, &code_mime,
00091            &type) == 0) {
00092               rv = 0;
00093               goto done;
00094        }
00095 
00096        rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code, 
00097            type);
00098 
00099  done:
00100        if (ubuf)
00101               free(ubuf);
00102 
00103        return rv;
00104 }
00105 
00106 protected int
00107 file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
00108     size_t nbytes, unichar *ubuf, size_t ulen, const char *code,
00109     const char *type)
00110 {
00111        unsigned char *utf8_buf = NULL, *utf8_end;
00112        size_t mlen, i;
00113        const struct names *p;
00114        int rv = -1;
00115        int mime = ms->flags & MAGIC_MIME;
00116 
00117        const char *subtype = NULL;
00118        const char *subtype_mime = NULL;
00119 
00120        int has_escapes = 0;
00121        int has_backspace = 0;
00122        int seen_cr = 0;
00123 
00124        int n_crlf = 0;
00125        int n_lf = 0;
00126        int n_cr = 0;
00127        int n_nel = 0;
00128 
00129        size_t last_line_end = (size_t)-1;
00130        int has_long_lines = 0;
00131 
00132        if (ms->flags & MAGIC_APPLE)
00133               return 0;
00134 
00135        nbytes = trim_nuls(buf, nbytes);
00136 
00137        /* If we have fewer than 2 bytes, give up. */
00138        if (nbytes <= 1) {
00139               rv = 0;
00140               goto done;
00141        }
00142 
00143        /* Convert ubuf to UTF-8 and try text soft magic */
00144        /* malloc size is a conservative overestimate; could be
00145           improved, or at least realloced after conversion. */
00146        mlen = ulen * 6;
00147        utf8_buf = emalloc(mlen);
00148 
00149        if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL)
00150               goto done;
00151        if ((rv = file_softmagic(ms, utf8_buf, (size_t)(utf8_end - utf8_buf),
00152            TEXTTEST)) != 0)
00153               goto done;
00154        else
00155               rv = -1;
00156 
00157        /* look for tokens from names.h - this is expensive! */
00158        if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
00159               goto subtype_identified;
00160 
00161        i = 0;
00162        while (i < ulen) {
00163               size_t end;
00164 
00165               /* skip past any leading space */
00166               while (i < ulen && ISSPC(ubuf[i]))
00167                      i++;
00168               if (i >= ulen)
00169                      break;
00170 
00171               /* find the next whitespace */
00172               for (end = i + 1; end < nbytes; end++)
00173                      if (ISSPC(ubuf[end]))
00174                             break;
00175 
00176               /* compare the word thus isolated against the token list */
00177               for (p = names; p < names + NNAMES; p++) {
00178                      if (ascmatch((const unsigned char *)p->name, ubuf + i,
00179                          end - i)) {
00180                             subtype = types[p->type].human;
00181                             subtype_mime = types[p->type].mime;
00182                             goto subtype_identified;
00183                      }
00184               }
00185 
00186               i = end;
00187        }
00188 
00189 subtype_identified:
00190 
00191        /* Now try to discover other details about the file. */
00192        for (i = 0; i < ulen; i++) {
00193               if (ubuf[i] == '\n') {
00194                      if (seen_cr)
00195                             n_crlf++;
00196                      else
00197                             n_lf++;
00198                      last_line_end = i;
00199               } else if (seen_cr)
00200                      n_cr++;
00201 
00202               seen_cr = (ubuf[i] == '\r');
00203               if (seen_cr)
00204                      last_line_end = i;
00205 
00206               if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
00207                      n_nel++;
00208                      last_line_end = i;
00209               }
00210 
00211               /* If this line is _longer_ than MAXLINELEN, remember it. */
00212               if (i > last_line_end + MAXLINELEN)
00213                      has_long_lines = 1;
00214 
00215               if (ubuf[i] == '\033')
00216                      has_escapes = 1;
00217               if (ubuf[i] == '\b')
00218                      has_backspace = 1;
00219        }
00220 
00221        /* Beware, if the data has been truncated, the final CR could have
00222           been followed by a LF.  If we have HOWMANY bytes, it indicates
00223           that the data might have been truncated, probably even before
00224           this function was called. */
00225        if (seen_cr && nbytes < HOWMANY)
00226               n_cr++;
00227 
00228        if (strcmp(type, "binary") == 0) {
00229               rv = 0;
00230               goto done;
00231        }
00232        if (mime) {
00233               if ((mime & MAGIC_MIME_TYPE) != 0) {
00234                      if (subtype_mime) {
00235                             if (file_printf(ms, "%s", subtype_mime) == -1)
00236                                    goto done;
00237                      } else {
00238                             if (file_printf(ms, "text/plain") == -1)
00239                                    goto done;
00240                      }
00241               }
00242        } else {
00243               if (file_printf(ms, "%s", code) == -1)
00244                      goto done;
00245 
00246               if (subtype) {
00247                      if (file_printf(ms, " %s", subtype) == -1)
00248                             goto done;
00249               }
00250 
00251               if (file_printf(ms, " %s", type) == -1)
00252                      goto done;
00253 
00254               if (has_long_lines)
00255                      if (file_printf(ms, ", with very long lines") == -1)
00256                             goto done;
00257 
00258               /*
00259                * Only report line terminators if we find one other than LF,
00260                * or if we find none at all.
00261                */
00262               if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
00263                   (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
00264                      if (file_printf(ms, ", with") == -1)
00265                             goto done;
00266 
00267                      if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
00268                             if (file_printf(ms, " no") == -1)
00269                                    goto done;
00270                      } else {
00271                             if (n_crlf) {
00272                                    if (file_printf(ms, " CRLF") == -1)
00273                                           goto done;
00274                                    if (n_cr || n_lf || n_nel)
00275                                           if (file_printf(ms, ",") == -1)
00276                                                  goto done;
00277                             }
00278                             if (n_cr) {
00279                                    if (file_printf(ms, " CR") == -1)
00280                                           goto done;
00281                                    if (n_lf || n_nel)
00282                                           if (file_printf(ms, ",") == -1)
00283                                                  goto done;
00284                             }
00285                             if (n_lf) {
00286                                    if (file_printf(ms, " LF") == -1)
00287                                           goto done;
00288                                    if (n_nel)
00289                                           if (file_printf(ms, ",") == -1)
00290                                                  goto done;
00291                             }
00292                             if (n_nel)
00293                                    if (file_printf(ms, " NEL") == -1)
00294                                           goto done;
00295                      }
00296 
00297                      if (file_printf(ms, " line terminators") == -1)
00298                             goto done;
00299               }
00300 
00301               if (has_escapes)
00302                      if (file_printf(ms, ", with escape sequences") == -1)
00303                             goto done;
00304               if (has_backspace)
00305                      if (file_printf(ms, ", with overstriking") == -1)
00306                             goto done;
00307        }
00308        rv = 1;
00309 done:
00310        if (utf8_buf)
00311               efree(utf8_buf);
00312 
00313        return rv;
00314 }
00315 
00316 private int
00317 ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
00318 {
00319        size_t i;
00320 
00321        for (i = 0; i < ulen; i++) {
00322               if (s[i] != us[i])
00323                      return 0;
00324        }
00325 
00326        if (s[i])
00327               return 0;
00328        else
00329               return 1;
00330 }
00331 
00332 /*
00333  * Encode Unicode string as UTF-8, returning pointer to character
00334  * after end of string, or NULL if an invalid character is found.
00335  */
00336 private unsigned char *
00337 encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
00338 {
00339        size_t i;
00340        unsigned char *end = buf + len;
00341 
00342        for (i = 0; i < ulen; i++) {
00343               if (ubuf[i] <= 0x7f) {
00344                      if (end - buf < 1)
00345                             return NULL;
00346                      *buf++ = (unsigned char)ubuf[i];
00347               } else if (ubuf[i] <= 0x7ff) {
00348                      if (end - buf < 2)
00349                             return NULL;
00350                      *buf++ = (unsigned char)((ubuf[i] >> 6) + 0xc0);
00351                      *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
00352               } else if (ubuf[i] <= 0xffff) {
00353                      if (end - buf < 3)
00354                             return NULL;
00355                      *buf++ = (unsigned char)((ubuf[i] >> 12) + 0xe0);
00356                      *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
00357                      *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
00358               } else if (ubuf[i] <= 0x1fffff) {
00359                      if (end - buf < 4)
00360                             return NULL;
00361                      *buf++ = (unsigned char)((ubuf[i] >> 18) + 0xf0);
00362                      *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
00363                      *buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
00364                      *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
00365               } else if (ubuf[i] <= 0x3ffffff) {
00366                      if (end - buf < 5)
00367                             return NULL;
00368                      *buf++ = (unsigned char)((ubuf[i] >> 24) + 0xf8);
00369                      *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
00370                      *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
00371                      *buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
00372                      *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
00373               } else if (ubuf[i] <= 0x7fffffff) {
00374                      if (end - buf < 6)
00375                             return NULL;
00376                      *buf++ = (unsigned char)((ubuf[i] >> 30) + 0xfc);
00377                      *buf++ = (unsigned char)(((ubuf[i] >> 24) & 0x3f) + 0x80);
00378                      *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
00379                      *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
00380                      *buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
00381                      *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
00382               } else /* Invalid character */
00383                      return NULL;
00384        }
00385 
00386        return buf;
00387 }