Back to index

php5  5.3.10
Defines | Functions | Variables
encoding.c File Reference
#include "file.h"
#include "magic.h"
#include <string.h>
#include <memory.h>
#include <stdlib.h>

Go to the source code of this file.

Defines

#define F   0 /* character never appears in text */
#define T   1 /* character appears in plain ASCII text */
#define I   2 /* character appears in ISO-8859 text */
#define X   3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

Functions

private int looks_ascii (const unsigned char *, size_t, unichar *, size_t *)
private int looks_utf8_with_BOM (const unsigned char *, size_t, unichar *, size_t *)
private int looks_ucs16 (const unsigned char *, size_t, unichar *, size_t *)
private int looks_latin1 (const unsigned char *, size_t, unichar *, size_t *)
private int looks_extended (const unsigned char *, size_t, unichar *, size_t *)
private void from_ebcdic (const unsigned char *, size_t, unsigned char *)
protected int file_encoding (struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type)
protected int file_looks_utf8 (const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)

Variables

private char text_chars [256]
private unsigned char ebcdic_to_ascii []

Define Documentation

#define F   0 /* character never appears in text */

Definition at line 177 of file encoding.c.

#define I   2 /* character appears in ISO-8859 text */

Definition at line 179 of file encoding.c.

#define T   1 /* character appears in plain ASCII text */

Definition at line 178 of file encoding.c.

#define X   3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

Definition at line 180 of file encoding.c.


Function Documentation

protected int file_encoding ( struct magic_set *  ms,
const unsigned char *  buf,
size_t  nbytes,
unichar **  ubuf,
size_t *  ulen,
const char **  code,
const char **  code_mime,
const char **  type 
)

Definition at line 62 of file encoding.c.

{
       /*
        * Classify the nbytes bytes at buf as some flavour of text or as
        * binary.  The detectors are tried in a fixed order: plain ASCII,
        * UTF-8 with BOM, UTF-8, UTF-16 (either byte order), ISO-8859,
        * non-ISO extended ASCII, and finally ASCII / ISO-8859 again on a
        * copy of the data translated by from_ebcdic().  The first detector
        * that accepts the data sets *code and *code_mime.
        *
        * *ubuf receives a calloc()ed array of nbytes + 1 elements that this
        * function never frees, so releasing it is left to the caller.
        * *ulen is whatever the last detector that ran stored in it.
        *
        * Returns 1 with *type = "text" when a detector accepted the data,
        * or 0 with *type = "binary" when none did (*code and *code_mime are
        * not written in that case).  An allocation failure is reported via
        * file_oomem() and the function still returns 1, with *ubuf == NULL
        * and *ulen == 0.
        */
       size_t mlen;
       int rv = 1, ucs_type;
       unsigned char *nbuf = NULL;

       /*
        * Put the buffer outputs into a defined state before anything can
        * fail, so the out-of-memory exits below never leave them
        * indeterminate.
        */
       *ubuf = NULL;
       *ulen = 0;
       *type = "text";

       /*
        * Hand calloc() the element count and the element size separately so
        * that it can reject a product that does not fit in size_t; mlen is
        * kept only for the file_oomem() report.
        */
       mlen = (nbytes + 1) * sizeof(nbuf[0]);
       if ((nbuf = CAST(unsigned char *,
           calloc(nbytes + 1, sizeof(nbuf[0])))) == NULL) {
              file_oomem(ms, mlen);
              goto done;
       }
       mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
       if ((*ubuf = CAST(unichar *,
           calloc(nbytes + 1, sizeof((*ubuf)[0])))) == NULL) {
              file_oomem(ms, mlen);
              goto done;
       }

       if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
              *code = "ASCII";
              *code_mime = "us-ascii";
       } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
              *code = "UTF-8 Unicode (with BOM)";
              *code_mime = "utf-8";
       } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
              /* > 1: at least one multi-byte sequence was seen */
              *code = "UTF-8 Unicode";
              *code_mime = "utf-8";
       } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
              /* looks_ucs16() returns 1 for little-endian, 2 for big-endian */
              if (ucs_type == 1) {
                     *code = "Little-endian UTF-16 Unicode";
                     *code_mime = "utf-16le";
              } else {
                     *code = "Big-endian UTF-16 Unicode";
                     *code_mime = "utf-16be";
              }
       } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
              *code = "ISO-8859";
              *code_mime = "iso-8859-1";
       } else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
              *code = "Non-ISO extended-ASCII";
              *code_mime = "unknown-8bit";
       } else {
              /* Last resort: translate from EBCDIC and test the result. */
              from_ebcdic(buf, nbytes, nbuf);

              if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
                     *code = "EBCDIC";
                     *code_mime = "ebcdic";
              } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
                     *code = "International EBCDIC";
                     *code_mime = "ebcdic";
              } else { /* Doesn't look like text at all */
                     rv = 0;
                     *type = "binary";
              }
       }

 done:
       /* free(NULL) is a no-op, so no guard is needed on the failure path. */
       free(nbuf);

       return rv;
}

Here is the call graph for this function:

Here is the caller graph for this function:

protected int file_looks_utf8 ( const unsigned char *  buf,
size_t  nbytes,
unichar *  ubuf,
size_t *  ulen 
)

Definition at line 275 of file encoding.c.

{
       /*
        * Scan buf as UTF-8.  When ubuf is non-NULL the decoded characters
        * are stored in it and counted in *ulen; when it is NULL neither
        * ubuf nor ulen is touched.
        *
        * Returns:
        *   -1  the data is not valid UTF-8 (stray continuation byte, a
        *       0xFE/0xFF lead byte, or a bad byte inside a sequence);
        *    0  valid, but some 7-bit byte is not marked T in text_chars;
        *    1  valid and entirely 7-bit;
        *    2  valid with at least one complete multi-byte sequence.
        *
        * A sequence cut short by the end of the buffer is not an error:
        * scanning simply stops and the partial character is dropped.
        */
       size_t pos;
       int seen_multibyte = 0, seen_ctrl = 0;

       if (ubuf)
              *ulen = 0;

       for (pos = 0; pos < nbytes; pos++) {
              const unsigned char lead = buf[pos];
              unsigned int bit;
              int following, k;
              unichar c;

              if ((lead & 0x80) == 0) {      /* 0xxxxxxx is plain ASCII */
                     /*
                      * Valid UTF-8 is still rejected in the end if it
                      * contains odd control characters.
                      */
                     if (text_chars[lead] != T)
                            seen_ctrl = 1;

                     if (ubuf)
                            ubuf[(*ulen)++] = lead;
                     continue;
              }

              if ((lead & 0x40) == 0)        /* 10xxxxxx never 1st byte */
                     return -1;

              /*
               * 11xxxxxx begins a sequence.  Walk down from bit 5 to the
               * first clear bit: each set bit passed adds one continuation
               * byte, and the bits below the clear one are the payload.
               */
              bit = 0x20;
              following = 1;
              while (bit >= 0x02 && (lead & bit) != 0) {
                     bit >>= 1;
                     following++;
              }
              if (bit < 0x02)                /* 1111111x is never valid */
                     return -1;
              c = lead & (bit - 1);

              for (k = 0; k < following; k++) {
                     pos++;
                     if (pos >= nbytes)
                            goto done;

                     /* every continuation byte must be 10xxxxxx */
                     if ((buf[pos] & 0xc0) != 0x80)
                            return -1;

                     c = (c << 6) + (buf[pos] & 0x3f);
              }

              if (ubuf)
                     ubuf[(*ulen)++] = c;
              seen_multibyte = 1;
       }
done:
       if (seen_ctrl)
              return 0;
       return seen_multibyte ? 2 : 1;
}

Here is the caller graph for this function:

private void from_ebcdic ( const unsigned char *  buf,
size_t  nbytes,
unsigned char *  out 
)

Definition at line 477 of file encoding.c.

{
       /*
        * Copy nbytes bytes from buf to out, replacing each byte with its
        * entry in the ebcdic_to_ascii table.
        */
       size_t pos = 0;

       while (pos < nbytes) {
              out[pos] = ebcdic_to_ascii[buf[pos]];
              pos++;
       }
}

Here is the caller graph for this function:

private int looks_ascii ( const unsigned char *  buf,
size_t  nbytes,
unichar *  ubuf,
size_t *  ulen 
)

Definition at line 205 of file encoding.c.

{
       /*
        * Accept the buffer only if every byte is classified T in
        * text_chars.  Accepted bytes are copied into ubuf and counted in
        * *ulen as the scan proceeds.  Returns 1 on acceptance and 0 at the
        * first byte that is not T (bytes before it stay in ubuf).
        */
       size_t pos = 0;

       *ulen = 0;

       while (pos < nbytes) {
              const unsigned char ch = buf[pos++];

              if (text_chars[ch] != T)
                     return 0;

              ubuf[*ulen] = ch;
              ++*ulen;
       }

       return 1;
}

Here is the caller graph for this function:

private int looks_extended ( const unsigned char *  buf,
size_t  nbytes,
unichar *  ubuf,
size_t *  ulen 
)

Definition at line 244 of file encoding.c.

{
       /*
        * Accept the buffer only if every byte is classified T, I or X in
        * text_chars.  Accepted bytes are copied into ubuf and counted in
        * *ulen as the scan proceeds.  Returns 1 on acceptance and 0 at the
        * first byte of any other class.
        */
       size_t pos = 0;

       *ulen = 0;

       while (pos < nbytes) {
              const unsigned char ch = buf[pos++];
              const int cls = text_chars[ch];

              if (!(cls == T || cls == I || cls == X))
                     return 0;

              ubuf[*ulen] = ch;
              ++*ulen;
       }

       return 1;
}

Here is the caller graph for this function:

private int looks_latin1 ( const unsigned char *  buf,
size_t  nbytes,
unichar *  ubuf,
size_t *  ulen 
)

Definition at line 225 of file encoding.c.

{
       /*
        * Accept the buffer only if every byte is classified T or I in
        * text_chars.  Accepted bytes are copied into ubuf and counted in
        * *ulen as the scan proceeds.  Returns 1 on acceptance and 0 at the
        * first byte of any other class.
        */
       size_t pos = 0;

       *ulen = 0;

       while (pos < nbytes) {
              const unsigned char ch = buf[pos++];
              const int cls = text_chars[ch];

              if (!(cls == T || cls == I))
                     return 0;

              ubuf[*ulen] = ch;
              ++*ulen;
       }

       return 1;
}

Here is the caller graph for this function:

private int looks_ucs16 ( const unsigned char *  buf,
size_t  nbytes,
unichar *  ubuf,
size_t *  ulen 
)

Definition at line 356 of file encoding.c.

{
       /*
        * Detect UTF-16 text by its byte-order mark.  Returns 0 when the
        * buffer is shorter than two bytes, has no BOM, or contains a unit
        * that is rejected below; otherwise 1 for little-endian (FF FE) and
        * 2 for big-endian (FE FF).  Decoded 16-bit units are stored in ubuf
        * and counted in *ulen; *ulen is only reset once a BOM was found.
        * A trailing odd byte is ignored.
        */
       size_t pos;
       int bigend;

       if (nbytes < 2)
              return 0;

       if (buf[0] == 0xfe && buf[1] == 0xff)
              bigend = 1;
       else if (buf[0] == 0xff && buf[1] == 0xfe)
              bigend = 0;
       else
              return 0;

       *ulen = 0;

       for (pos = 2; pos + 1 < nbytes; pos += 2) {
              const size_t slot = (*ulen)++;
              const unsigned char hi = bigend ? buf[pos] : buf[pos + 1];
              const unsigned char lo = bigend ? buf[pos + 1] : buf[pos];

              /*
               * XXX each 16-bit unit is stored on its own; characters
               * above 65536 are not handled properly yet.
               */
              ubuf[slot] = lo + 256 * hi;

              /* 0xfffe in the data is treated as "not text" */
              if (ubuf[slot] == 0xfffe)
                     return 0;
              /* 7-bit values must be characters marked T in text_chars */
              if (ubuf[slot] < 128 &&
                  text_chars[(size_t)ubuf[slot]] != T)
                     return 0;
       }

       return bigend ? 2 : 1;
}

Here is the caller graph for this function:

private int looks_utf8_with_BOM ( const unsigned char *  buf,
size_t  nbytes,
unichar *  ubuf,
size_t *  ulen 
)

Definition at line 346 of file encoding.c.

{
       /*
        * Requires more than three bytes and the EF BB BF signature at the
        * start; anything else yields -1.  Otherwise the bytes after the
        * signature are handed to file_looks_utf8() and its result is
        * returned unchanged.
        */
       if (nbytes <= 3 || buf[0] != 0xef || buf[1] != 0xbb || buf[2] != 0xbf)
              return -1;

       return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
}

Here is the call graph for this function:

Here is the caller graph for this function:


Variable Documentation

private unsigned char ebcdic_to_ascii[]
Initial value:
 {
  /*
   * Byte translation table used by from_ebcdic(): the input byte value is
   * the index and the entry is the byte written to the output.  Rows hold
   * 16 entries each; the trailing comment gives the index range of the row.
   */
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,  /* 0x0X */
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,  /* 0x1X */
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,  /* 0x2X */
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,  /* 0x3X */
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',  /* 0x4X */
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',  /* 0x5X */
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',  /* 0x6X */
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',  /* 0x7X */
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,  /* 0x8X */
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,  /* 0x9X */
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,  /* 0xaX */
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,  /* 0xbX */
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,  /* 0xcX */
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,  /* 0xdX */
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,  /* 0xeX */
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255   /* 0xfX */
}

Definition at line 419 of file encoding.c.

private char text_chars[256]
Initial value:
 {
       /* Class of each byte value (F, T, I or X as defined above); 16 entries per row.  0x0X: T at 0x07 BEL, 0x08 BS, 0x09 HT, 0x0a LF, 0x0c FF, 0x0d CR */
       F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
       /* 0x1X: the only T is 0x1b ESC */
       F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
       T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X: 0x7f DEL is F */
       /* 0x8X: 0x85 NEL is classed T, the rest of 0x80-0x9f is X */
       X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
       X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
       I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
}

Definition at line 182 of file encoding.c.