Back to index

php5  5.3.10
html.c
Go to the documentation of this file.
00001 /*
00002    +----------------------------------------------------------------------+
00003    | PHP Version 5                                                        |
00004    +----------------------------------------------------------------------+
00005    | Copyright (c) 1997-2012 The PHP Group                                |
00006    +----------------------------------------------------------------------+
00007    | This source file is subject to version 3.01 of the PHP license,      |
00008    | that is bundled with this package in the file LICENSE, and is        |
00009    | available through the world-wide-web at the following url:           |
00010    | http://www.php.net/license/3_01.txt                                  |
00011    | If you did not receive a copy of the PHP license and are unable to   |
00012    | obtain it through the world-wide-web, please send a note to          |
00013    | license@php.net so we can mail you a copy immediately.               |
00014    +----------------------------------------------------------------------+
00015    | Authors: Rasmus Lerdorf <rasmus@php.net>                             |
00016    |          Jaakko Hyvätti <jaakko.hyvatti@iki.fi>                      |
00017    |          Wez Furlong <wez@thebrainroom.com>                          |
00018    +----------------------------------------------------------------------+
00019 */
00020 
00021 /* $Id: html.c 321634 2012-01-01 13:15:04Z felipe $ */
00022 
00023 /*
00024  * HTML entity resources:
00025  *
00026  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
00027  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
00028  * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
00029  *
00030  * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
00031  * 
00032  */
00033 
00034 #include "php.h"
00035 #if PHP_WIN32
00036 #include "config.w32.h"
00037 #else
00038 #include <php_config.h>
00039 #endif
00040 #include "html.h"
00041 #include "php_string.h"
00042 #include "SAPI.h"
00043 #if HAVE_LOCALE_H
00044 #include <locale.h>
00045 #endif
00046 #if HAVE_LANGINFO_H
00047 #include <langinfo.h>
00048 #endif
00049 
00050 #if HAVE_MBSTRING
00051 # include "ext/mbstring/mbstring.h"
00052 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
00053 #endif
00054 
/* Identifiers for the character sets that have entity tables in this file.
 * cs_terminator is listed first (so it has the value 0) and is used as the
 * sentinel charset of the last entry in entity_map[]. */
00055 enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252,
00056                                      cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, 
00057                                      cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
00058                                      cs_cp1251, cs_8859_5, cs_cp866, cs_macroman
00059                                    };
/* Element type of the entity name tables below: a constant pointer to a
 * constant C string. Table slots may also hold NULL. */
00060 typedef const char *const entity_table_t;
00061 
00062 /* codepage 1252 is a Windows extension to iso-8859-1.
 * This table holds the entity names for the cp1252-specific codes
 * 0x80-0x9f (the range registered for it in entity_map[]); slot i
 * describes code 0x80 + i. Names are stored without the surrounding
 * '&' and ';'. A NULL slot has no entity name listed for that code. */
00063 static entity_table_t ent_cp_1252[] = {
00064        "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
00065        "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
00066        NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
00067        "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
00068        "oelig", NULL, NULL, "Yuml" 
00069 };
00070 
/* Entity names for codes 0xa0-0xff of ISO-8859-1; slot i describes code
 * 0xa0 + i. entity_map[] registers this same table for cs_8859_1, for the
 * upper half of cs_cp1252, for code points 0xa0-0xff of cs_utf_8, and for
 * cs_big5, cs_gb2312, cs_big5hkscs, cs_sjis and cs_eucjp. */
00071 static entity_table_t ent_iso_8859_1[] = {
00072        "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
00073        "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
00074        "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
00075        "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
00076        "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
00077        "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
00078        "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
00079        "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
00080        "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
00081        "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
00082        "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
00083        "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
00084        "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
00085        "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
00086        "uuml", "yacute", "thorn", "yuml"
00087 };
00088 
/* Entity names for codes 0xa0-0xff of ISO-8859-15; slot i describes code
 * 0xa0 + i. The layout follows ent_iso_8859_1 except for the slots that
 * hold "euro", "Scaron", "scaron", "OElig", "oelig" and "Yuml", and the two
 * NULL slots (Zcaron / zcaron) for which no entity name is listed. */
00089 static entity_table_t ent_iso_8859_15[] = {
00090        "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
00091        "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
00092        "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
00093        "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
00094        "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
00095        "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
00096        "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
00097        "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
00098        "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
00099        "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
00100        "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
00101        "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
00102        "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
00103        "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
00104        "uuml", "yacute", "thorn", "yuml"
00105 };
00106 
/* Entity names for Unicode code points 338-402 (registered for cs_utf_8 in
 * entity_map[]); slot i describes code point 338 + i. The inline comments
 * give the code point at which each group of rows starts. NULL slots have
 * no entity name listed. */
00107 static entity_table_t ent_uni_338_402[] = {
00108        /* 338 (0x0152) */
00109        "OElig", "oelig", NULL, NULL, NULL, NULL,
00110        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00111        /* 352 (0x0160) */
00112        "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
00113        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00114        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00115        /* 376 (0x0178) */
00116        "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00117        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00118        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00119        /* 400 (0x0190) */
00120        NULL, NULL, "fnof"
00121 };
00122 
/* Entity names for Unicode code points 710-732 (spacing modifier letters,
 * registered for cs_utf_8 in entity_map[]); slot i describes code point
 * 710 + i. Only "circ" (710) and "tilde" (732) are named. */
00123 static entity_table_t ent_uni_spacing[] = {
00124        /* 710 */
00125        "circ",
00126        /* 711 - 730 */
00127        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00128        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00129        /* 731 - 732 */
00130        NULL, "tilde"
00131 };
00132 
/* Entity names for Unicode code points 913-982 (Greek letters and symbols,
 * registered for cs_utf_8 in entity_map[]); slot i describes code point
 * 913 + i. NULL slots are code points with no entity name listed. */
00133 static entity_table_t ent_uni_greek[] = {
00134        /* 913 */
00135        "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
00136        "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
00137        NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
00138        /* 938 - 944 are not mapped */
00139        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00140        "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
00141        "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
00142        "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
00143        /* 970 - 976 are not mapped */
00144        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00145        "thetasym", "upsih",
00146        NULL, NULL, NULL,
00147        "piv"
00148 };
00149 
/* Entity names for Unicode code points 8194-8260 (general punctuation,
 * registered for cs_utf_8 in entity_map[]); slot i describes code point
 * 8194 + i. The inline comments give the code point at which the following
 * rows start. NULL slots have no entity name listed. */
00150 static entity_table_t ent_uni_punct[] = {
00151        /* 8194 */
00152        "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
00153        "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
00154        NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
00155        /* 8216 */
00156        "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
00157        "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
00158        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
00159        /* 8242 */
00160        "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
00161        NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
00162        "frasl"
00163 };
00164 
/* Single-entry table for Unicode code point 8364 (the range 8364-8364 is
 * registered for cs_utf_8 in entity_map[]). */
00165 static entity_table_t ent_uni_euro[] = {
00166        "euro"
00167 };
00168 
/* Entity names for Unicode code points 8465-8501 (registered for cs_utf_8
 * in entity_map[]); slot i describes code point 8465 + i. Each inline
 * comment gives the code point of the named entry that follows it. */
00169 static entity_table_t ent_uni_8465_8501[] = {
00170        /* 8465 */
00171        "image", NULL, NULL, NULL, NULL, NULL, NULL,
00172        /* 8472 */
00173        "weierp", NULL, NULL, NULL,
00174        /* 8476 */
00175        "real", NULL, NULL, NULL, NULL, NULL,
00176        /* 8482 */
00177        "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00178        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00179        /* 8501 */
00180        "alefsym",
00181 };
00182 
/* Entity names for Unicode code points 8592-9002 (arrows, mathematical
 * operators and miscellaneous technical symbols, registered for cs_utf_8 in
 * entity_map[]); slot i describes code point 8592 + i. Rows hold 8 slots
 * each and every inline comment gives the code point of the first slot of
 * the rows that follow it. NULL slots have no entity name listed. */
00183 static entity_table_t ent_uni_8592_9002[] = {
00184        /* 8592 (0x2190) */
00185        "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
00186        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00187        /* 8608 (0x21a0) */
00188        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00189        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00190        /* 8624 (0x21b0) */
00191        NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
00192        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00193        /* 8640 (0x21c0) */
00194        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00195        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00196        /* 8656 (0x21d0) */
00197        "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL,
00198        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00199        /* 8672 (0x21e0) */
00200        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00201        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00202        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00203        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00204        /* 8704 (0x2200) */
00205        "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla",
00206        "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod",
00207        /* 8720 (0x2210) */
00208        NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast",
00209        NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL,
00210        /* 8736 (0x2220) */
00211        "ang", NULL, NULL, NULL, NULL, NULL, NULL, "and",
00212        "or", "cap", "cup", "int", NULL, NULL, NULL, NULL,
00213        /* 8752 (0x2230) */
00214        NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL,
00215        NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL,
00216        /* 8768 (0x2240) */
00217        NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL,
00218        "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00219        /* 8784 (0x2250) */
00220        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00221        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00222        /* 8800 (0x2260) */
00223        "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL,
00224        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00225        /* 8816 (0x2270) */
00226        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00227        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00228        /* 8832 (0x2280) */
00229        NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe",
00230        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00231        /* 8848 (0x2290) */
00232        NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
00233        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00234        /* 8864 (0x22a0) */
00235        NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
00236        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00237        /* 8880 (0x22b0) */
00238        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00239        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00240        /* 8896 (0x22c0) */
00241        NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
00242        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00243        /* 8912 (0x22d0) */
00244        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00245        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00246        /* 8928 (0x22e0) */
00247        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00248        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00249        /* 8944 (0x22f0) */
00250        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00251        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00252        /* 8960 (0x2300) */
00253        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00254        "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
00255        /* 8976 (0x2310) */
00256        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00257        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00258        /* 8992 (0x2320) */
00259        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00260        NULL, "lang", "rang"
00261 };
00262 
/* Single-entry table for Unicode code point 9674 (the range 9674-9674 is
 * registered for cs_utf_8 in entity_map[]). */
00263 static entity_table_t ent_uni_9674[] = {
00264        /* 9674 */
00265        "loz"
00266 };
00267 
/* Entity names for Unicode code points 9824-9830 (card suits, registered
 * for cs_utf_8 in entity_map[]); slot i describes code point 9824 + i. */
00268 static entity_table_t ent_uni_9824_9830[] = {
00269        /* 9824 */
00270        "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
00271 };
00272 
/* Entries for KOI8-R codes 0xa3-0xff (range registered for cs_koi8r in
 * entity_map[]); slot i describes code 0xa3 + i. Every non-NULL entry is a
 * decimal numeric character reference body ("#" followed by the Unicode
 * code point) rather than a named entity. NULL slots have no entry. */
00273 static entity_table_t ent_koi8r[] = {
00274        "#1105", /* "jo "*/
00275        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
00276        NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
00277        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
00278        "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", 
00279        "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", 
00280        "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", 
00281        "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", 
00282        "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", 
00283        "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", 
00284        "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", 
00285        "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
00286        "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", 
00287        "#1066"
00288 };
00289 
/* Entries for Windows-1251 codes 0x80-0xff (range registered for cs_cp1251
 * in entity_map[]); slot i describes code 0x80 + i. Entries are either an
 * entity name or a decimal numeric character reference body ("#" followed
 * by the Unicode code point). The single NULL slot is code 0x98, for which
 * no entry is listed.
 * Codes 0x91-0x94 are the curly quotes U+2018, U+2019, U+201C and U+201D,
 * i.e. #8216, #8217, #8220 and #8221. */
00290 static entity_table_t ent_cp_1251[] = {
00291        "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
00292        "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
00293        "#1035", "#1039", "#1106", "#8216", "#8217", "#8220", "#8221",
00294        "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
00295        "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
00296        "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
00297        "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
00298        "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
00299        "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
00300        "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
00301        "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
00302        "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
00303        "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
00304        "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
00305        "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
00306        "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
00307        "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
00308        "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
00309        "#1103"
00310 };
00311 
/* Entries for ISO-8859-5 codes 0xc0-0xff (range registered for cs_8859_5 in
 * entity_map[]); slot i describes code 0xc0 + i. Entries are decimal
 * numeric character reference bodies ("#" followed by the Unicode code
 * point) or, for the section sign, an entity name.
 * The Cyrillic letters run consecutively from U+0420 (#1056), with two
 * exceptions defined by the charset: 0xf0 is NUMERO SIGN (U+2116, #8470)
 * and 0xfd is SECTION SIGN (U+00A7, "sect"). */
00312 static entity_table_t ent_iso_8859_5[] = {
00313        "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
00314        "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
00315        "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
00316        "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
00317        "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
00318        "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
00319        "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#8470",
00320        "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
00321        "#1112", "#1113", "#1114", "#1115", "#1116", "sect", "#1118",
00322        "#1119"
00323 };
00324 
/* Entries for code page 866 codes 0xc0-0xff (range registered for cs_cp866
 * in entity_map[]); slot i describes code 0xc0 + i. All entries are decimal
 * numeric character reference bodies ("#" followed by the Unicode code
 * point): the first 32 are in the #94xx-#96xx range (box-drawing and block
 * characters), followed by Cyrillic letters and a few symbols. */
00325 static entity_table_t ent_cp_866[] = {
00326 
00327        "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", 
00328        "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", 
00329        "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", 
00330        "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", 
00331        "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", 
00332        "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", 
00333        "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", 
00334        "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", 
00335        "#176", "#8729", "#183", "#8730", "#8470", "#164",  "#9632", 
00336        "#160"
00337 };
00338 
00339 /* MacRoman has a couple of low-ascii chars that need mapping too */
00340 /* Vertical tab (ASCII 11) is often used to store line breaks inside */
00341 /* DB exports, this mapping changes it to a space */
/* The table covers codes 0x0b-0xff (range registered for cs_macroman in
 * entity_map[]); slot i describes code 0x0b + i. Slot 0 is the vertical
 * tab ("sp"); the other low-ASCII entries are "quot", "amp", "lt" and "gt";
 * the block from "Auml" onwards starts at code 0x80. Entries beginning
 * with '#' are numeric character reference bodies (decimal, or hexadecimal
 * after "#x").
 * NOTE(review): "sp" is not one of the HTML 4 named entities - confirm that
 * this is the intended output for the vertical tab. */
00342 static entity_table_t ent_macroman[] = {
00343        "sp", NULL, NULL, NULL,
00344        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00345        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00346        NULL, NULL, NULL, NULL, NULL, "quot", NULL,
00347        NULL, NULL, "amp", NULL, NULL, NULL, NULL,
00348        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00349        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00350        NULL, NULL, NULL, "lt", NULL, "gt", NULL,
00351        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00352        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00353        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00354        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00355        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00356        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00357        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00358        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00359        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
00360        NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
00361        "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
00362        "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
00363        "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
00364        "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
00365        "cent", "pound", "sect", "bull", "para", "szlig", "reg",
00366        "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
00367        "infin", "plusmn", "le", "ge", "yen", "micro", "part",
00368        "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
00369        "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
00370        "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
00371        "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
00372        "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
00373        "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
00374        "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
00375        "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
00376        "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
00377        "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
00378        "#733", "#731", "#711"
00379 };
00380 
/* Describes one entity table: the charset it belongs to, the inclusive
 * range of character codes it covers and the table itself. The table is
 * expected to hold (endchar - basechar + 1) slots, slot i describing code
 * basechar + i. */
00381 struct html_entity_map {
00382        enum entity_charset charset;       /* charset identifier */
00383        unsigned int basechar;                    /* char code at start of table */
00384        unsigned int endchar;                     /* last char code in the table */
00385        entity_table_t *table;                    /* the table of mappings */
00386 };
00387 
/* Registry of all entity tables: each row binds a charset to one table and
 * the inclusive code range that table covers. A charset may appear in
 * several rows (cs_cp1252 has two, cs_utf_8 has ten). Ranges are written in
 * hex for the single-byte charsets and in decimal for the Unicode code
 * point ranges of cs_utf_8. The array ends with a row whose charset is
 * cs_terminator; its remaining fields are zero-initialized. */
00388 static const struct html_entity_map entity_map[] = {
00389        { cs_cp1252,         0x80, 0x9f, ent_cp_1252 },
00390        { cs_cp1252,         0xa0, 0xff, ent_iso_8859_1 },
00391        { cs_8859_1,         0xa0, 0xff, ent_iso_8859_1 },
00392        { cs_8859_15,               0xa0, 0xff, ent_iso_8859_15 },
00393        { cs_utf_8,          0xa0, 0xff, ent_iso_8859_1 },
00394        { cs_utf_8,          338,  402,  ent_uni_338_402 },
00395        { cs_utf_8,          710,  732,  ent_uni_spacing },
00396        { cs_utf_8,          913,  982,  ent_uni_greek },
00397        { cs_utf_8,          8194, 8260, ent_uni_punct },
00398        { cs_utf_8,          8364, 8364, ent_uni_euro }, 
00399        { cs_utf_8,          8465, 8501, ent_uni_8465_8501 },
00400        { cs_utf_8,          8592, 9002, ent_uni_8592_9002 },
00401        { cs_utf_8,          9674, 9674, ent_uni_9674 },
00402        { cs_utf_8,          9824, 9830, ent_uni_9824_9830 },
00403        { cs_big5,                  0xa0, 0xff, ent_iso_8859_1 },
00404        { cs_gb2312,         0xa0, 0xff, ent_iso_8859_1 },
00405        { cs_big5hkscs,      0xa0, 0xff, ent_iso_8859_1 },
00406        { cs_sjis,                  0xa0, 0xff, ent_iso_8859_1 },
00407        { cs_eucjp,                 0xa0, 0xff, ent_iso_8859_1 },
00408        { cs_koi8r,              0xa3, 0xff, ent_koi8r },
00409        { cs_cp1251,         0x80, 0xff, ent_cp_1251 },
00410        { cs_8859_5,         0xc0, 0xff, ent_iso_8859_5 },
00411        { cs_cp866,              0xc0, 0xff, ent_cp_866 },
00412        { cs_macroman,              0x0b, 0xff, ent_macroman },
00413        { cs_terminator }
00414 };
00415 
/* Maps charset names (including aliases and bare code page numbers such as
 * "1252" or "866") to entity_charset values. The array ends with a row
 * whose codeset is NULL. How names are compared (e.g. case sensitivity) is
 * decided by the lookup code, which is not part of this section. */
00416 static const struct {
00417        const char *codeset;
00418        enum entity_charset charset;
00419 } charset_map[] = {
00420        { "ISO-8859-1",      cs_8859_1 },
00421        { "ISO8859-1",              cs_8859_1 },
00422        { "ISO-8859-15",     cs_8859_15 },
00423        { "ISO8859-15",      cs_8859_15 },
00424        { "utf-8",                  cs_utf_8 },
00425        { "cp1252",          cs_cp1252 },
00426        { "Windows-1252",    cs_cp1252 },
00427        { "1252",           cs_cp1252 }, 
00428        { "BIG5",                   cs_big5 },
00429        { "950",            cs_big5 },
00430        { "GB2312",                 cs_gb2312 },
00431        { "936",            cs_gb2312 },
00432        { "BIG5-HKSCS",             cs_big5hkscs },
00433        { "Shift_JIS",              cs_sjis },
00434        { "SJIS",            cs_sjis },
00435        { "932",            cs_sjis },
00436        { "EUCJP",           cs_eucjp },
00437        { "EUC-JP",                 cs_eucjp },
00438        { "KOI8-R",         cs_koi8r },
00439        { "koi8-ru",        cs_koi8r },
00440        { "koi8r",          cs_koi8r },
00441        { "cp1251",         cs_cp1251 },
00442        { "Windows-1251",   cs_cp1251 },
00443        { "win-1251",       cs_cp1251 },
00444        { "iso8859-5",      cs_8859_5 },
00445        { "iso-8859-5",     cs_8859_5 },
00446        { "cp866",          cs_cp866 },
00447        { "866",            cs_cp866 },    
00448        { "ibm866",         cs_cp866 },
00449        { "MacRoman",       cs_macroman },
00450        { NULL }
00451 };
00452 
/* The basic markup-significant characters and their replacements. Unlike
 * the charset tables, each entity string here is complete (with '&' and
 * ';') and entitylen is its length in bytes. flags holds the quote-style
 * flag associated with the row (ENT_HTML_QUOTE_DOUBLE / ENT_HTML_QUOTE_SINGLE,
 * defined outside this section) or 0 for rows without one. The single quote
 * appears twice ("&#039;" and "&#39;"), and '&' itself is not listed. The
 * array ends with an all-zero row. */
00453 static const struct {
00454        unsigned short charcode;
00455        char *entity;
00456        int entitylen;
00457        int flags;
00458 } basic_entities[] = {
00459        { '"', "&quot;",     6,     ENT_HTML_QUOTE_DOUBLE },
00460        { '\'',       "&#039;",     6,     ENT_HTML_QUOTE_SINGLE },
00461        { '\'',       "&#39;",      5,     ENT_HTML_QUOTE_SINGLE },
00462        { '<', "&lt;",              4,     0 },
00463        { '>', "&gt;",              4,     0 },
00464        { 0, NULL, 0, 0 }
00465 };
00466        
/* One character/entity pair with the entity text stored inline in a fixed
 * 8-byte buffer instead of behind a pointer. The code that fills and reads
 * this struct is outside this section; judging by the name it is presumably
 * used when decoding the basic entities - confirm against the users. */
00467 struct basic_entities_dec {
00468        unsigned short charcode;
00469        char entity[8];
00470        int entitylen;       
00471 };
00472        
/* MB_RETURN: finish reading one character and leave the enclosing function.
 * Stores pos in *newpos, NUL-terminates mbseq at index mbpos, stores mbpos
 * in *mbseqlen and returns this_char. The macro contains a `return` and
 * relies on locals/parameters named newpos, pos, mbseq, mbpos, mbseqlen and
 * this_char being in scope at the point of use. */
00473 #define MB_RETURN { \
00474                      *newpos = pos;       \
00475                      mbseq[mbpos] = '\0'; \
00476                      *mbseqlen = mbpos;   \
00477                      return this_char; }
00478                                    
/* MB_WRITE(mbchar): append one byte to the mbseq output buffer.
 * mbspace is decremented first; if that leaves it at 0 the byte is NOT
 * stored and the enclosing function returns through MB_RETURN, which keeps
 * one slot free for the terminating NUL that MB_RETURN writes. Otherwise
 * mbchar is stored at mbseq[mbpos] and mbpos advances. Relies on locals
 * named mbspace, mbseq and mbpos (plus those used by MB_RETURN). */
00479 #define MB_WRITE(mbchar) { \
00480                      mbspace--;  \
00481                      if (mbspace == 0) {      \
00482                             MB_RETURN;           \
00483                      }                        \
00484                      mbseq[mbpos++] = (mbchar); }
00485 
00486 /* skip one byte and return
 * MB_FAILURE(pos): report an invalid sequence. Sets *newpos to one past
 * the given position, sets *status to FAILURE and returns 0 from the
 * enclosing function. Relies on pointers named newpos and status being in
 * scope. The argument is parenthesized so that any expression can be
 * passed safely. */
00487 #define MB_FAILURE(pos) do { \
00488        *newpos = (pos) + 1; \
00489        *status = FAILURE; \
00490        return 0; \
00491 } while (0)
00492 
00493 #define CHECK_LEN(pos, chars_need)               \
00494        if (chars_need < 1) {                                          \
00495               if((str_len - (pos)) < chars_need) {      \
00496                      *newpos = pos;                                          \
00497                      *status = FAILURE;                               \
00498                      return 0;                                               \
00499               }                                                                     \
00500        } else {                                                              \
00501               if((str_len - (pos)) < chars_need) {      \
00502                      *newpos = pos + 1;                               \
00503                      *status = FAILURE;                               \
00504                      return 0;                                               \
00505               }                                                                     \
00506        }
00507 
00508 /* {{{ get_next_char
00509  */
00510 inline static unsigned int get_next_char(enum entity_charset charset,
00511               unsigned char * str,
00512               int str_len,
00513               int * newpos,
00514               unsigned char * mbseq,
00515               int * mbseqlen, 
00516               int *status)
00517 {
00518        int pos = *newpos;
00519        int mbpos = 0;
00520        int mbspace = *mbseqlen;
00521        unsigned int this_char = 0;
00522        unsigned char next_char;
00523 
00524        *status = SUCCESS;
00525 
00526        if (mbspace <= 0) {
00527               *mbseqlen = 0;
00528               CHECK_LEN(pos, 1);
00529               *newpos = pos + 1;
00530               return str[pos];
00531        }
00532 
00533        switch (charset) {
00534               case cs_utf_8:
00535                      {
00536                             unsigned char c;
00537                             CHECK_LEN(pos, 1);
00538                             c = str[pos];
00539                             if (c < 0x80) {
00540                                    MB_WRITE(c);
00541                                    this_char = c;
00542                                    pos++;
00543                             } else if (c < 0xc2) {
00544                                    MB_FAILURE(pos);
00545                             } else if (c < 0xe0) {
00546                                    CHECK_LEN(pos, 2);
00547                                    if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
00548                                           MB_FAILURE(pos);
00549                                    }
00550                                    this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
00551                                    if (this_char < 0x80) {
00552                                           MB_FAILURE(pos);
00553                                    }
00554                                    MB_WRITE((unsigned char)c);
00555                                    MB_WRITE((unsigned char)str[pos + 1]);
00556                                    pos += 2;
00557                             } else if (c < 0xf0) {
00558                                    CHECK_LEN(pos, 3);
00559                                    if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
00560                                           MB_FAILURE(pos);
00561                                    }
00562                                    if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
00563                                           MB_FAILURE(pos);
00564                                    }
00565                                    this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
00566                                    if (this_char < 0x800) {
00567                                           MB_FAILURE(pos);
00568                                    } else if (this_char >= 0xd800 && this_char <= 0xdfff) {
00569                                           MB_FAILURE(pos);
00570                                    }
00571                                    MB_WRITE((unsigned char)c);
00572                                    MB_WRITE((unsigned char)str[pos + 1]);
00573                                    MB_WRITE((unsigned char)str[pos + 2]);
00574                                    pos += 3;
00575                             } else if (c < 0xf5) {
00576                                    CHECK_LEN(pos, 4);
00577                                    if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
00578                                           MB_FAILURE(pos);
00579                                    }
00580                                    if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
00581                                           MB_FAILURE(pos);
00582                                    }
00583                                    if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
00584                                           MB_FAILURE(pos);
00585                                    }
00586                                    this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
00587                                    if (this_char < 0x10000 || this_char > 0x10FFFF) {
00588                                           MB_FAILURE(pos);
00589                                    }
00590                                    MB_WRITE((unsigned char)c);
00591                                    MB_WRITE((unsigned char)str[pos + 1]);
00592                                    MB_WRITE((unsigned char)str[pos + 2]);
00593                                    MB_WRITE((unsigned char)str[pos + 3]);
00594                                    pos += 4;
00595                             } else {
00596                                    MB_FAILURE(pos);
00597                             }
00598                      }
00599                      break;
00600               case cs_big5:
00601               case cs_gb2312:
00602               case cs_big5hkscs:
00603                      {
00604                             CHECK_LEN(pos, 1);
00605                             this_char = str[pos++];
00606                             /* check if this is the first of a 2-byte sequence */
00607                             if (this_char >= 0x81 && this_char <= 0xfe) {
00608                                    /* peek at the next char */
00609                                    CHECK_LEN(pos, 1);
00610                                    next_char = str[pos++];
00611                                    if ((next_char >= 0x40 && next_char <= 0x7e) ||
00612                                                  (next_char >= 0xa1 && next_char <= 0xfe)) {
00613                                           /* yes, this a wide char */
00614                                           MB_WRITE(this_char);
00615                                           MB_WRITE(next_char);
00616                                           this_char = (this_char << 8) | next_char;
00617                                    } else {
00618                                           MB_FAILURE(pos);
00619                                    }
00620                             } else {
00621                                    MB_WRITE(this_char);
00622                             }
00623                      }
00624                      break;
00625               case cs_sjis:
00626                      {
00627                             CHECK_LEN(pos, 1);
00628                             this_char = str[pos++];
00629                             /* check if this is the first of a 2-byte sequence */
00630                             if ((this_char >= 0x81 && this_char <= 0x9f) ||
00631                                    (this_char >= 0xe0 && this_char <= 0xfc)) {
00632                                    /* peek at the next char */
00633                                    CHECK_LEN(pos, 1);
00634                                    next_char = str[pos++];
00635                                    if ((next_char >= 0x40 && next_char <= 0x7e) ||
00636                                           (next_char >= 0x80 && next_char <= 0xfc))
00637                                    {
00638                                           /* yes, this a wide char */
00639                                           MB_WRITE(this_char);
00640                                           MB_WRITE(next_char);
00641                                           this_char = (this_char << 8) | next_char;
00642                                    } else {
00643                                           MB_FAILURE(pos);
00644                                    }
00645                             } else {
00646                                    MB_WRITE(this_char);
00647                             }
00648                             break;
00649                      }
00650               case cs_eucjp:
00651                      {
00652                             CHECK_LEN(pos, 1);
00653                             this_char = str[pos++];
00654                             /* check if this is the first of a multi-byte sequence */
00655                             if (this_char >= 0xa1 && this_char <= 0xfe) {
00656                                    /* peek at the next char */
00657                                    CHECK_LEN(pos, 1);
00658                                    next_char = str[pos++];
00659                                    if (next_char >= 0xa1 && next_char <= 0xfe) {
00660                                           /* yes, this a jis kanji char */
00661                                           MB_WRITE(this_char);
00662                                           MB_WRITE(next_char);
00663                                           this_char = (this_char << 8) | next_char;
00664                                    } else {
00665                                           MB_FAILURE(pos);
00666                                    }
00667                             } else if (this_char == 0x8e) {
00668                                    /* peek at the next char */
00669                                    CHECK_LEN(pos, 1);
00670                                    next_char = str[pos++];
00671                                    if (next_char >= 0xa1 && next_char <= 0xdf) {
00672                                           /* JIS X 0201 kana */
00673                                           MB_WRITE(this_char);
00674                                           MB_WRITE(next_char);
00675                                           this_char = (this_char << 8) | next_char;
00676                                    } else {
00677                                           MB_FAILURE(pos);
00678                                    }
00679                             } else if (this_char == 0x8f) {
00680                                    /* peek at the next two char */
00681                                    unsigned char next2_char;
00682                                    CHECK_LEN(pos, 2);
00683                                    next_char = str[pos];
00684                                    next2_char = str[pos + 1];
00685                                    pos += 2;
00686                                    if ((next_char >= 0xa1 && next_char <= 0xfe) &&
00687                                           (next2_char >= 0xa1 && next2_char <= 0xfe)) {
00688                                           /* JIS X 0212 hojo-kanji */
00689                                           MB_WRITE(this_char);
00690                                           MB_WRITE(next_char);
00691                                           MB_WRITE(next2_char);
00692                                           this_char = (this_char << 16) | (next_char << 8) | next2_char;
00693                                    } else {
00694                                           MB_FAILURE(pos);
00695                                    }
00696                             } else {
00697                                    MB_WRITE(this_char);
00698                             }
00699                             break;
00700                      }
00701               default:
00702                      /* single-byte charsets */
00703                      CHECK_LEN(pos, 1);
00704                      this_char = str[pos++];
00705                      MB_WRITE(this_char);
00706                      break;
00707        }
00708        MB_RETURN;
00709 }
00710 /* }}} */
00711 
00712 /* {{{ entity_charset determine_charset
00713  * returns the charset identifier based on current locale or a hint.
00714  * defaults to iso-8859-1 */
00715 static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
00716 {
00717        int i;
00718        enum entity_charset charset = cs_8859_1;
00719        int len = 0;
00720        zval *uf_result = NULL;
00721 
00722        /* Guarantee default behaviour for backwards compatibility */
00723        if (charset_hint == NULL)
00724               return cs_8859_1;
00725 
00726        if ((len = strlen(charset_hint)) != 0) {
00727               goto det_charset;
00728        }
00729 #if HAVE_MBSTRING
00730 #if !defined(COMPILE_DL_MBSTRING)
00731        /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
00732        switch (MBSTRG(current_internal_encoding)) {
00733               case mbfl_no_encoding_8859_1:
00734                      return cs_8859_1;
00735 
00736               case mbfl_no_encoding_utf8:
00737                      return cs_utf_8;
00738 
00739               case mbfl_no_encoding_euc_jp:
00740               case mbfl_no_encoding_eucjp_win:
00741                      return cs_eucjp;
00742 
00743               case mbfl_no_encoding_sjis:
00744               case mbfl_no_encoding_sjis_open:
00745               case mbfl_no_encoding_cp932:
00746                      return cs_sjis;
00747 
00748               case mbfl_no_encoding_cp1252:
00749                      return cs_cp1252;
00750 
00751               case mbfl_no_encoding_8859_15:
00752                      return cs_8859_15;
00753 
00754               case mbfl_no_encoding_big5:
00755                      return cs_big5;
00756 
00757               case mbfl_no_encoding_euc_cn:
00758               case mbfl_no_encoding_hz:
00759               case mbfl_no_encoding_cp936:
00760                      return cs_gb2312;
00761 
00762               case mbfl_no_encoding_koi8r:
00763                      return cs_koi8r;
00764 
00765               case mbfl_no_encoding_cp866:
00766                      return cs_cp866;
00767 
00768               case mbfl_no_encoding_cp1251:
00769                      return cs_cp1251;
00770 
00771               case mbfl_no_encoding_8859_5:
00772                      return cs_8859_5;
00773 
00774               default:
00775                      ;
00776        }
00777 #else
00778        {
00779               zval nm_mb_internal_encoding;
00780 
00781               ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
00782 
00783               if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
00784 
00785                      charset_hint = Z_STRVAL_P(uf_result);
00786                      len = Z_STRLEN_P(uf_result);
00787                      
00788                      if (len == 4) { /* sizeof(none|auto|pass)-1 */
00789                             if (!memcmp("pass", charset_hint, sizeof("pass") - 1) || 
00790                                 !memcmp("auto", charset_hint, sizeof("auto") - 1) || 
00791                                 !memcmp("none", charset_hint, sizeof("none") - 1)) {
00792                                    
00793                                    charset_hint = NULL;
00794                                    len = 0;
00795                             }
00796                      }
00797                      goto det_charset;
00798               }
00799        }
00800 #endif
00801 #endif
00802 
00803        charset_hint = SG(default_charset);
00804        if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
00805               goto det_charset;
00806        }
00807 
00808        /* try to detect the charset for the locale */
00809 #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
00810        charset_hint = nl_langinfo(CODESET);
00811        if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
00812               goto det_charset;
00813        }
00814 #endif
00815 
00816 #if HAVE_LOCALE_H
00817        /* try to figure out the charset from the locale */
00818        {
00819               char *localename;
00820               char *dot, *at;
00821 
00822               /* lang[_territory][.codeset][@modifier] */
00823               localename = setlocale(LC_CTYPE, NULL);
00824 
00825               dot = strchr(localename, '.');
00826               if (dot) {
00827                      dot++;
00828                      /* locale specifies a codeset */
00829                      at = strchr(dot, '@');
00830                      if (at)
00831                             len = at - dot;
00832                      else
00833                             len = strlen(dot);
00834                      charset_hint = dot;
00835               } else {
00836                      /* no explicit name; see if the name itself
00837                       * is the charset */
00838                      charset_hint = localename;
00839                      len = strlen(charset_hint);
00840               }
00841        }
00842 #endif
00843 
00844 det_charset:
00845 
00846        if (charset_hint) {
00847               int found = 0;
00848               
00849               /* now walk the charset map and look for the codeset */
00850               for (i = 0; charset_map[i].codeset; i++) {
00851                      if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
00852                             charset = charset_map[i].charset;
00853                             found = 1;
00854                             break;
00855                      }
00856               }
00857               if (!found) {
00858                      php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
00859                                    charset_hint);
00860               }
00861        }
00862        if (uf_result != NULL) {
00863               zval_ptr_dtor(&uf_result);
00864        }
00865        return charset;
00866 }
00867 /* }}} */
00868 
/* {{{ php_utf32_utf8
 * Encode the code point k into buf using the (original, up to 6-byte) UTF-8
 * scheme and append a terminating NUL.
 *
 * buf must have room for the encoded bytes plus the NUL (at most 7 bytes).
 * Returns the number of encoded bytes, not counting the NUL.
 */
size_t php_utf32_utf8(unsigned char *buf, unsigned k)
{
	size_t nbytes;      /* total length of the encoded sequence */
	unsigned lead_bits; /* marker bits of the first byte */
	size_t idx;

	if (k < 0x80) {
		/* plain ASCII is stored as-is */
		buf[0] = k;
		buf[1] = '\0';
		return 1;
	}

	if (k < 0x800) {
		nbytes = 2;
		lead_bits = 0xc0;
	} else if (k < 0x10000) {
		nbytes = 3;
		lead_bits = 0xe0;
	} else if (k < 0x200000) {
		nbytes = 4;
		lead_bits = 0xf0;
	} else if (k < 0x4000000) {
		nbytes = 5;
		lead_bits = 0xf8;
	} else {
		nbytes = 6;
		lead_bits = 0xfc;
	}

	/* fill the continuation bytes from the end, six payload bits each */
	for (idx = nbytes - 1; idx > 0; idx--) {
		buf[idx] = 0x80 | (k & 0x3f);
		k >>= 6;
	}
	/* whatever is left of k belongs in the lead byte */
	buf[0] = lead_bits | k;
	buf[nbytes] = '\0';

	return nbytes;
}
00912 /* }}} */
00913 
00914 /* {{{ php_unescape_html_entities
00915  */
00916 PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
00917 {
00918        int retlen;
00919        int j, k;
00920        char *replaced, *ret, *p, *q, *lim, *next;
00921        enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
00922        unsigned char replacement[15];
00923        int replacement_len;
00924 
00925        ret = estrndup(old, oldlen);
00926        retlen = oldlen;
00927        if (!retlen) {
00928               goto empty_source;
00929        }
00930        
00931        if (all) {
00932               /* look for a match in the maps for this charset */
00933               for (j = 0; entity_map[j].charset != cs_terminator; j++) {
00934                      if (entity_map[j].charset != charset)
00935                             continue;
00936 
00937                      for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
00938                             unsigned char entity[32];
00939                             int entity_length = 0;
00940 
00941                             if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
00942                                    continue;
00943 
00944                             entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
00945                             if (entity_length >= sizeof(entity)) {
00946                                    continue;
00947                             }
00948 
00949                             /* When we have MBCS entities in the tables above, this will need to handle it */
00950                             replacement_len = 0;
00951                             switch (charset) {
00952                                    case cs_8859_1:
00953                                    case cs_cp1252:
00954                                    case cs_8859_15:
00955                                    case cs_cp1251:
00956                                    case cs_8859_5:
00957                                    case cs_cp866:
00958                                    case cs_koi8r:
00959                                           replacement[0] = k;
00960                                           replacement[1] = '\0';
00961                                           replacement_len = 1;
00962                                           break;
00963 
00964                                    case cs_big5:
00965                                    case cs_gb2312:
00966                                    case cs_big5hkscs:
00967                                    case cs_sjis:
00968                                    case cs_eucjp:
00969                                           /* we cannot properly handle those multibyte encodings
00970                                            * with php_str_to_str. skip it. */ 
00971                                           continue;
00972 
00973                                    case cs_utf_8:
00974                                           replacement_len = php_utf32_utf8(replacement, k);
00975                                           break;
00976 
00977                                    default:
00978                                           php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
00979                                           efree(ret);
00980                                           return NULL;
00981                             }
00982 
00983                             if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
00984                                    replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
00985                                    efree(ret);
00986                                    ret = replaced;
00987                             }
00988                      }
00989               }
00990        }
00991 
00992        for (j = 0; basic_entities[j].charcode != 0; j++) {
00993 
00994               if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
00995                      continue;
00996               
00997               replacement[0] = (unsigned char)basic_entities[j].charcode;
00998               replacement[1] = '\0';
00999 
01000               if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {        
01001                      replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
01002                      efree(ret);
01003                      ret = replaced;
01004               }
01005        }
01006 
01007        /* replace numeric entities & "&amp;" */
01008        lim = ret + retlen;
01009        for (p = ret, q = ret; p < lim;) {
01010               int code;
01011 
01012               if (p[0] == '&') {
01013                      if (p + 2 < lim) {
01014                             if (p[1] == '#') {
01015                                    int invalid_code = 0;
01016 
01017                                    if (p[2] == 'x' || p[2] == 'X') {
01018                                           code = strtol(p + 3, &next, 16);
01019                                    } else {
01020                                           code = strtol(p + 2, &next, 10);
01021                                    }
01022 
01023                                    if ((code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE)) ||
01024                                           (code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE))) {
01025                                           invalid_code = 1;
01026                                    }
01027 
01028                                    if (next != NULL && *next == ';' && !invalid_code) {
01029                                           switch (charset) {
01030                                                  case cs_utf_8:
01031                                                         q += php_utf32_utf8(q, code);
01032                                                         break;
01033 
01034                                                  case cs_8859_1:
01035                                                  case cs_8859_5:
01036                                                  case cs_8859_15:
01037                                                         if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
01038                                                                invalid_code = 1;
01039                                                         } else {
01040                                                                *(q++) = code;
01041                                                         }
01042                                                         break;
01043 
01044                                                  case cs_cp1252:
01045                                                         if (code > 0xff) {
01046                                                                invalid_code = 1;
01047                                                         } else {
01048                                                                *(q++) = code;
01049                                                         }
01050                                                         break;
01051 
01052                                                  case cs_cp1251:
01053                                                  case cs_cp866:
01054                                                  case cs_big5:
01055                                                  case cs_big5hkscs:
01056                                                  case cs_sjis:
01057                                                  case cs_eucjp:
01058                                                         if (code >= 0x80) {
01059                                                                invalid_code = 1;
01060                                                         } else {
01061                                                                *(q++) = code;
01062                                                         }
01063                                                         break;
01064 
01065                                                  case cs_gb2312:
01066                                                         if (code >= 0x81) {
01067                                                                invalid_code = 1;
01068                                                         } else {
01069                                                                *(q++) = code;
01070                                                         }
01071                                                         break;
01072 
01073                                                  default:
01074                                                         /* for backwards compatilibity */
01075                                                         invalid_code = 1;
01076                                                         break;
01077                                           }
01078                                           if (invalid_code) {
01079                                                  for (; p <= next; p++) {
01080                                                         *(q++) = *p;
01081                                                  }
01082                                           }
01083                                           p = next + 1;
01084                                    } else {
01085                                           *(q++) = *(p++);     
01086                                           *(q++) = *(p++);     
01087                                    }
01088                             } else if (p + 4 < lim &&
01089                                                  p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
01090                                                  p[4] == ';') {
01091                                    *(q++) = '&';
01092                                    p += 5;
01093                             } else {
01094                                    *(q++) = *(p++);
01095                                    *(q++) = *(p++);
01096                             }
01097                      } else {
01098                             *(q++) = *(p++);     
01099                      }
01100               } else {
01101                      *(q++) = *(p++);     
01102               }
01103        }
01104        *q = '\0';
01105        retlen = (size_t)(q - ret);
01106 empty_source: 
01107        *newlen = retlen;
01108        return ret;
01109 }
01110 /* }}} */
01111 
01112 PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
01113 {
01114        return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
01115 }
01116 
01117 
01118 /* {{{ php_escape_html_entities
01119  */
01120 PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
01121 {
01122        int i, j, maxlen, len;
01123        char *replaced;
01124        enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
01125        int matches_map;
01126 
01127        maxlen = 2 * oldlen;
01128        if (maxlen < 128)
01129               maxlen = 128;
01130        replaced = emalloc (maxlen);
01131        len = 0;
01132        i = 0;
01133        while (i < oldlen) {
01134               unsigned char mbsequence[16];      /* allow up to 15 characters in a multibyte sequence */
01135               int mbseqlen = sizeof(mbsequence);
01136               int status = SUCCESS;
01137               unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
01138 
01139               if(status == FAILURE) {
01140                      /* invalid MB sequence */
01141                      if (quote_style & ENT_HTML_IGNORE_ERRORS) {
01142                             continue;
01143                      }
01144                      efree(replaced);
01145                      if(!PG(display_errors)) {
01146                             php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
01147                      }
01148                      *newlen = 0;
01149                      return STR_EMPTY_ALLOC();
01150               }
01151               matches_map = 0;
01152 
01153               if (len + 16 > maxlen)
01154                      replaced = erealloc (replaced, maxlen += 128);
01155 
01156               if (all) {
01157                      /* look for a match in the maps for this charset */
01158                      unsigned char *rep = NULL;
01159 
01160 
01161                      for (j = 0; entity_map[j].charset != cs_terminator; j++) {
01162                             if (entity_map[j].charset == charset
01163                                           && this_char >= entity_map[j].basechar
01164                                           && this_char <= entity_map[j].endchar) {
01165                                    rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
01166                                    if (rep == NULL) {
01167                                           /* there is no entity for this position; fall through and
01168                                            * just output the character itself */
01169                                           break;
01170                                    }
01171 
01172                                    matches_map = 1;
01173                                    break;
01174                             }
01175                      }
01176 
01177                      if (matches_map) {
01178                             int l = strlen(rep);
01179                             /* increase the buffer size */
01180                             if (len + 2 + l >= maxlen) {
01181                                    replaced = erealloc(replaced, maxlen += 128);
01182                             }
01183 
01184                             replaced[len++] = '&';
01185                             strlcpy(replaced + len, rep, maxlen);
01186                             len += l;
01187                             replaced[len++] = ';';
01188                      }
01189               }
01190               if (!matches_map) {  
01191                      int is_basic = 0;
01192 
01193                      if (this_char == '&') {
01194                             if (double_encode) {
01195 encode_amp:
01196                                    memcpy(replaced + len, "&amp;", sizeof("&amp;") - 1);
01197                                    len += sizeof("&amp;") - 1;
01198                             } else {
01199                                    char *e = memchr(old + i, ';', oldlen - i);
01200                                    char *s = old + i;
01201 
01202                                    if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
01203                                           goto encode_amp;
01204                                    } else {
01205                                           if (*s == '#') { /* numeric entities */
01206                                                  s++;
01207                                                  /* Hex (&#x5A;) */
01208                                                  if (*s == 'x' || *s == 'X') {
01209                                                         s++;
01210                                                         while (s < e) {
01211                                                                if (!isxdigit((int)*(unsigned char *)s++)) {
01212                                                                       goto encode_amp;
01213                                                                }
01214                                                         }
01215                                                  /* Dec (&#90;)*/
01216                                                  } else {
01217                                                         while (s < e) {
01218                                                                if (!isdigit((int)*(unsigned char *)s++)) {
01219                                                                       goto encode_amp;
01220                                                                }
01221                                                         }
01222                                                  }
01223                                           } else { /* text entities */
01224                                                  while (s < e) {
01225                                                         if (!isalnum((int)*(unsigned char *)s++)) {
01226                                                                goto encode_amp;
01227                                                         }
01228                                                  }
01229                                           }
01230                                           replaced[len++] = '&';
01231                                    }
01232                             }
01233                             is_basic = 1;
01234                      } else {
01235                             for (j = 0; basic_entities[j].charcode != 0; j++) {
01236                                    if ((basic_entities[j].charcode != this_char) ||
01237                                                  (basic_entities[j].flags &&
01238                                                  (quote_style & basic_entities[j].flags) == 0)) {
01239                                           continue;
01240                                    }
01241 
01242                                    memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
01243                                    len += basic_entities[j].entitylen;
01244               
01245                                    is_basic = 1;
01246                                    break;
01247                             }
01248                      }
01249 
01250                      if (!is_basic) {
01251                             /* a wide char without a named entity; pass through the original sequence */
01252                             if (mbseqlen > 1) {
01253                                    memcpy(replaced + len, mbsequence, mbseqlen);
01254                                    len += mbseqlen;
01255                             } else {
01256                                    replaced[len++] = (unsigned char)this_char;
01257                             }
01258                      }
01259               }
01260        }
01261        replaced[len] = '\0';
01262        *newlen = len;
01263 
01264        return replaced;
01265 
01266 
01267 }
01268 /* }}} */
01269 
01270 /* {{{ php_html_entities
01271  */
01272 static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
01273 {
01274        char *str, *hint_charset = NULL;
01275        int str_len, hint_charset_len = 0;
01276        int len;
01277        long quote_style = ENT_COMPAT;
01278        char *replaced;
01279        zend_bool double_encode = 1;
01280 
01281        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, &quote_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
01282               return;
01283        }
01284 
01285        replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
01286        RETVAL_STRINGL(replaced, len, 0);
01287 }
01288 /* }}} */
01289 
01290 #define HTML_SPECIALCHARS   0
01291 #define HTML_ENTITIES              1
01292 
01293 /* {{{ register_html_constants
01294  */
01295 void register_html_constants(INIT_FUNC_ARGS)
01296 {
01297        REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
01298        REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
01299        REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
01300        REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
01301        REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
01302        REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
01303 }
01304 /* }}} */
01305 
01306 /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
01307    Convert special characters to HTML entities */
01308 PHP_FUNCTION(htmlspecialchars)
01309 {
01310        php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
01311 }
01312 /* }}} */
01313 
01314 /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
01315    Convert special HTML entities back to characters */
01316 PHP_FUNCTION(htmlspecialchars_decode)
01317 {
01318        char *str, *new_str, *e, *p;
01319        int len, j, i, new_len;
01320        long quote_style = ENT_COMPAT;
01321        struct basic_entities_dec basic_entities_dec[8];
01322 
01323        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &quote_style) == FAILURE) {
01324               return;
01325        }
01326 
01327        new_str = estrndup(str, len);
01328        new_len = len;
01329        e = new_str + new_len;
01330 
01331        if (!(p = memchr(new_str, '&', new_len))) {
01332               RETURN_STRINGL(new_str, new_len, 0);
01333        }
01334 
01335        for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
01336               if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
01337                      continue;
01338               }
01339               basic_entities_dec[j].charcode = basic_entities[i].charcode;
01340               memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
01341               basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
01342               j++;
01343        }
01344        basic_entities_dec[j].charcode = '&';
01345        basic_entities_dec[j].entitylen = sizeof("&amp;") - 1;
01346        memcpy(basic_entities_dec[j].entity, "&amp;", sizeof("&amp;"));
01347        i = j + 1;
01348        
01349        do {
01350               int l = e - p;
01351        
01352               for (j = 0; j < i; j++) {
01353                      if (basic_entities_dec[j].entitylen > l) {
01354                             continue;
01355                      }
01356                      if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
01357                             int e_len = basic_entities_dec[j].entitylen - 1;
01358               
01359                             *p++ = basic_entities_dec[j].charcode;
01360                             memmove(p, p + e_len, (e - p - e_len));
01361                             e -= e_len;
01362                             goto done;
01363                      }
01364               }
01365               p++;
01366 
01367 done:
01368               if (p >= e) {
01369                      break;
01370               }
01371        } while ((p = memchr(p, '&', (e - p))));
01372 
01373        new_len = e - new_str;
01374 
01375        new_str[new_len] = '\0';
01376        RETURN_STRINGL(new_str, new_len, 0);
01377 }
01378 /* }}} */
01379 
01380 /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
01381    Convert all HTML entities to their applicable characters */
01382 PHP_FUNCTION(html_entity_decode)
01383 {
01384        char *str, *hint_charset = NULL;
01385        int str_len, hint_charset_len = 0, len;
01386        long quote_style = ENT_COMPAT;
01387        char *replaced;
01388 
01389        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
01390                                                    &quote_style, &hint_charset, &hint_charset_len) == FAILURE) {
01391               return;
01392        }
01393 
01394        replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
01395        if (replaced) {
01396               RETURN_STRINGL(replaced, len, 0);
01397        }
01398        RETURN_FALSE;
01399 }
01400 /* }}} */
01401 
01402 
01403 /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
01404    Convert all applicable characters to HTML entities */
01405 PHP_FUNCTION(htmlentities)
01406 {
01407        php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
01408 }
01409 /* }}} */
01410 
01411 /* {{{ proto array get_html_translation_table([int table [, int quote_style [, string charset_hint]]])
01412    Returns the internal translation table used by htmlspecialchars and htmlentities */
01413 PHP_FUNCTION(get_html_translation_table)
01414 {
01415        long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
01416        unsigned int i;
01417        int j;
01418        unsigned char ind[5]; /* max # of 8-bit code units (4; for UTF-8) + 1 for \0 */
01419        void *dummy;
01420        char *charset_hint = NULL;
01421        int charset_hint_len;
01422        enum entity_charset charset;
01423 
01424        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|lls",
01425                      &which, &quote_style, &charset_hint, &charset_hint_len) == FAILURE) {
01426               return;
01427        }
01428 
01429        charset = determine_charset(charset_hint TSRMLS_CC);
01430 
01431        array_init(return_value);
01432 
01433        switch (which) {
01434        case HTML_ENTITIES:
01435               for (j = 0; entity_map[j].charset != cs_terminator; j++) {
01436                      if (entity_map[j].charset != charset)
01437                             continue;
01438                      for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
01439                             char buffer[16];
01440                             unsigned k;
01441                             size_t written;
01442 
01443                             if (entity_map[j].table[i] == NULL)
01444                                    continue;
01445                                    
01446                             k = i + entity_map[j].basechar;
01447 
01448                             switch (charset) {
01449                             case cs_utf_8:
01450                                    written = php_utf32_utf8(ind, k);
01451                                    ind[written] = '\0';
01452                                    break;
01453                             case cs_big5:
01454                             case cs_gb2312:
01455                             case cs_big5hkscs:
01456                             case cs_sjis:
01457                                    /* we have no mappings for these, but if we had... */
01458                                    /* break through */
01459                             default: /* one byte */
01460                                    written = 1;
01461                                    ind[0] = (unsigned char)k;
01462                                    ind[1] = '\0';
01463                                    break;
01464                             }
01465 
01466                             snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
01467                             if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, written+1, &dummy) == FAILURE) {
01468                                    /* in case of the single quote, which is repeated, the first one wins,
01469                                           * so don't replace the existint mapping */
01470                                    add_assoc_string(return_value, (const char*)ind, buffer, 1);
01471                             }
01472                      }
01473               }
01474               /* break thru */
01475 
01476        case HTML_SPECIALCHARS:
01477               add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1);
01478               for (j = 0; basic_entities[j].charcode != 0; j++) {
01479                      if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
01480                             continue;
01481                             
01482                      ind[0] = (unsigned char)basic_entities[j].charcode;
01483                      ind[1] = '\0';
01484                      if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, 2, &dummy) == FAILURE) {
01485                             add_assoc_stringl(return_value, ind, basic_entities[j].entity,
01486                                    basic_entities[j].entitylen, 1);
01487                      }
01488               }
01489 
01490               break;
01491        }
01492 }
01493 /* }}} */
01494 
01495 /*
01496  * Local variables:
01497  * tab-width: 4
01498  * c-basic-offset: 4
01499  * End:
01500  * vim600: sw=4 ts=4 fdm=marker
01501  * vim<600: sw=4 ts=4
01502  */