Back to index

php5  5.3.10
mbfilter_htmlent.c
Go to the documentation of this file.
00001 /*
00002  * "streamable kanji code filter and converter"
00003  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
00004  *
00005  * LICENSE NOTICES
00006  *
00007  * This file is part of "streamable kanji code filter and converter",
00008  * which is distributed under the terms of GNU Lesser General Public 
00009  * License (version 2) as published by the Free Software Foundation.
00010  *
00011  * This software is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with "streamable kanji code filter and converter";
00018  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
00019  * Suite 330, Boston, MA  02111-1307  USA
00020  *
00021  * The author of this part: Marcus Boerger <helly@php.net>
00022  *
00023  */
00024 /*
00025  * The source code included in this file was separated from mbfilter.c
00026  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
00027  * 
00028  */
00029 
00030 #ifdef HAVE_CONFIG_H
00031 #include "config.h"
00032 #endif
00033 
00034 #ifdef HAVE_STRING_H
00035 #include <string.h>
00036 #endif
00037 
00038 #ifdef HAVE_STRINGS_H
00039 #include <strings.h>
00040 #endif
00041 
00042 #include "mbfilter.h"
00043 #include "mbfilter_htmlent.h"
00044 #include "html_entities.h"
00045 
00046 static const int htmlentitifieds[256] = { /* indexed by character code 0..255; mbfl_filt_conv_html_enc writes codes marked 1 as entities and copies every other value (0 or 2) through unchanged */
00047   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x0F */
00048   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10-0x1F */
00049   0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20-0x2F: 2 at 0x26 '&' */
00050   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, /* 0x30-0x3F: 2 at 0x3C '<' and 0x3E '>' */
00051   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40-0x4F */
00052   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50-0x5F */
00053   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60-0x6F */
00054   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70-0x7F */
00055   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8F */
00056   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90-0x9F */
00057   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0-0xAF */
00058   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0-0xBF */
00059   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0-0xCF */
00060   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0-0xDF */
00061   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xE0-0xEF */
00062   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 /* 0xF0-0xFF */
00063 };
00064 
00065 static const char *mbfl_encoding_html_ent_aliases[] = {"HTML", "html", NULL}; /* NULL-terminated list of alternative names, referenced by mbfl_encoding_html_ent */
00066 
00067 const mbfl_encoding mbfl_encoding_html_ent = { /* encoding descriptor for HTML-ENTITIES; NOTE(review): field meanings come from the mbfl_encoding declaration, which is not visible here */
00068        mbfl_no_encoding_html_ent, /* encoding id, also used by the two conversion tables in this file */
00069        "HTML-ENTITIES", /* first of two identical name strings (presumably name and MIME name; confirm) */
00070        "HTML-ENTITIES",
00071        (const char *(*)[])&mbfl_encoding_html_ent_aliases, /* the alias list "HTML", "html" */
00072        NULL, /* NOTE(review): left unset for this encoding; purpose defined by mbfl_encoding */
00073        MBFL_ENCTYPE_HTML_ENT /* encoding-type constant for this encoding */
00074 };
00075 
00076 const struct mbfl_convert_vtbl vtbl_wchar_html = { /* conversion table that pairs the wchar and HTML-ENTITIES ids with the encoder functions */
00077        mbfl_no_encoding_wchar, /* first encoding id (presumably the source; confirm against struct mbfl_convert_vtbl) */
00078        mbfl_no_encoding_html_ent, /* second encoding id (presumably the destination) */
00079        mbfl_filt_conv_common_ctor, /* common constructor (not defined in this listing) */
00080        mbfl_filt_conv_common_dtor, /* common destructor (not defined in this listing) */
00081        mbfl_filt_conv_html_enc, /* per-character encoder */
00082        mbfl_filt_conv_html_enc_flush /* flush handler */
00083 };
00084 
00085 const struct mbfl_convert_vtbl vtbl_html_wchar = { /* conversion table that pairs the HTML-ENTITIES and wchar ids with the decoder functions */
00086        mbfl_no_encoding_html_ent, /* first encoding id (presumably the source; confirm against struct mbfl_convert_vtbl) */
00087        mbfl_no_encoding_wchar, /* second encoding id (presumably the destination) */
00088        mbfl_filt_conv_html_dec_ctor, /* allocates the pending-entity buffer */
00089        mbfl_filt_conv_html_dec_dtor, /* releases the pending-entity buffer */
00090        mbfl_filt_conv_html_dec, /* per-character decoder */
00091        mbfl_filt_conv_html_dec_flush }; /* writes out any unfinished entity text, then flushes downstream */
00092 
00093 
00094 #define CK(statement)       do { if ((statement) < 0) return (-1); } while (0) /* evaluate statement once; if the result is negative, return -1 from the enclosing function */
00095 
00096 /*
00097  * any => HTML
00098  */
00099 int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
00100 {
00101        int tmp[64];
00102        int i;
00103        unsigned int uc;
00104        const mbfl_html_entity_entry *e;
00105 
00106        if (c < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) &&
00107                             htmlentitifieds[c] != 1) {
00108               CK((*filter->output_function)(c, filter->data));
00109        } else {
00110               CK((*filter->output_function)('&', filter->data));
00111               for (i = 0; (e = &mbfl_html_entity_list[i])->name != NULL; i++) {
00112                      if (c == e->code) {
00113                             char *p;
00114                             
00115                             for (p = e->name; *p != '\0'; p++) {
00116                                    CK((*filter->output_function)((int)*p, filter->data));
00117                             }
00118                             goto last;
00119                      }
00120               }
00121 
00122               {
00123                      int *p = tmp + sizeof(tmp) / sizeof(tmp[0]);
00124 
00125                      CK((*filter->output_function)('#', filter->data));
00126 
00127                      uc = (unsigned int)c;
00128 
00129                      *(--p) = '\0';
00130                      do {
00131                             *(--p) = "0123456789"[uc % 10];
00132                             uc /= 10;
00133                      } while (uc);
00134 
00135                      for (; *p != '\0'; p++) {
00136                             CK((*filter->output_function)(*p, filter->data));
00137                      }
00138               }
00139        last:
00140               CK((*filter->output_function)(';', filter->data));
00141        }
00142        return c;
00143 }
00144 
00145 int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
00146 {
00147        filter->status = 0;
00148        filter->opaque = NULL;
00149 
00150        if (filter->flush_function != NULL) {
00151               (*filter->flush_function)(filter->data);
00152        }
00153 
00154        return 0;
00155 }
00156 
00157 /*
00158  * HTML => any
00159  */
00160 #define html_enc_buffer_size       16 /* size basis of the decoder's pending-entity buffer; the constructor allocates this many bytes plus one */
00161 static const char html_entity_chars[] = "#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; /* characters the decoder accepts between '&' and ';' (looked up with strchr) */
00162 
00163 void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
00164 {
00165        filter->status = 0;
00166        filter->opaque = mbfl_malloc(html_enc_buffer_size+1);
00167 }
00168        
00169 void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
00170 {
00171        filter->status = 0;
00172        if (filter->opaque)
00173        {
00174               mbfl_free((void*)filter->opaque);
00175        }
00176        filter->opaque = NULL;
00177 }
00178 
00179 int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
00180 {
00181        int  pos, ent = 0;
00182        mbfl_html_entity_entry *entity;
00183        char *buffer = (char*)filter->opaque;
00184 
00185        if (!filter->status) {
00186               if (c == '&' ) {
00187                      filter->status = 1;
00188                      buffer[0] = '&';
00189               } else {
00190                      CK((*filter->output_function)(c, filter->data));
00191               }
00192        } else {
00193               if (c == ';') {
00194                      if (buffer[1]=='#') {
00195                             if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
00196                                    if (filter->status > 3) {
00197                                           /* numeric entity */
00198                                           for (pos=3; pos<filter->status; pos++) {
00199                                                  int v =  buffer[pos];
00200                                                  if (v >= '0' && v <= '9') {
00201                                                         v = v - '0';
00202                                                  } else if (v >= 'A' && v <= 'F') {
00203                                                         v = v - 'A' + 10;
00204                                                  } else if (v >= 'a' && v <= 'f') {
00205                                                         v = v - 'a' + 10;
00206                                                  } else {
00207                                                         ent = -1;
00208                                                         break;
00209                                                  }
00210                                                  ent = ent * 16 + v;
00211                                           }
00212                                    } else {
00213                                           ent = -1;
00214                                    }
00215                             } else {
00216                                    /* numeric entity */
00217                                    if (filter->status > 2) {
00218                                           for (pos=2; pos<filter->status; pos++) {
00219                                                  int v = buffer[pos];
00220                                                  if (v >= '0' && v <= '9') {
00221                                                         v = v - '0';
00222                                                  } else {
00223                                                         ent = -1;
00224                                                         break;
00225                                                  }
00226                                                  ent = ent*10 + v;
00227                                           }
00228                                    } else {
00229                                           ent = -1;
00230                                    }
00231                             }
00232                             if (ent >= 0 && ent < 0x110000) {
00233                                    CK((*filter->output_function)(ent, filter->data));
00234                             } else {
00235                                    for (pos = 0; pos < filter->status; pos++) {
00236                                           CK((*filter->output_function)(buffer[pos], filter->data));
00237                                    }
00238                                    CK((*filter->output_function)(c, filter->data));
00239                             }
00240                             filter->status = 0;
00241                             /*php_error_docref("ref.mbstring" TSRMLS_CC, E_NOTICE, "mbstring decoded '%s'=%d", buffer, ent);*/
00242                      } else {
00243                             /* named entity */
00244                             buffer[filter->status] = 0;
00245                             entity = (mbfl_html_entity_entry *)mbfl_html_entity_list;
00246                             while (entity->name) {
00247                                    if (!strcmp(buffer+1, entity->name))      {
00248                                           ent = entity->code;
00249                                           break;
00250                                    }
00251                                    entity++;
00252                             }
00253                             if (ent) {
00254                                    /* decoded */
00255                                    CK((*filter->output_function)(ent, filter->data));
00256                                    filter->status = 0;
00257                                    /*php_error_docref("ref.mbstring" TSRMLS_CC, E_NOTICE,"mbstring decoded '%s'=%d", buffer, ent);*/
00258                             } else { 
00259                                    /* failure */
00260                                    buffer[filter->status++] = ';';
00261                                    buffer[filter->status] = 0;
00262                                    /* php_error_docref("ref.mbstring" TSRMLS_CC, E_WARNING, "mbstring cannot decode '%s'", buffer); */
00263                                    mbfl_filt_conv_html_dec_flush(filter);
00264                             }
00265                      }
00266               } else {
00267                      /* add character */
00268                      buffer[filter->status++] = c;
00269                      /* add character and check */
00270                      if (!strchr(html_entity_chars, c) || filter->status+1==html_enc_buffer_size || (c=='#' && filter->status>2))
00271                      {
00272                             /* illegal character or end of buffer */
00273                             if (c=='&')
00274                                    filter->status--;
00275                             buffer[filter->status] = 0;
00276                             /* php_error_docref("ref.mbstring" TSRMLS_CC, E_WARNING, "mbstring cannot decode '%s'", buffer)l */
00277                             mbfl_filt_conv_html_dec_flush(filter);
00278                             if (c=='&')
00279                             {
00280                                    buffer[filter->status++] = '&';
00281                             }
00282                      }
00283               }
00284        }
00285        return c;
00286 }
00287 
00288 int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
00289 {
00290        int status, pos = 0;
00291        unsigned char *buffer;
00292        int err = 0;
00293 
00294        buffer = (unsigned char*)filter->opaque;
00295        status = filter->status;
00296        filter->status = 0;
00297 
00298        /* flush fragments */
00299        while (status--) {
00300               int e = (*filter->output_function)(buffer[pos++], filter->data);
00301               if (e != 0)
00302                      err = e;
00303        }
00304 
00305        if (filter->flush_function != NULL) {
00306               (*filter->flush_function)(filter->data);
00307        }
00308 
00309        return err;
00310 }
00311 
00312