Back to index

courier  0.68.2
html.c
Go to the documentation of this file.
00001 /*
00002 ** Copyright 2011 Double Precision, Inc.  See COPYING for
00003 ** distribution information.
00004 */
00005 
00006 /*
00007 */
00008 
00009 #include "html.h"
00010 
00011 #include "unicode/unicode.h"
00012 #include "rfc2045/rfc2045.h"
00013 #include <stdlib.h>
00014 #include <string.h>
00015 
/*
** Non-zero when c is a space, tab, newline or carriage return.
** The argument is evaluated more than once; do not pass an expression
** with side effects.
*/
#define SPACE(c) \
	((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r')
00017 
/*
** Describes one HTML element known to the filter.
*/
struct taginfo {
	char tagname[16];	/* NUL-terminated element name */
	int flags;		/* Bitwise OR of the FLAG_ constants */
};
00023 
/* Digits used when emitting hexadecimal numeric character references */
static const char hex[]="0123456789ABCDEF";

/*
** Element flags, stored in struct taginfo.flags.
*/

/* open_element() always treats the element as an empty tag */
#define FLAG_NOENDTAG		0x01

/* Output is suppressed while an element with this flag is open */
#define FLAG_DISCARD		0x02

/* The element's own start and end tags are never written out */
#define FLAG_NOPRINT		0x04

/* The element's content gets wrapped in <div class="quotedtext"> */
#define FLAG_BLOCKQUOTE_CITE	0x1000
00032 
00033 static const struct taginfo tags[]={
00034        {"a"},
00035        {"abbr"},
00036        {"acronym"},
00037        {"address"},
00038        {"b"},
00039        {"base",      FLAG_NOENDTAG},
00040        {"basefont",  FLAG_NOENDTAG},
00041        {"bdo"},
00042        {"big"},
00043        {"blockquote"},
00044        {"br", FLAG_NOENDTAG},
00045        {"caption"},
00046        {"center"},
00047        {"cite"},
00048        {"code"},
00049        {"col",              FLAG_NOENDTAG},
00050        {"colgroup"},
00051        {"dd"},
00052        {"del"},
00053        {"dfn"},
00054        {"dir"},
00055        {"div"},
00056        {"dl"},
00057        {"dt"},
00058        {"em"},
00059        {"font"},
00060        {"h1"},
00061        {"h2"},
00062        {"h3"},
00063        {"h4"},
00064        {"h5"},
00065        {"h6"},
00066        {"hr",        FLAG_NOENDTAG},
00067        {"i"},
00068        {"img",              FLAG_NOENDTAG},
00069        {"ins"},
00070        {"kbd"},
00071        {"li"},
00072        {"menu"},
00073        {"ol"},
00074        {"p"},
00075        {"pre"},
00076        {"q"},
00077        {"s"},
00078        {"samp"},
00079        {"script",    FLAG_DISCARD},
00080        {"small"},
00081        {"span"},
00082        {"strike"},
00083        {"strong"},
00084        {"style",     FLAG_DISCARD},
00085        {"sub"},
00086        {"sup"},
00087        {"table"},
00088        {"tbody"},
00089        {"td"},
00090        {"tfoot"},
00091        {"th"},
00092        {"thead"},
00093        {"title"},
00094        {"tr"},
00095        {"tt"},
00096        {"u"},
00097        {"ul"},
00098        {"var"},
00099 };
00100 
00101 static const struct taginfo div_tag={"div"};
00102 
00103 static const struct taginfo blockquote_cite_tag={"blockquote",
00104                                            FLAG_BLOCKQUOTE_CITE};
00105 
00106 static const struct taginfo unknown_tag={" unknown", FLAG_NOPRINT};
00107 
00108 static const struct taginfo span_discard_tag={" discard",
00109                                          FLAG_DISCARD | FLAG_NOPRINT};
00110 
00111 struct attr {
00112        struct unicode_buf name; /* Attribute name */
00113        struct unicode_buf value; /* Attribute value */
00114 };
00115 
00116 struct htmlfilter_info {
00117 
00118        /* The output function receives the HTML-filtered stream */
00119 
00120        void (*output_func)(const unicode_char *, size_t, void *);
00121        void *output_func_arg;
00122 
00123        /* Content base for relative URLs */
00124        char *contentbase;
00125 
00126        /* Prepend to http: and https: links */
00127        char *http_prefix;
00128 
00129        /* Prepent to mailto: links */
00130        char *mailto_prefix;
00131 
00132        /* A cid: link gets passed to this function for processing. */
00133        char *(*convert_cid_func)(const char *, void *);
00134        void *convert_cid_func_arg;
00135 
00136        /* Current handle for the input HTML stream */
00137 
00138        size_t (*handler_func)(struct htmlfilter_info *,
00139                             const unicode_char *,
00140                             size_t);
00141 
00142        /*
00143        ** An & entity name. Or a tag name. Or an attribute name or value.
00144        */
00145 
00146        struct unicode_buf atom;
00147 
00148        /*
00149        ** An attribute value
00150        */
00151 
00152        struct unicode_buf value;
00153 
00154        /*
00155        ** Another atom
00156        */
00157 
00158        struct unicode_buf atom2;
00159 
00160        /*
00161        ** Quoting character
00162        */
00163 
00164        unicode_char value_quote;
00165 
00166        /* Current tag being processed */
00167        const struct taginfo *tag;
00168 
00169        /* Whether parsed an empty tag */
00170        int tag_empty;
00171 
00172        struct attr attrs[32];
00173        size_t attrs_index;
00174 
00175        /*
00176        ** Current list of active elements.
00177        ** We limit the number of open elements to 128
00178        */
00179 
00180        const struct taginfo *open_elements[128];
00181        size_t n_open_elements;
00182 
00183        /*
00184        ** How many elements have been open since the first element whose
00185        ** contents should be discarded
00186        */
00187 
00188        size_t n_discarded;
00189 };
00190 
00191 static void free_last_attr(struct htmlfilter_info *p)
00192 {
00193        size_t i=--p->attrs_index;
00194 
00195        unicode_buf_deinit(&p->attrs[i].name);
00196        unicode_buf_deinit(&p->attrs[i].value);
00197 }
00198 
00199 static void free_attrs(struct htmlfilter_info *p)
00200 {
00201        while (p->attrs_index)
00202               free_last_attr(p);
00203 }
00204 
00205 static size_t handle_chars(struct htmlfilter_info *p,
00206                         const unicode_char *uc,
00207                         size_t cnt);
00208 
00209 static size_t handle_entity(struct htmlfilter_info *p,
00210                          const unicode_char *uc,
00211                          size_t cnt);
00212 
00213 static size_t seen_lt(struct htmlfilter_info *p,
00214                     const unicode_char *uc,
00215                     size_t cnt);
00216 
00217 static size_t seen_ltexcl(struct htmlfilter_info *p,
00218                        const unicode_char *uc,
00219                        size_t cnt);
00220 
00221 static size_t seen_sgentity(struct htmlfilter_info *p,
00222                          const unicode_char *uc,
00223                          size_t cnt);
00224 
00225 static size_t seen_ltspace(struct htmlfilter_info *p,
00226                         const unicode_char *uc,
00227                         size_t cnt);
00228 
00229 static size_t seen_pi(struct htmlfilter_info *p,
00230                     const unicode_char *uc,
00231                     size_t cnt);
00232 
00233 static size_t seen_piq(struct htmlfilter_info *p,
00234                      const unicode_char *uc,
00235                      size_t cnt);
00236 
00237 static size_t seen_comment(struct htmlfilter_info *p,
00238                         const unicode_char *uc,
00239                         size_t cnt);
00240 
00241 static size_t seen_commentdash(struct htmlfilter_info *p,
00242                             const unicode_char *uc,
00243                             size_t cnt);
00244 
00245 static size_t seen_commentdashdash(struct htmlfilter_info *p,
00246                                const unicode_char *uc,
00247                                size_t cnt);
00248 
00249 static size_t seen_closing_elem(struct htmlfilter_info *p,
00250                             const unicode_char *uc,
00251                             size_t cnt);
00252 
00253 static size_t seen_opening_elem(struct htmlfilter_info *p,
00254                             const unicode_char *uc,
00255                             size_t cnt);
00256 
00257 static size_t seen_attr(struct htmlfilter_info *p,
00258                      const unicode_char *uc,
00259                      size_t cnt);
00260 
00261 static size_t seen_attrname(struct htmlfilter_info *p,
00262                          const unicode_char *uc,
00263                          size_t cnt);
00264 
00265 static size_t seen_attrvalue_1stchar(struct htmlfilter_info *p,
00266                                  const unicode_char *uc,
00267                                  size_t cnt);
00268 
00269 static size_t seen_attrvalue(struct htmlfilter_info *p,
00270                           const unicode_char *uc,
00271                           size_t cnt);
00272 
00273 static size_t seen_attrvalue_entity(struct htmlfilter_info *p,
00274                                 const unicode_char *uc,
00275                                 size_t cnt);
00276 
00277 struct htmlfilter_info *htmlfilter_alloc(void (*output_func)
00278                                     (const unicode_char *, size_t, void *),
00279                                     void *output_func_arg)
00280 {
00281        struct htmlfilter_info *p;
00282 
00283        p=calloc(1, sizeof(*p));
00284        if (!p)
00285               return p;
00286 
00287        p->output_func=output_func;
00288        p->output_func_arg=output_func_arg;
00289 
00290        unicode_buf_init(&p->atom, 2048);
00291        unicode_buf_init(&p->atom2, 2048);
00292        unicode_buf_init(&p->value, 8192);
00293 
00294        p->handler_func=handle_chars;
00295        return p;
00296 }
00297 
00298 static void close_elements_until(struct htmlfilter_info *p, size_t i);
00299 
00300 void htmlfilter_free(struct htmlfilter_info *p)
00301 {
00302        close_elements_until(p, 0);
00303 
00304        free_attrs(p);
00305 
00306        unicode_buf_deinit(&p->atom);
00307        unicode_buf_deinit(&p->atom2);
00308        unicode_buf_deinit(&p->value);
00309 
00310        if (p->contentbase)
00311               free(p->contentbase);
00312 
00313        if (p->http_prefix)
00314               free(p->http_prefix);
00315 
00316        if (p->mailto_prefix)
00317               free(p->mailto_prefix);
00318 
00319        free(p);
00320 }
00321 
00322 void htmlfilter_set_contentbase(struct htmlfilter_info *p,
00323                             const char *contentbase)
00324 {
00325        if (p->contentbase)
00326               free(p->contentbase);
00327 
00328        p->contentbase=strdup(contentbase);
00329 }
00330 
00331 
00332 void htmlfilter_set_http_prefix(struct htmlfilter_info *p,
00333                             const char *http_prefix)
00334 {
00335        if (p->http_prefix)
00336               free(p->http_prefix);
00337 
00338        p->http_prefix=http_prefix ? strdup(http_prefix):NULL;
00339 }
00340 
00341 void htmlfilter_set_mailto_prefix(struct htmlfilter_info *p,
00342                               const char *mailto_prefix)
00343 {
00344        if (p->mailto_prefix)
00345               free(p->mailto_prefix);
00346 
00347        p->mailto_prefix=mailto_prefix ? strdup(mailto_prefix):NULL;
00348 }
00349 
00350 void htmlfilter_set_convertcid(struct htmlfilter_info *p,
00351                             char *(*convert_cid_func)(const char *, void *),
00352                             void *convert_cid_func_arg)
00353 {
00354        p->convert_cid_func=convert_cid_func;
00355        p->convert_cid_func_arg=convert_cid_func_arg;
00356 }
00357 
00358 void htmlfilter(struct htmlfilter_info *p,
00359               const unicode_char *str, size_t cnt)
00360 {
00361        while (cnt)
00362        {
00363               size_t n=(*p->handler_func)(p, str, cnt);
00364 
00365               str += n;
00366               cnt -= n;
00367        }
00368 }
00369 
00370 /*
00371 ** Output HTML text content
00372 */
00373 
00374 static void output(struct htmlfilter_info *p,
00375                  const unicode_char *uc,
00376                  size_t cnt)
00377 {
00378        if (cnt && !p->n_discarded)
00379               (*p->output_func)(uc, cnt, p->output_func_arg);
00380 }
00381 
00382 /*
00383 ** Output HTML text content given as iso-8859-1 chars.
00384 */
00385 static void output_chars(struct htmlfilter_info *p,
00386                       const char *str,
00387                       size_t cnt)
00388 {
00389        unicode_char unicode_buf[256];
00390 
00391        while (cnt)
00392        {
00393               size_t n=sizeof(unicode_buf)/sizeof(unicode_buf[0]), i;
00394 
00395               if (n > cnt)
00396                      n=cnt;
00397 
00398               for (i=0; i<n; ++i)
00399                      unicode_buf[i]=(unsigned char)str[i];
00400 
00401               str += n;
00402               cnt -= n;
00403               output(p, unicode_buf, n);
00404        }
00405 }
00406 
00407 /*
00408 ** HANDLER: Text content.
00409 */
00410 
00411 static size_t handle_chars(struct htmlfilter_info *p,
00412                         const unicode_char *uc,
00413                         size_t cnt)
00414 {
00415        size_t i;
00416 
00417        for (i=0; i<cnt; ++i)
00418               switch (uc[i]) {
00419               case '&':
00420                      if (i)
00421                      {
00422                             output(p, uc, i);
00423                             return i;
00424                      }
00425 
00426                      unicode_buf_clear(&p->atom);
00427 
00428                      unicode_buf_append(&p->atom, uc+i, 1);
00429                      p->handler_func=handle_entity;
00430                      return 1;
00431               case '<':
00432                      if (i)
00433                      {
00434                             output(p, uc, i);
00435                             return i;
00436                      }
00437                      p->handler_func=seen_lt;
00438 
00439                      free_attrs(p);
00440                      return 1;
00441 
00442               case '>':
00443                      if (i)
00444                             output(p, uc, i);
00445                      {
00446                             static const unicode_char gt[]=
00447                                    {'&','g','t',';'};
00448 
00449                             output(p, gt, 4);
00450                      }
00451                      return i+1;
00452               }
00453 
00454        output(p, uc, cnt);
00455        return cnt;
00456 }
00457 
00458 /*
00459 ** Convert alphanumeric to lowercase.
00460 **
00461 ** Returns: non-zero US-ASCII lowercase value of passed character if the
00462 ** passed character is US-ASCII alphabetic or numeric, 0 otherwise.
00463 */
00464 static unicode_char isualnum(unicode_char c)
00465 {
00466        if (c >= 'a' && c <= 'z')
00467               return c;
00468 
00469        if (c >= 'A' && c <= 'Z')
00470               return c + ('a'-'A');
00471 
00472        if (c >= '0' && c <= '9')
00473               return c;
00474 
00475        return 0;
00476 }
00477 
00478 /*
00479 ** HANDLER: html entity.
00480 */
00481 
00482 static size_t handle_entity(struct htmlfilter_info *p,
00483                          const unicode_char *uc,
00484                          size_t cnt)
00485 {
00486        size_t i;
00487 
00488        if (unicode_buf_len(&p->atom) == 1 && *uc == '#')
00489        {
00490               unicode_buf_append(&p->atom, uc, 1);
00491               return 1;
00492        }
00493 
00494        for (i=0; i<cnt; ++i)
00495        {
00496               unicode_char c=isualnum(uc[i]);
00497 
00498               if (c != 0)
00499               {
00500                      unicode_buf_append(&p->atom, &c, 1);
00501                      continue;
00502               }
00503 
00504               p->handler_func=handle_chars;
00505               if (uc[i] == ';')
00506               {
00507                      /*
00508                      ** It's well-formed
00509                      */
00510                      output(p, unicode_buf_ptr(&p->atom),
00511                             unicode_buf_len(&p->atom));
00512                      output_chars(p, ";", 1);
00513                      return ++i;
00514               }
00515 
00516               break;
00517        }
00518        return i;
00519 }
00520 
00521 /*
00522 ** HANDLER: first character after an <
00523 */
00524 
00525 static size_t seen_lt(struct htmlfilter_info *p,
00526                     const unicode_char *uc,
00527                     size_t cnt)
00528 {
00529        if (*uc == '?')
00530        {
00531               p->handler_func=seen_pi;
00532               return 1;
00533        }
00534 
00535        if (*uc == '!')
00536        {
00537               p->handler_func=seen_ltexcl;
00538               return 1;
00539        }
00540 
00541        unicode_buf_clear(&p->atom);
00542        p->handler_func=seen_ltspace;
00543        return seen_ltspace(p, uc, cnt);
00544 }
00545 
00546 /*
00547 ** HANDLER: "<!"
00548 */
00549 
00550 static size_t seen_ltexcl(struct htmlfilter_info *p,
00551                        const unicode_char *uc,
00552                        size_t cnt)
00553 {
00554        if (*uc == '-')
00555        {
00556               /* Assume an SGML comment */
00557 
00558               p->handler_func=seen_comment;
00559 
00560               return seen_comment(p, uc, cnt);
00561        }
00562 
00563        p->handler_func=seen_sgentity;
00564        return seen_sgentity(p, uc, cnt);
00565 }
00566 
00567 /*
00568 ** HANDLER: "<! ..."
00569 */
00570 
00571 static size_t seen_sgentity(struct htmlfilter_info *p,
00572                          const unicode_char *uc,
00573                          size_t cnt)
00574 {
00575        size_t i;
00576 
00577        for (i=0; i<cnt; i++)
00578               if (uc[i] == '>')
00579               {
00580                      p->handler_func=handle_chars;
00581 
00582                      return i+1;
00583               }
00584 
00585        return i;
00586 }
00587 
00588 /*
00589 ** HANDLER: "<" followed by whitespace
00590 */
00591 
00592 static size_t seen_ltspace(struct htmlfilter_info *p,
00593                         const unicode_char *uc,
00594                         size_t cnt)
00595 {
00596        if (SPACE(*uc))
00597               return 1;
00598 
00599        if (*uc == '/')
00600        {
00601               p->handler_func=seen_closing_elem;
00602               return 1;
00603        }
00604 
00605        if (isualnum(*uc))
00606        {
00607               p->handler_func=seen_opening_elem;
00608               return seen_opening_elem(p, uc, cnt);
00609        }
00610 
00611        /* Syntax error, punt */
00612 
00613        p->handler_func=handle_chars;
00614        return handle_chars(p, uc, cnt);
00615 }
00616 
00617 /*
00618 ** HANDLER: <?
00619 */
00620 
00621 static size_t seen_pi(struct htmlfilter_info *p,
00622                     const unicode_char *uc,
00623                     size_t cnt)
00624 {
00625        size_t i;
00626 
00627        for (i=0; i<cnt; ++i)
00628        {
00629               if (uc[i] == '?')
00630               {
00631                      p->handler_func=seen_piq;
00632                      return i+1;
00633               }
00634        }
00635        return i;
00636 }
00637 
00638 /*
00639 ** HANDLER: <? .... ?
00640 */
00641 
00642 static size_t seen_piq(struct htmlfilter_info *p,
00643                      const unicode_char *uc,
00644                      size_t cnt)
00645 {
00646        p->handler_func=seen_pi;
00647 
00648        if (*uc == '>')
00649        {
00650               p->handler_func=handle_chars;
00651               return 1;
00652        }
00653 
00654        /* Look for the next ? */
00655 
00656        return seen_pi(p, uc, cnt);
00657 }
00658 
00659 /*
00660 ** HANDLER: Seen <!
00661 */
00662 
00663 static size_t seen_comment(struct htmlfilter_info *p,
00664                         const unicode_char *uc,
00665                         size_t cnt)
00666 {
00667        size_t i;
00668 
00669        for (i=0; i<cnt; ++i)
00670        {
00671               if (uc[i] == '-')
00672               {
00673                      p->handler_func=seen_commentdash;
00674                      return i+1;
00675               }
00676        }
00677        return i;
00678 }
00679 
00680 /*
00681 ** HANDLER: Seen <! .... -
00682 */
00683 
00684 static size_t seen_commentdash(struct htmlfilter_info *p,
00685                             const unicode_char *uc,
00686                             size_t cnt)
00687 {
00688        if (*uc == '-')
00689        {
00690               p->handler_func=seen_commentdashdash;
00691               return 1;
00692        }
00693        p->handler_func=seen_comment;
00694        return seen_comment(p, uc, cnt);
00695 }
00696 
00697 /*
00698 ** HANDLER: Seen <! .... --
00699 */
00700 
00701 static size_t seen_commentdashdash(struct htmlfilter_info *p,
00702                                const unicode_char *uc,
00703                                size_t cnt)
00704 {
00705        if (*uc == '>')
00706        {
00707               p->handler_func=handle_chars;
00708               return 1;
00709        }
00710 
00711        p->handler_func=seen_commentdash;
00712        return seen_commentdash(p, uc, cnt);
00713 }
00714 
00715 /*
00716 ** Comparison function for bsearch() when searching the tags array.
00717 */
00718 
00719 static int search_tags(const void *key, const void *elem)
00720 {
00721        size_t i;
00722        const char *cp=((const struct taginfo *)elem)->tagname;
00723        unicode_char c;
00724        const struct unicode_buf *ukey=(struct unicode_buf *)key;
00725        const unicode_char *k=unicode_buf_ptr(ukey);
00726        size_t kl=unicode_buf_len(ukey);
00727 
00728        for (i=0; (c=i >= kl ? 0:k[i]) != 0 || cp[i] != 0; ++i)
00729        {
00730               unicode_char c2=(unsigned char)cp[i];
00731 
00732               if (c < c2)
00733                      return -1;
00734 
00735               if (c > c2)
00736                      return 1;
00737        }
00738        return 0;
00739 }
00740 
00741 /*
00742 ** Sometimes we may need to change one element into another one.
00743 */
00744 
00745 static const struct taginfo *change_element(const struct taginfo *tag)
00746 {
00747        if (strcmp(tag->tagname, "base") == 0)
00748               return &div_tag;
00749 
00750        if (strcmp(tag->tagname, "script") == 0 ||
00751            strcmp(tag->tagname, "style") == 0)
00752               return &span_discard_tag;
00753        return tag;
00754 }
00755 
00756 /*
00757 ** Emit text, escaping special characters.
00758 */
00759 
00760 static void output_escaped(struct htmlfilter_info *p,
00761                         const unicode_char *uc,
00762                         size_t cnt)
00763 {
00764        while (cnt)
00765        {
00766               size_t i;
00767 
00768               for (i=0; i<cnt; i++)
00769                      if (uc[i] < ' ' || uc[i] > 127 ||
00770                          uc[i] == '<' || uc[i] == '>' || uc[i] == '&' ||
00771                          uc[i] == '"')
00772                             break;
00773 
00774               if (i)
00775                      output(p, uc, i);
00776 
00777               uc += i;
00778               cnt -= i;
00779 
00780               if (cnt)
00781               {
00782                      unicode_char c;
00783                      char buf[sizeof(unicode_char)*2+4];
00784                      char *cp;
00785 
00786                      c= *uc++;
00787                      --cnt;
00788 
00789                      cp=buf+sizeof(buf)-1;
00790                      *cp=0;
00791                      *--cp=';';
00792                      do
00793                      {
00794                             *--cp=hex[c & 15];
00795                             c /= 16;
00796                      } while (c);
00797                      *--cp='x';
00798                      *--cp='#';
00799                      *--cp='&';
00800 
00801                      output_chars(p, cp, strlen(cp));
00802               }
00803        }
00804 }
00805 
00806 /*
00807 ** Completed parsing of a tag.
00808 */
00809 
00810 static void open_element(struct htmlfilter_info *p)
00811 {
00812        size_t i=0;
00813        int discard_was_increased=0;
00814 
00815        p->tag=change_element(p->tag);
00816 
00817        if (p->n_open_elements >=
00818            sizeof(p->open_elements)/sizeof(p->open_elements[0]))
00819               return; /* Too many open elements */
00820 
00821        if ((p->tag->flags & FLAG_DISCARD) || p->n_discarded)
00822        {
00823               ++p->n_discarded;
00824               discard_was_increased=1;
00825        }
00826 
00827        if (p->tag->flags & FLAG_NOENDTAG)
00828               p->tag_empty=1; /* Make it so, Number One. */
00829 
00830        if (p->tag->flags & FLAG_NOPRINT)
00831               ++p->n_discarded; /* Temporary */
00832 
00833        p->open_elements[p->n_open_elements++]=p->tag;
00834 
00835        /*
00836        ** For A elements, the title attribute will have the full target
00837        ** URL. Attempt to extract the hostname and show it before the
00838        ** A element.
00839        */
00840 
00841        if (strcmp(p->tag->tagname, "a") == 0)
00842        {
00843               size_t i;
00844 
00845               for (i=0; i<p->attrs_index; ++i)
00846               {
00847                      if (unicode_buf_cmp_str(&p->attrs[i].name, "title", 5)
00848                          == 0)
00849                      {
00850                             size_t j, k;
00851 
00852                             for (j=0; j<unicode_buf_len(&p->attrs[i].value);
00853                                  ++j)
00854                             {
00855                                    if (unicode_buf_ptr(&p->attrs[i].value)
00856                                        [j] == ':')
00857                                    {
00858                                           ++j;
00859                                           break;
00860                                    }
00861                             }
00862 
00863                             while (j<unicode_buf_len(&p->attrs[i].value) &&
00864                                    unicode_buf_ptr(&p->attrs[i].value)[j]
00865                                    == '/')
00866                                    ++j;
00867                             k=j;
00868 
00869                             while (k<unicode_buf_len(&p->attrs[i].value))
00870                             {
00871                                    switch (unicode_buf_ptr(&p->attrs[i]
00872                                                         .value)[k]) {
00873                                    case '/':
00874                                    case '?':
00875                                    case '#':
00876                                           break;
00877                                    default:
00878                                           ++k;
00879                                           continue;
00880                                    }
00881                                    break;
00882                             }
00883 
00884                             if (k > j)
00885                             {
00886                                    static const char span[]=
00887                                           "<span class=\"urlhost\">[";
00888 
00889                                    output_chars(p, span,
00890                                                sizeof(span)-1);
00891                                    output_escaped(p,
00892                                                  unicode_buf_ptr(&p->
00893                                                                attrs[i]
00894                                                                .value)
00895                                                  +j, k-j);
00896 
00897                                    output_chars(p, "]</span>", 8);
00898                             }
00899                             break;
00900                      }
00901               }
00902        }
00903 
00904        output_chars(p, "<", 1);
00905        output_chars(p, p->tag->tagname, strlen(p->tag->tagname));
00906 
00907        for (i=0; i<p->attrs_index; ++i)
00908        {
00909               output_chars(p, " ", 1);
00910               output(p, unicode_buf_ptr(&p->attrs[i].name),
00911                      unicode_buf_len(&p->attrs[i].name));
00912 
00913               if (unicode_buf_len(&p->attrs[i].value) > 0)
00914               {
00915                      output_chars(p, "=\"", 2);
00916 
00917                      output_escaped(p, unicode_buf_ptr(&p->attrs[i].value),
00918                                    unicode_buf_len(&p->attrs[i].value));
00919                      output_chars(p, "\"", 1);
00920               }
00921        }
00922 
00923        if (p->tag_empty)
00924               output_chars(p, " /", 2);
00925 
00926        output_chars(p, ">", 1);
00927 
00928        if (p->tag_empty)
00929        {
00930               /* This tag did not really open */
00931 
00932               --p->n_open_elements;
00933 
00934               if (discard_was_increased)
00935                      --p->n_discarded;
00936        }
00937 
00938        if (!p->tag_empty && p->tag->flags & FLAG_BLOCKQUOTE_CITE)
00939        {
00940               static const char str[]="<div class=\"quotedtext\">";
00941 
00942               output_chars(p, str, sizeof(str)-1);
00943        }
00944 
00945        if (p->tag->flags & FLAG_NOPRINT)
00946               --p->n_discarded; /* Was temporary */
00947 }
00948 
00949 /* Close an element */
00950 
00951 static void close_element(struct htmlfilter_info *p,
00952                        const struct taginfo *tag)
00953 {
00954        size_t i;
00955 
00956        tag=change_element(tag);
00957 
00958        /* Search for the tag that we are closing */
00959 
00960        i=p->n_open_elements;
00961 
00962        while (i)
00963        {
00964               if (strcmp(p->open_elements[i-1]->tagname, tag->tagname) == 0)
00965                      break;
00966               --i;
00967        }
00968 
00969        if (!i)
00970               return; /* Did not find a matching open element */
00971 
00972        close_elements_until(p, --i);
00973 }
00974 
00975 static void close_elements_until(struct htmlfilter_info *p, size_t i)
00976 {
00977        while (p->n_open_elements > i)
00978        {
00979               --p->n_open_elements;
00980 
00981               if (!p->n_discarded &&
00982                   (p->open_elements[p->n_open_elements]->flags & FLAG_NOPRINT)
00983                   == 0)
00984               {
00985                      const char *cp=
00986                             p->open_elements[p->n_open_elements]->tagname;
00987 
00988                      if (p->open_elements[p->n_open_elements]->flags &
00989                          p->tag->flags & FLAG_BLOCKQUOTE_CITE)
00990                             output_chars(p, "</div>", 6);
00991 
00992                      output_chars(p, "</", 2);
00993                      output_chars(p, cp, strlen(cp));
00994                      output_chars(p, ">", 1);
00995               }
00996 
00997               if (p->n_discarded)
00998                      --p->n_discarded;
00999        }
01000 }
01001 
01002 /*
01003 ** HANDLER: Seen </
01004 */
01005 
01006 static size_t seen_closing_elem(struct htmlfilter_info *p,
01007                             const unicode_char *uc,
01008                             size_t cnt)
01009 {
01010        size_t i;
01011        unicode_char c;
01012 
01013        for (i=0; i<cnt; ++i)
01014        {
01015               if (uc[i] == '>')
01016               {
01017                      const struct taginfo *tag;
01018 
01019                      p->handler_func=handle_chars;
01020 
01021                      tag=bsearch(&p->atom,
01022                                 tags,
01023                                 sizeof(tags)/sizeof(tags[0]),
01024                                 sizeof(tags[0]),
01025                                 search_tags);
01026 
01027                      /*
01028                      ** Change unknown elements to a <span>
01029                      */
01030 
01031                      if (!tag)
01032                             tag= &unknown_tag;
01033 
01034                      close_element(p, tag);
01035                      return i+1;
01036               }
01037 
01038               /* Loose parsing - ignore spaces wherever they are */
01039 
01040               if (SPACE(uc[i]))
01041                      continue;
01042 
01043               if ((c=uc[i]) == ':' || (c=isualnum(c)) != 0)
01044               {
01045                      unicode_buf_append(&p->atom, &c, 1);
01046                      continue;
01047               }
01048 
01049               /*
01050               ** Syntax error, punt.
01051               */
01052 
01053               p->handler_func=handle_chars;
01054               return i;
01055        }
01056 
01057        return i;
01058 }
01059 
01060 /*
01061 ** HANDLER: <  [expecting tag
01062 **
01063 ** Collect element name.
01064 */
01065 
01066 static size_t seen_opening_elem(struct htmlfilter_info *p,
01067                             const unicode_char *uc,
01068                             size_t cnt)
01069 {
01070        size_t i;
01071 
01072        for (i=0; i<cnt; ++i)
01073        {
01074               unicode_char c;
01075 
01076               if ((c=uc[i]) == ':' || (c=isualnum(c)) != 0)
01077               {
01078                      unicode_buf_append(&p->atom, &c, 1);
01079                      continue;
01080               }
01081 
01082               /*
01083               ** End of element name.
01084               */
01085 
01086               p->tag=bsearch(&p->atom,
01087                             tags,
01088                             sizeof(tags)/sizeof(tags[0]),
01089                             sizeof(tags[0]),
01090                             search_tags);
01091 
01092               /*
01093               ** Change unknown elements to a <span>
01094               */
01095 
01096               if (!p->tag)
01097                      p->tag= &unknown_tag;
01098 
01099               p->handler_func=seen_attr;
01100               p->tag_empty=0;
01101               return i;
01102        }
01103        return i;
01104 }
01105 
01106 static void save_attr(struct htmlfilter_info *p);
01107 
01108 /*
01109 ** HANDLER: expecting attribute name or >
01110 */
01111 
01112 static size_t seen_attr(struct htmlfilter_info *p,
01113                      const unicode_char *uc,
01114                      size_t cnt)
01115 {
01116        if (SPACE(*uc))
01117               return 1;
01118 
01119        if (*uc == '/')
01120        {
01121               p->tag_empty=1;
01122               return 1;
01123        }
01124 
01125        if (isualnum(*uc))
01126        {
01127               unicode_buf_clear(&p->atom);
01128               p->handler_func=seen_attrname;
01129               return seen_attrname(p, uc, cnt);
01130        }
01131 
01132        p->handler_func=handle_chars;
01133 
01134        if (*uc == '>')
01135               open_element(p);
01136 
01137        return 1;
01138 }
01139 
01140 /*
01141 ** After munging a URL, append the original URL, using URL-escaping.
01142 */
01143 
01144 static void append_orig_href(struct htmlfilter_info *p,
01145                           struct unicode_buf *dst,
01146                           const char *url)
01147 {
01148        size_t n=strlen(url);
01149 
01150        while (n)
01151        {
01152               size_t i;
01153 
01154               for (i=0; i<n; i++)
01155                      if (!isualnum(url[i]))
01156                             break;
01157 
01158               if (i == 0)
01159               {
01160                      unicode_char b[3];
01161 
01162                      b[0]='%';
01163                      b[1]=hex[ (url[0] >> 4) & 15];
01164                      b[2]=hex[ url[0] & 15];
01165 
01166                      unicode_buf_append(dst, b, 3);
01167                      ++url;
01168                      --n;
01169                      continue;
01170               }
01171 
01172               unicode_buf_append_char(dst, url, i);
01173               url += i;
01174               n -= i;
01175        }
01176 }
01177 
01178 /*
01179 ** Munge an HREF url accordingly.
01180 **
01181 ** Returns non-0 if the URL was recognized and munged.
01182 **
01183 ** A 0 return means that I do not understand what this URL is, so it should
01184 ** be omitted.
01185 */
01186 
01187 static int change_href(struct htmlfilter_info *p,
01188                      char *url,
01189                      struct unicode_buf *dst,
01190                      int must_be_cid, /* Understand only CID: urls */
01191                      int *was_http_url
01192                      /* Set to non-0 if the munged URL was http or https */
01193                      )
01194 {
01195        size_t i;
01196 
01197        *was_http_url=0;
01198 
01199        /* Convert the method to lowercase */
01200 
01201        for (i=0; url[i] && url[i] != ':'; ++i)
01202        {
01203               if (url[i] >= 'A' && url[i] <= 'Z')
01204                      url[i] += 'a'-'A';
01205        }
01206 
01207        if (strncmp(url, "cid:", 4) == 0 && p->convert_cid_func)
01208        {
01209               char *q;
01210 
01211               if ((q=(*p->convert_cid_func)
01212                    (url+4, p->convert_cid_func_arg)) != NULL)
01213               {
01214                      unicode_buf_append_char(dst, q, strlen(q));
01215                      free(q);
01216                      return 1;
01217               }
01218        }
01219 
01220        if (must_be_cid)
01221               return 0;
01222 
01223        if ((strncmp(url, "http:", 5) == 0 ||
01224             strncmp(url, "https:", 6) == 0)
01225            && p->http_prefix && *p->http_prefix)
01226        {
01227               *was_http_url=1;
01228               unicode_buf_append_char(dst, p->http_prefix, strlen(p->http_prefix));
01229               append_orig_href(p, dst, url);
01230               return 1;
01231        }
01232 
01233        if (strncmp(url, "mailto:", 7) == 0
01234            && p->mailto_prefix && *p->mailto_prefix)
01235        {
01236               size_t i;
01237 
01238               for (i=0; url[i]; ++i)
01239                      if (url[i] == '?')
01240                      {
01241                             url[i]='&';
01242                             break;
01243                      }
01244 
01245               unicode_buf_append_char(dst, p->mailto_prefix,
01246                                    strlen(p->mailto_prefix));
01247               append_orig_href(p, dst, url+7);
01248               return 1;
01249        }
01250 
01251        return 0;
01252 }
01253 
01254 /*
01255 ** Completed parsing of attribute[=value]?
01256 **
01257 ** If value was provided, malloc a buffer for it, copy it, put it into
01258 ** cur_attr->value.
01259 */
01260 
01261 static void save_attr_int(struct htmlfilter_info *p,
01262                        struct unicode_buf *name,
01263                        struct unicode_buf *value)
01264 {
01265        struct attr *cur_attr;
01266 
01267        if (p->attrs_index >= sizeof(p->attrs)/sizeof(p->attrs[0]))
01268               return;
01269 
01270        cur_attr=p->attrs + p->attrs_index;
01271 
01272        ++p->attrs_index;
01273 
01274        unicode_buf_init_copy(&cur_attr->name, name);
01275        unicode_buf_init_copy(&cur_attr->value, value);
01276 }
01277 
01278 static int is_attr(struct htmlfilter_info *p, const char *c)
01279 {
01280        return unicode_buf_cmp_str(&p->atom, c, strlen(c)) == 0;
01281 }
01282 
01283 /*
01284 ** Convert the current attribute that contains a URL to utf-8, if necessary
01285 ** and resolve against contentbase, if necessary.
01286 */
01287 static char *resolve_url(struct htmlfilter_info *p)
01288 {
01289        char *buf;
01290        size_t size;
01291        char *cp;
01292 
01293        libmail_u_convert_handle_t h=
01294               libmail_u_convert_fromu_init("utf-8", &buf, &size, 1);
01295 
01296        if (h)
01297        {
01298               libmail_u_convert_uc(h, unicode_buf_ptr(&p->value),
01299                                  unicode_buf_len(&p->value));
01300 
01301               if (libmail_u_convert_deinit(h, NULL))
01302                      buf=NULL;
01303        }
01304        else
01305        {
01306               buf=NULL;
01307        }
01308 
01309        if (!buf)
01310               return NULL;
01311 
01312        if (p->contentbase && *p->contentbase)
01313        {
01314               cp=rfc2045_append_url(p->contentbase, buf);
01315 
01316               free(buf);
01317               buf=cp;
01318        }
01319        return (buf);
01320 }
01321 
01322 /*
01323 ** Take the contents of an HREF (or a SRC), prepend contentbase, if necessary
01324 ** then invoke change_href() and save the result as the replacement
01325 ** HREF/SRC attribute.
01326 **
01327 ** Returns the original HREF/SRC was HTTP or HTTPS url in the malloc-ed
01328 ** buffer, or NULL if the HREF/SRC was not http or https (but something else).
01329 */
01330 
01331 static char *handle_url(struct htmlfilter_info *p,
01332                      int must_be_cid)
01333 {
01334        struct unicode_buf new_href;
01335        char *cp;
01336        int http_url;
01337 
01338        char *retval=NULL;
01339 
01340        if ((cp=resolve_url(p)) == NULL)
01341               return NULL;
01342 
01343        unicode_buf_init(&new_href, (size_t)-1);
01344 
01345        if (change_href(p, cp, &new_href, must_be_cid, &http_url))
01346        {
01347               save_attr_int(p, &p->atom, &new_href);
01348 
01349               if (!http_url)
01350               {
01351                      free(cp);
01352                      cp=NULL;
01353               }
01354 
01355               retval=cp;
01356               cp=NULL;
01357        }
01358 
01359        if (cp)
01360               free(cp);
01361 
01362        unicode_buf_deinit(&new_href);
01363        return retval;
01364 }
01365 
01366 /*
01367 ** If this is the second occurence of the same attribute, nuke it.
01368 ** Only one occurence of each attribute.
01369 */
01370 
01371 static int attr_already_exists(struct htmlfilter_info *p,
01372                             struct unicode_buf *name)
01373 {
01374        size_t i;
01375 
01376        for (i=0; i<p->attrs_index; ++i)
01377        {
01378               if (unicode_buf_cmp(&p->attrs[i].name, name) == 0)
01379                      return 1;
01380        }
01381        return 0;
01382 }
01383 
01384 static void save_attr(struct htmlfilter_info *p)
01385 {
01386        p->handler_func=seen_attr;
01387 
01388        if (attr_already_exists(p, &p->atom))
01389               return;
01390 
01391        /*
01392        ** Transform <blockquote type="cite"> into
01393        **
01394        ** <blockquote class="citeN"> where N nests from 0 to 2.
01395        */
01396 
01397        if (is_attr(p, "type") && strcmp(p->tag->tagname, "blockquote") == 0 &&
01398            unicode_buf_len(&p->value) == 4)
01399        {
01400               size_t i;
01401 
01402               for (i=0; i<4; ++i)
01403                      if (isualnum(unicode_buf_ptr(&p->value)[i])
01404                          != "cite"[i])
01405                             break;
01406 
01407               if (i == 4)
01408               {
01409                      size_t n=0, j;
01410                      char buf[10];
01411 
01412                      for (j=0; j<p->n_open_elements; ++j)
01413                             if (p->open_elements[j]->flags &
01414                                 FLAG_BLOCKQUOTE_CITE)
01415                                    ++n;
01416 
01417                      p->tag=&blockquote_cite_tag;
01418 
01419                      sprintf(buf, "cite%d", (int)(n % 3));
01420 
01421                      unicode_buf_clear(&p->value);
01422                      unicode_buf_append_char(&p->value, buf, strlen(buf));
01423 
01424                      unicode_buf_clear(&p->atom);
01425                      unicode_buf_append_char(&p->atom, "class", 5);
01426 
01427                      if (!attr_already_exists(p, &p->atom))
01428                      {
01429                             save_attr_int(p, &p->atom, &p->value);
01430                             return;
01431                      }
01432               }
01433        }
01434 
01435        /*
01436        ** Do not allow title attributes on an A element, we'll supply our
01437        ** own.
01438        */
01439 
01440        if (is_attr(p, "title") &&
01441            strcmp(p->tag->tagname, "a") == 0)
01442               return;
01443 
01444        if (is_attr(p, "lang")
01445            || is_attr(p, "title")
01446            || is_attr(p, "dir")
01447            || is_attr(p, "size")
01448            || is_attr(p, "color")
01449            || is_attr(p, "face")
01450 
01451            || is_attr(p, "span")
01452            || is_attr(p, "width")
01453            || is_attr(p, "height")
01454            || is_attr(p, "align")
01455            || is_attr(p, "char")
01456            || is_attr(p, "charoff")
01457            || is_attr(p, "valign")
01458            || is_attr(p, "alt")
01459            )
01460        {
01461               /* Safe attributes */
01462 
01463               save_attr_int(p, &p->atom, &p->value);
01464               return;
01465        }
01466 
01467        if (is_attr(p, "src") && strcmp(p->tag->tagname, "img") == 0)
01468        {
01469               char *url=handle_url(p, 1);
01470 
01471               if (url)
01472                      free(url);
01473               return;
01474        }
01475 
01476        if (is_attr(p, "href"))
01477        {
01478               if (strcmp(p->tag->tagname, "base") == 0)
01479               {
01480                      char *buf=malloc(unicode_buf_len(&p->value)+1);
01481 
01482                      if (buf)
01483                      {
01484                             size_t i;
01485 
01486                             for (i=0; i<unicode_buf_len(&p->value); ++i)
01487                             {
01488                                    buf[i]=unicode_buf_ptr(&p->value)[i];
01489                             }
01490                             buf[i]=0;
01491 
01492                             htmlfilter_set_contentbase(p, buf);
01493                             free(buf);
01494                      }
01495                      return;
01496               }
01497 
01498 
01499               if (strcmp(p->tag->tagname, "a") == 0)
01500               {
01501                      char *url;
01502 
01503                      if ((url=handle_url(p, 0)) != NULL)
01504                      {
01505                             /* Append target=_blank to HREF */
01506 
01507                             unicode_buf_clear(&p->atom);
01508                             unicode_buf_append_char(&p->atom, "target", 6);
01509                             unicode_buf_clear(&p->value);
01510                             unicode_buf_append_char(&p->value, "_blank", 6);
01511                             save_attr_int(p, &p->atom, &p->value);
01512 
01513                             /* Append the full URL in the title tag */
01514 
01515                             unicode_buf_clear(&p->atom);
01516                             unicode_buf_append_char(&p->atom, "title", 5);
01517                             unicode_buf_clear(&p->value);
01518                             unicode_buf_append_char(&p->value, url, strlen(url));
01519                             save_attr_int(p, &p->atom, &p->value);
01520                             free(url);
01521 
01522                      }
01523                      return;
01524               }
01525        }
01526 }
01527 
01528 /*
01529 ** HANDLER: reading attribute name.
01530 */
01531 
01532 static size_t seen_attrname(struct htmlfilter_info *p,
01533                          const unicode_char *uc,
01534                          size_t cnt)
01535 {
01536        size_t i;
01537 
01538        for (i=0; i<cnt; ++i)
01539        {
01540               unicode_char c;
01541 
01542               if ((c=uc[i]) == ':' || c == '-' || (c=isualnum(c)) != 0)
01543               {
01544                      unicode_buf_append(&p->atom, &c, 1);
01545                      continue;
01546               }
01547 
01548               unicode_buf_clear(&p->value);
01549               p->value_quote=0;
01550 
01551               p->handler_func=seen_attr; /* No value expected */
01552 
01553               if (uc[i] == '=')
01554               {
01555                      p->handler_func=seen_attrvalue_1stchar;
01556                      return ++i;
01557               }
01558               save_attr(p);
01559               return i;
01560        }
01561        return cnt;
01562 }
01563 
01564 /*
01565 ** HANDLER: expecting first character of the attribute's value.
01566 */
01567 
01568 static size_t seen_attrvalue_1stchar(struct htmlfilter_info *p,
01569                                  const unicode_char *uc,
01570                                  size_t cnt)
01571 {
01572        p->handler_func=seen_attrvalue;
01573 
01574        switch (*uc) {
01575        case '\'':
01576        case '\"':
01577               p->value_quote= *uc;
01578               return 1;
01579        }
01580 
01581        return seen_attrvalue(p, uc, cnt);
01582 }
01583 
01584 /*
01585 ** HANDLER: expecting the value of an attribute.
01586 */
01587 
01588 static size_t seen_attrvalue(struct htmlfilter_info *p,
01589                           const unicode_char *uc,
01590                           size_t cnt)
01591 {
01592        size_t i;
01593 
01594        for (i=0; i<cnt; ++i)
01595        {
01596               if (uc[i] == '&')
01597               {
01598                      unicode_buf_append(&p->value, uc, i);
01599                      unicode_buf_clear(&p->atom2);
01600                      p->handler_func=seen_attrvalue_entity;
01601                      return i+1;
01602               }
01603 
01604               /*
01605               ** If the first char in the value is ' or ", another one ends
01606               ** the value. Otherwise, the value gets ended by a / or >
01607               */
01608 
01609               if (p->value_quote)
01610               {
01611                      if (uc[i] == p->value_quote)
01612                      {
01613                             unicode_buf_append(&p->value, uc, i);
01614                             save_attr(p);
01615                             return i+1;
01616                      }
01617               }
01618               else if (SPACE(uc[i]) || uc[i] == '/' || uc[i] == '>')
01619               {
01620                      unicode_buf_append(&p->value, uc, i);
01621                      save_attr(p);
01622                      return i;
01623               }
01624        }
01625        unicode_buf_append(&p->value, uc, i);
01626        return cnt;
01627 }
01628 
01629 /*
01630 ** atom2 should contain one of:
01631 **
01632 **  #<decimal value>
01633 **  #[xX]<hex value>
01634 **  <entity>
01635 */
01636 
01637 static void append_entity(struct htmlfilter_info *p)
01638 {
01639        unicode_char v=0;
01640 
01641        if (unicode_buf_len(&p->atom2) &&
01642            unicode_buf_ptr(&p->atom2)[0] == '#')
01643        {
01644               const unicode_char *u=unicode_buf_ptr(&p->atom2);
01645               size_t n=unicode_buf_len(&p->atom2);
01646 
01647               ++u;
01648               --n;
01649 
01650               if (n && (*u == 'x' || *u == 'X'))
01651               {
01652                      while (--n)
01653                      {
01654                             unicode_char c=*++u;
01655                             const char *cp;
01656 
01657                             if (c >= 'a' && c <= 'f')
01658                                    c += 'A'-'a';
01659 
01660                             if (c < ' ' || c > 127)
01661                                    break;
01662 
01663                             cp=strchr(hex, c);
01664 
01665                             if (!cp)
01666                                    break;
01667 
01668                             v = v * 16 + (cp-hex);
01669                      }
01670               }
01671               else
01672               {
01673                      while (n)
01674                      {
01675                             unicode_char c= *u++;
01676 
01677                             --n;
01678 
01679                             if (c < '0' || c > '9')
01680                                    break;
01681 
01682                             v = v * 10 + (c-'0');
01683                      }
01684               }
01685        }
01686        else
01687        {
01688               char entitybuf[32];
01689               size_t i;
01690 
01691               if (unicode_buf_len(&p->atom2) >= sizeof(entitybuf))
01692                      return;
01693 
01694               for (i=0; i<unicode_buf_len(&p->atom2); ++i)
01695               {
01696                      unicode_char c=unicode_buf_ptr(&p->atom2)[i];
01697 
01698                      if ((unsigned char)c != c)
01699                             return;
01700                      entitybuf[i]=c;
01701               }
01702               entitybuf[i]=0;
01703 
01704               if ((v=unicode_html40ent_lookup(entitybuf)) == 0)
01705                      return;
01706        }
01707 
01708        unicode_buf_append(&p->value, &v, 1);
01709 }
01710 
01711 /*
01712 ** HANDLER: &entity in an attribute.
01713 **
01714 ** We generally expect &name; or &#name;
01715 **
01716 ** However there's plenty of broken HTML that does not &-escape attribute
01717 ** values containing URLs.
01718 */
01719 
01720 static size_t seen_attrvalue_entity(struct htmlfilter_info *p,
01721                                 const unicode_char *uc,
01722                                 size_t cnt)
01723 {
01724        size_t i;
01725 
01726        if (unicode_buf_len(&p->atom2) == 0 && *uc == '#')
01727        {
01728               unicode_buf_append(&p->atom2, uc, 1);
01729               return 1;
01730        }
01731 
01732        for (i=0; i<cnt; ++i)
01733        {
01734               unicode_char c=isualnum(uc[i]);
01735 
01736               if (c)
01737               {
01738                      unicode_buf_append(&p->atom2, uc+i, 1);
01739                      continue;
01740               }
01741 
01742               switch (uc[i]) {
01743               case ';':
01744                      append_entity(p);
01745                      ++i;
01746                      break;
01747               case '&':
01748               case '=':
01749 
01750                      /* Broken URL, most likely */
01751 
01752                      {
01753                             unicode_char amp='&';
01754 
01755                             unicode_buf_append(&p->value, &amp, 1);
01756                      }
01757                      unicode_buf_append_buf(&p->value, &p->atom2);
01758                      break;
01759               default:
01760                      /* Not ...&foo;..., not ...&foo&..., not ...&foo=... */
01761 
01762                      /* forget the whole thing */
01763                      break;
01764               }
01765               p->handler_func=seen_attrvalue;
01766               return i;
01767        }
01768        return cnt;
01769 }