Back to index

libcitadel  8.12
html_to_ascii.c
Go to the documentation of this file.
00001 /*
00002  * Functions which handle translation between HTML and plain text
00003  * Copyright (c) 2000-2010 by the citadel.org team
00004  *
00005  * This program is open source software; you can redistribute it and/or modify
00006  * it under the terms of the GNU General Public License as published by
00007  * the Free Software Foundation; either version 3 of the License, or
00008  * (at your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00018  */
00019 
00020 #include <stdlib.h>
00021 #include <unistd.h>
00022 #include <stdio.h>
00023 #include <signal.h>
00024 #include <sys/types.h>
00025 #include <ctype.h>
00026 #include <string.h>
00027 #include <sys/stat.h>
00028 #include <errno.h>
00029 #include <limits.h>
00030 
00031 #if TIME_WITH_SYS_TIME
00032 # include <sys/time.h>
00033 # include <time.h>
00034 #else
00035 # if HAVE_SYS_TIME_H
00036 #  include <sys/time.h>
00037 # else
00038 #  include <time.h>
00039 # endif
00040 #endif
00041 
00042 #include "libcitadel.h"
00043  
00044 
/* Size of the line-assembly buffer; tracks the project-wide SIZ when it is defined. */
#ifdef SIZ
# define H2A_BUFSIZE SIZ
#else
# define H2A_BUFSIZE 4096
#endif

/* Length of the longest recognized entity spelling (e.g. "&thinsp;"). */
#define H2A_MAX_ENTITY 8

/* Working state shared by the html_to_ascii() helpers. */
struct h2a_state {
	char  *out;                /* malloc()ed result buffer (not NUL-terminated until the end) */
	size_t out_len;            /* bytes used in out */
	size_t out_cap;            /* bytes allocated for out */
	int    oom;                /* set once an allocation has failed; out is then NULL */
	char   line[H2A_BUFSIZE];  /* text not yet broken into output lines (no NUL terminator) */
	size_t line_len;           /* bytes used in line */
	size_t literal_end;        /* line[0..literal_end) is already decoded; never re-scan it */
};

/* What a recognized tag does to the text stream. */
enum h2a_action {
	H2A_NL1,          /* one line break */
	H2A_NL2,          /* two line breaks */
	H2A_LI,           /* line break followed by a bullet */
	H2A_HR,           /* horizontal rule made of dashes */
	H2A_BOLD,         /* "*" marker */
	H2A_ITALIC,       /* "/" marker */
	H2A_UNDERLINE,    /* "_" marker */
	H2A_BQ_OPEN,      /* one more level of ">" quoting */
	H2A_BQ_CLOSE,     /* one less level of ">" quoting */
	H2A_STYLE_OPEN,   /* start discarding text */
	H2A_STYLE_CLOSE   /* stop discarding text */
};

struct h2a_tagdef {
	const char *name;
	enum h2a_action action;
};

/* Tag names are matched case-insensitively against the text before the first space. */
static const struct h2a_tagdef h2a_tags[] = {
	{ "P",           H2A_NL2 },
	{ "/DIV",        H2A_NL2 },
	{ "LI",          H2A_LI },
	{ "/UL",         H2A_NL2 },
	{ "H1",          H2A_NL2 },
	{ "H2",          H2A_NL2 },
	{ "H3",          H2A_NL2 },
	{ "H4",          H2A_NL2 },
	{ "/H1",         H2A_NL1 },
	{ "/H2",         H2A_NL1 },
	{ "/H3",         H2A_NL1 },
	{ "/H4",         H2A_NL1 },
	{ "HR",          H2A_HR },
	{ "B",           H2A_BOLD },
	{ "/B",          H2A_BOLD },
	{ "STRONG",      H2A_BOLD },
	{ "/STRONG",     H2A_BOLD },
	{ "I",           H2A_ITALIC },
	{ "/I",          H2A_ITALIC },
	{ "EM",          H2A_ITALIC },
	{ "/EM",         H2A_ITALIC },
	{ "U",           H2A_UNDERLINE },
	{ "/U",          H2A_UNDERLINE },
	{ "BR",          H2A_NL1 },
	{ "TR",          H2A_NL1 },
	{ "/TABLE",      H2A_NL1 },
	{ "BLOCKQUOTE",  H2A_BQ_OPEN },
	{ "/BLOCKQUOTE", H2A_BQ_CLOSE },
	{ "STYLE",       H2A_STYLE_OPEN },
	{ "/STYLE",      H2A_STYLE_CLOSE }
};

struct h2a_entity {
	const char *name;   /* full spelling, including '&' and ';' */
	const char *text;   /* plain-text replacement, never longer than name */
	int exact_case;     /* nonzero: name must match case-sensitively */
};

/*
 * Entries are tried in order, so a case-sensitive capital form must precede
 * the case-insensitive lower-case form that would otherwise swallow it.
 */
static const struct h2a_entity h2a_entities[] = {
	{ "&nbsp;",   " ",    0 },
	{ "&ensp;",   " ",    0 },
	{ "&emsp;",   " ",    0 },
	{ "&thinsp;", " ",    0 },
	{ "&lt;",     "<",    0 },
	{ "&gt;",     ">",    0 },
	{ "&amp;",    "&",    0 },
	{ "&quot;",   "\"",   0 },
	{ "&lsquo;",  "`",    0 },
	{ "&rsquo;",  "'",    0 },
	{ "&copy;",   "(c)",  0 },
	{ "&bull;",   " * ",  0 },
	{ "&hellip;", "...",  0 },
	{ "&trade;",  "(tm)", 0 },
	{ "&reg;",    "(r)",  0 },
	{ "&frac14;", "1/4",  0 },
	{ "&frac12;", "1/2",  0 },
	{ "&frac34;", "3/4",  0 },
	{ "&ndash;",  "--",   0 },
	{ "&mdash;",  "---",  0 },
	{ "&Ccedil;", "C",    1 },
	{ "&ccedil;", "c",    0 },
	{ "&Egrave;", "E",    1 },
	{ "&egrave;", "e",    0 },
	{ "&Ecirc;",  "E",    1 },
	{ "&ecirc;",  "e",    0 },
	{ "&Eacute;", "E",    1 },
	{ "&eacute;", "e",    0 },
	{ "&Agrave;", "A",    1 },
	{ "&agrave;", "a",    0 },
	{ "&ldquo;",  "\"",   0 },
	{ "&rdquo;",  "\"",   0 },
	{ "&acute;",  "'",    0 },
	{ "&#8217;",  "'",    0 },
	{ "&#8211;",  "-",    0 }
};

/* Return nonzero when the first n bytes of a and b are equal, ignoring case. */
static int h2a_same_nocase(const char *a, const char *b, size_t n) {
	size_t k;

	for (k = 0; k < n; ++k) {
		if (tolower((unsigned char)a[k]) != tolower((unsigned char)b[k])) {
			return 0;
		}
	}
	return 1;
}

/* Make room for `extra` more result bytes; returns 0 (and latches oom) on failure. */
static int h2a_reserve(struct h2a_state *st, size_t extra) {
	size_t need, new_cap;
	char *grown;

	if (st->oom) return 0;
	need = st->out_len + extra;
	if (need < st->out_len) {              /* size_t wrapped around */
		free(st->out);
		st->out = NULL;
		st->oom = 1;
		return 0;
	}
	if (need <= st->out_cap) return 1;

	new_cap = st->out_cap;
	while (new_cap < need) {
		if (new_cap > ((size_t)-1) / 2) {
			new_cap = need;
			break;
		}
		new_cap *= 2;
	}
	/* Keep the old pointer until realloc() succeeds so it can be released on failure. */
	grown = realloc(st->out, new_cap);
	if (grown == NULL) {
		free(st->out);
		st->out = NULL;
		st->oom = 1;
		return 0;
	}
	st->out = grown;
	st->out_cap = new_cap;
	return 1;
}

/* Append n bytes to the result buffer. */
static void h2a_emit(struct h2a_state *st, const char *src, size_t n) {
	if (n == 0 || !h2a_reserve(st, n)) return;
	memcpy(st->out + st->out_len, src, n);
	st->out_len += n;
}

/* Discard the first n bytes of the pending line. */
static void h2a_drop_front(struct h2a_state *st, size_t n) {
	memmove(st->line, st->line + n, st->line_len - n);
	st->line_len -= n;
	st->literal_end = (st->literal_end > n) ? (st->literal_end - n) : 0;
}

/*
 * Append n bytes to the pending line.  If they do not fit, the pending text
 * is spilled to the result verbatim (no line break is invented) so the fixed
 * buffer can never be overrun.
 */
static void h2a_put(struct h2a_state *st, const char *src, size_t n) {
	if (st->line_len + n >= sizeof st->line) {
		h2a_emit(st, st->line, st->line_len);
		st->line_len = 0;
		st->literal_end = 0;
		if (n >= sizeof st->line) {
			h2a_emit(st, src, n);
			return;
		}
	}
	memcpy(st->line + st->line_len, src, n);
	st->line_len += n;
}

/* Append a NUL-terminated string to the pending line. */
static void h2a_puts(struct h2a_state *st, const char *s) {
	h2a_put(st, s, strlen(s));
}

/* Rebuild the line-break string: "\n" followed by one '>' per quoting level (bounded). */
static void h2a_build_nl(char *nl, size_t nl_size, int depth) {
	size_t used = 0;
	int j;

	nl[used++] = '\n';
	for (j = 0; (j < depth) && (used + 1 < nl_size); ++j) {
		nl[used++] = '>';
	}
	nl[used] = '\0';
}

/*
 * The pending line has just received a ';'.  If it closes a recognized
 * character reference, replace that reference in place.  Only the tail of the
 * line is examined and already-decoded text is never re-examined, so
 * "&amp;lt;" yields "&lt;" rather than "<".
 */
static void h2a_decode_entity(struct h2a_state *st) {
	size_t start, span, e, k, text_len;
	unsigned value;
	char *src;

	if (st->line_len == 0) return;

	/* Walk back from the ';' to the nearest '&'. */
	start = st->line_len - 1;
	span = 1;
	while (st->line[start] != '&') {
		if ((start <= st->literal_end) || (span >= H2A_MAX_ENTITY)) return;
		--start;
		++span;
	}
	src = &st->line[start];

	/* Named references (plus the two special-cased numeric ones). */
	for (e = 0; e < sizeof h2a_entities / sizeof h2a_entities[0]; ++e) {
		const struct h2a_entity *ent = &h2a_entities[e];

		if (strlen(ent->name) != span) continue;
		if (ent->exact_case
		    ? (memcmp(src, ent->name, span) != 0)
		    : !h2a_same_nocase(src, ent->name, span)) {
			continue;
		}
		text_len = strlen(ent->text);
		memcpy(src, ent->text, text_len);
		st->line_len = start + text_len;
		st->literal_end = st->line_len;
		return;
	}

	/* Decimal references with two, three or four digits: "&#NN;" .. "&#NNNN;" */
	if ((span >= 5) && (span <= 7) && (src[1] == '#')) {
		value = 0;
		for (k = 2; k + 1 < span; ++k) {
			if (!isdigit((unsigned char)src[k])) return;
			value = value * 10 + (unsigned)(src[k] - '0');
		}
		/* A single byte cannot hold 0 (it would end the string) or anything above 255. */
		src[0] = ((value >= 1) && (value <= 255)) ? (char)(unsigned char)value : '?';
		st->line_len = start + 1;
		st->literal_end = st->line_len;
	}
}

/*
 * Act on a completed tag.  `tag` holds the text between '<' and '>'.
 * Anything after the first space (attributes) is ignored, as is a single
 * trailing '/' so that "<br/>" is treated like "<br>".
 */
static void h2a_apply_tag(struct h2a_state *st, const char *tag, size_t tag_len,
			  char *nl, size_t nl_size, int *blockquote,
			  int *in_style, size_t width) {
	const char *space;
	size_t name_len, t, j;
	int action = -1;

	space = memchr(tag, ' ', tag_len);
	name_len = (space != NULL) ? (size_t)(space - tag) : tag_len;
	if ((name_len > 1) && (tag[name_len - 1] == '/')) --name_len;

	for (t = 0; t < sizeof h2a_tags / sizeof h2a_tags[0]; ++t) {
		if ((strlen(h2a_tags[t].name) == name_len)
		    && h2a_same_nocase(tag, h2a_tags[t].name, name_len)) {
			action = (int)h2a_tags[t].action;
			break;
		}
	}
	if (action < 0) return;

	/* Inside <style> nothing is rendered; only the closing tag is honored. */
	if (*in_style && (action != (int)H2A_STYLE_CLOSE)) return;

	switch ((enum h2a_action)action) {
	case H2A_NL1:
		h2a_puts(st, nl);
		break;
	case H2A_NL2:
		h2a_puts(st, nl);
		h2a_puts(st, nl);
		break;
	case H2A_LI:
		h2a_puts(st, nl);
		h2a_puts(st, " * ");
		break;
	case H2A_HR:
		h2a_puts(st, nl);
		h2a_puts(st, " ");
		for (j = 0; j < width; ++j) {
			h2a_puts(st, "-");
		}
		h2a_puts(st, nl);
		break;
	case H2A_BOLD:
		h2a_puts(st, "*");
		break;
	case H2A_ITALIC:
		h2a_puts(st, "/");
		break;
	case H2A_UNDERLINE:
		h2a_puts(st, "_");
		break;
	case H2A_BQ_OPEN:
		++(*blockquote);
		h2a_build_nl(nl, nl_size, *blockquote);
		h2a_puts(st, nl);
		break;
	case H2A_BQ_CLOSE:
		h2a_puts(st, "\n");
		if (*blockquote > 0) --(*blockquote);   /* ignore unbalanced closers */
		h2a_build_nl(nl, nl_size, *blockquote);
		h2a_puts(st, nl);
		break;
	case H2A_STYLE_OPEN:
		*in_style = 1;
		break;
	case H2A_STYLE_CLOSE:
		*in_style = 0;
		break;
	}
}

/*
 * Move finished lines from the pending buffer to the result:
 * first every hard line break that lies within the first `width` bytes, then
 * at most one soft break if the remaining text is still wider than `width`.
 * A width of 0 disables line breaking altogether.
 */
static void h2a_flush(struct h2a_state *st, size_t width, const char *nl, int do_citaformat) {
	size_t i, brk = 0, skip;
	int flushed, have_space;

	if (width == 0) return;

	/* Hard line breaks.  After a flush the scan resumes at index 1; a newline
	 * that is now at index 0 is picked up by the next pass of the outer loop.
	 */
	do {
		flushed = 0;
		for (i = 0; (i < st->line_len) && (i < width); ++i) {
			if (st->line[i] == '\n') {
				h2a_emit(st, st->line, i + 1);
				if (do_citaformat) h2a_emit(st, " ", 1);
				h2a_drop_front(st, i + 1);
				i = 0;
				flushed = 1;
			}
		}
	} while (flushed);

	/* Soft line break: prefer the last space inside the window, else cut at the window edge. */
	if (st->line_len > width) {
		have_space = 0;
		for (i = 0; i < width; ++i) {
			if (st->line[i] == ' ') {
				brk = i;
				have_space = 1;
			}
		}
		if (have_space) {
			h2a_emit(st, st->line, brk);
			skip = brk + 1;                 /* the space itself is consumed */
		} else {
			h2a_emit(st, st->line, width);
			skip = width;
		}
		h2a_emit(st, nl, strlen(nl));
		if (do_citaformat) h2a_emit(st, " ", 1);
		h2a_drop_front(st, skip);
	}
}

/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message
 * msglen        = number of bytes of inputmsg to convert; conversion also stops
 *                 at the first NUL byte.  Zero or a negative value means
 *                 "up to the terminating NUL".
 * screenwidth   = desired output screenwidth; lines are broken at
 *                 screenwidth-2 columns.  Values of 2 or less disable
 *                 line breaking.
 * do_citaformat = set to 1 to indent newlines with spaces
 *
 * Line feeds, carriage returns and tabs in the input are turned into spaces;
 * runs of spaces are kept as they are.  A small set of tags is rendered
 * (paragraph/heading/list/table breaks, *bold*, /italic/, _underline_,
 * ">"-prefixed block quotes, dashed rules), <style> content is dropped, all
 * other tags are removed, and common character references are decoded.
 * A '>' that does not close a tag is copied through as text.
 *
 * Returns a malloc()ed, NUL-terminated string with leading and trailing
 * whitespace removed and, unless it is empty, exactly one trailing newline.
 * The caller must free() it.  Returns NULL if inputmsg is NULL or memory
 * cannot be allocated.
 */
char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
	struct h2a_state st;
	char tag[1024];
	size_t tag_len = 0;
	char nl[128];
	const char *nul;
	size_t input_len, pos, lead;
	size_t width;
	int nest = 0;        /* Bracket nesting level */
	int blockquote = 0;  /* BLOCKQUOTE nesting level */
	int in_style = 0;    /* nonzero while inside <style> ... </style> */
	char ch;

	if (inputmsg == NULL) return NULL;

	/* Never look at more than msglen bytes when a length was supplied. */
	if (msglen > 0) {
		nul = memchr(inputmsg, 0, (size_t)msglen);
		input_len = (nul != NULL) ? (size_t)(nul - inputmsg) : (size_t)msglen;
	} else {
		input_len = strlen(inputmsg);
	}

	width = (screenwidth > 2) ? (size_t)(screenwidth - 2) : 0;

	st.out_cap = input_len + H2A_BUFSIZE;
	st.out = malloc(st.out_cap);
	if (st.out == NULL) return NULL;
	st.out_len = 0;
	st.oom = 0;
	st.line_len = 0;
	st.literal_end = 0;
	h2a_build_nl(nl, sizeof nl, 0);

	for (pos = 0; pos < input_len; ++pos) {
		ch = inputmsg[pos];

		/* Fold line feeds, carriage returns and tabs into plain spaces */
		if ((ch == '\n') || (ch == '\r') || (ch == '\t')) ch = ' ';

		if (ch == '<') {
			++nest;
			tag_len = 0;
		}
		else if ((ch == '>') && (nest > 0)) {       /* We have a tag. */
			--nest;
			h2a_apply_tag(&st, tag, tag_len, nl, sizeof nl,
				      &blockquote, &in_style, width);
		}
		else if (nest > 0) {
			/* Collect the tag text; anything beyond the buffer is dropped */
			if (tag_len < (sizeof tag - 1)) tag[tag_len++] = ch;
		}
		else if (!in_style) {
			h2a_put(&st, &ch, 1);
			if (ch == ';') h2a_decode_entity(&st);
		}

		h2a_flush(&st, width, nl, do_citaformat);
		if (st.oom) return NULL;
	}

	/* One more pass for text that became eligible after the last byte */
	h2a_flush(&st, width, nl, do_citaformat);
	h2a_emit(&st, st.line, st.line_len);

	/* Room for the final newline and the terminating NUL */
	if (!h2a_reserve(&st, 2)) return NULL;

	/* Strip leading/trailing whitespace */
	lead = 0;
	while ((lead < st.out_len) && isspace((unsigned char)st.out[lead])) {
		++lead;
	}
	if (lead > 0) {
		memmove(st.out, st.out + lead, st.out_len - lead);
		st.out_len -= lead;
	}
	while ((st.out_len > 0) && isspace((unsigned char)st.out[st.out_len - 1])) {
		--st.out_len;
	}

	if (st.out_len > 0) {
		st.out[st.out_len++] = '\n';
	}
	st.out[st.out_len] = '\0';

	return st.out;
}