Back to index

lightning-sunbird  0.9+nobinonly
nsEscape.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either of the GNU General Public License Version 2 or later (the "GPL"),
00026  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037  
00038 //     First checked in on 98/12/03 by John R. McMullen, derived from net.h/mkparse.c.
00039 
00040 #include "nsEscape.h"
00041 #include "nsMemory.h"
00042 #include "nsCRT.h"
00043 #include "nsReadableUtils.h"
00044 
/*
** Per-character classification table consulted by IS_OK().
** Each entry is a bit mask naming the escape modes in which the character
** may be emitted without escaping:
**
**     Bit 0 (value 1)   xalpha   -- the alphas
**     Bit 1 (value 2)   xpalpha  -- as xalpha, but converts spaces to plus
**                                   and plus to %2B
**     Bit 2 (value 4)   path     -- as xalpha, but doesn't escape '/'
**
** The entry for '@' is 0 (it used to be 7) so that '@' can be escaped in
** usernames and passwords in publishing.
**
** Only indices 0x00-0x80 are listed; the remaining entries are implicitly
** zero, i.e. such characters are always escaped.
*/
const int netCharType[256] =
{
/*  0 1 2 3 4 5 6 7 8 9 A B C D E F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0x                        */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 1x                        */
    0,0,0,0,0,0,0,0,0,0,7,4,0,7,7,4,    /* 2x   !"#$%&'()*+,-./      */
    7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0,    /* 3x  0123456789:;<=>?      */
    0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,    /* 4x  @ABCDEFGHIJKLMNO      */
    7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,7,    /* 5x  PQRSTUVWXYZ[\]^_      */
    0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,    /* 6x  `abcdefghijklmno      */
    7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,    /* 7x  pqrstuvwxyz{|}~  DEL  */
    0                                   /* 0x80                      */
};
00063 
/* Decode a single hexadecimal digit character into its value (0-15).
 * Any character that is not a hex digit decodes to 0.
 * NOTE: the argument is evaluated more than once, so it must not have
 * side effects. It is fully parenthesized so that compound expressions
 * (e.g. a conditional expression) are decoded as a whole.
 */
#define UNHEX(C) \
    (((C) >= '0' && (C) <= '9') ? (C) - '0' : \
     (((C) >= 'A' && (C) <= 'F') ? (C) - 'A' + 10 : \
     (((C) >= 'a' && (C) <= 'f') ? (C) - 'a' + 10 : 0)))


/* Non-zero when character C may be emitted unescaped under the escape mask
 * held in a variable named |flags|, which must be in scope at the point of
 * use. The index is reduced to an unsigned char so that a negative |char|
 * can never index outside the 256-entry table.
 */
#define IS_OK(C) (netCharType[(unsigned char) (C)] & (flags))

/* The character that introduces an escape sequence. */
#define HEX_ESCAPE '%'
00074 
00075 //----------------------------------------------------------------------------------------
00076 static char* nsEscapeCount(
00077     const char * str,
00078     nsEscapeMask flags,
00079     size_t* out_len)
00080 //----------------------------------------------------------------------------------------
00081 {
00082        if (!str)
00083               return 0;
00084 
00085     size_t i, len = 0, charsToEscape = 0;
00086     static const char hexChars[] = "0123456789ABCDEF";
00087 
00088        register const unsigned char* src = (const unsigned char *) str;
00089     while (*src)
00090        {
00091         len++;
00092         if (!IS_OK(*src++))
00093             charsToEscape++;
00094        }
00095 
00096     // calculate how much memory should be allocated
00097     // original length + 2 bytes for each escaped character + terminating '\0'
00098     // do the sum in steps to check for overflow
00099     size_t dstSize = len + 1 + charsToEscape;
00100     if (dstSize <= len)
00101        return 0;
00102     dstSize += charsToEscape;
00103     if (dstSize < len)
00104        return 0;
00105 
00106     // fail if we need more than 4GB
00107     // size_t is likely to be long unsigned int but nsMemory::Alloc(size_t)
00108     // calls NS_Alloc_P(size_t) which calls PR_Malloc(PRUint32), so there is
00109     // no chance to allocate more than 4GB using nsMemory::Alloc()
00110     if (dstSize > PR_UINT32_MAX)
00111         return 0;
00112 
00113        char* result = (char *)nsMemory::Alloc(dstSize);
00114     if (!result)
00115         return 0;
00116 
00117     register unsigned char* dst = (unsigned char *) result;
00118        src = (const unsigned char *) str;
00119        if (flags == url_XPAlphas)
00120        {
00121            for (i = 0; i < len; i++)
00122               {
00123                      unsigned char c = *src++;
00124                      if (IS_OK(c))
00125                             *dst++ = c;
00126                      else if (c == ' ')
00127                             *dst++ = '+'; /* convert spaces to pluses */
00128                      else 
00129                      {
00130                             *dst++ = HEX_ESCAPE;
00131                             *dst++ = hexChars[c >> 4];  /* high nibble */
00132                             *dst++ = hexChars[c & 0x0f];       /* low nibble */
00133                      }
00134               }
00135        }
00136        else
00137        {
00138            for (i = 0; i < len; i++)
00139               {
00140                      unsigned char c = *src++;
00141                      if (IS_OK(c))
00142                             *dst++ = c;
00143                      else 
00144                      {
00145                             *dst++ = HEX_ESCAPE;
00146                             *dst++ = hexChars[c >> 4];  /* high nibble */
00147                             *dst++ = hexChars[c & 0x0f];       /* low nibble */
00148                      }
00149               }
00150        }
00151 
00152     *dst = '\0';     /* tack on eos */
00153        if(out_len)
00154               *out_len = dst - (unsigned char *) result;
00155     return result;
00156 }
00157 
00158 //----------------------------------------------------------------------------------------
00159 NS_COM char* nsEscape(const char * str, nsEscapeMask flags)
00160 //----------------------------------------------------------------------------------------
00161 {
00162     if(!str)
00163         return NULL;
00164     return nsEscapeCount(str, flags, NULL);
00165 }
00166 
00167 //----------------------------------------------------------------------------------------
00168 NS_COM char* nsUnescape(char * str)
00169 //----------------------------------------------------------------------------------------
00170 {
00171        nsUnescapeCount(str);
00172        return str;
00173 }
00174 
00175 //----------------------------------------------------------------------------------------
00176 NS_COM PRInt32 nsUnescapeCount(char * str)
00177 //----------------------------------------------------------------------------------------
00178 {
00179     register char *src = str;
00180     register char *dst = str;
00181     static const char hexChars[] = "0123456789ABCDEFabcdef";
00182 
00183     char c1[] = " ";
00184     char c2[] = " ";
00185     char* const pc1 = c1;
00186     char* const pc2 = c2;
00187 
00188     while (*src)
00189     {
00190         c1[0] = *(src+1);
00191         if (*(src+1) == '\0') 
00192             c2[0] = '\0';
00193         else
00194             c2[0] = *(src+2);
00195 
00196         if (*src != HEX_ESCAPE || PL_strpbrk(pc1, hexChars) == 0 || 
00197                                   PL_strpbrk(pc2, hexChars) == 0 )
00198               *dst++ = *src++;
00199         else  
00200               {
00201               src++; /* walk over escape */
00202               if (*src)
00203             {
00204               *dst = UNHEX(*src) << 4;
00205               src++;
00206             }
00207               if (*src)
00208             {
00209               *dst = (*dst + UNHEX(*src));
00210               src++;
00211             }
00212               dst++;
00213         }
00214     }
00215 
00216     *dst = 0;
00217     return (int)(dst - str);
00218 
00219 } /* NET_UnEscapeCnt */
00220 
00221 
00222 NS_COM char *
00223 nsEscapeHTML(const char * string)
00224 {
00225        /* XXX Hardcoded max entity len. The +1 is for the trailing null. */
00226        char *rv = (char *) nsMemory::Alloc(strlen(string) * 6 + 1);
00227        char *ptr = rv;
00228 
00229        if(rv)
00230          {
00231               for(; *string != '\0'; string++)
00232                 {
00233                      if(*string == '<')
00234                        {
00235                             *ptr++ = '&';
00236                             *ptr++ = 'l';
00237                             *ptr++ = 't';
00238                             *ptr++ = ';';
00239                        }
00240                      else if(*string == '>')
00241                        {
00242                             *ptr++ = '&';
00243                             *ptr++ = 'g';
00244                             *ptr++ = 't';
00245                             *ptr++ = ';';
00246                        }
00247                      else if(*string == '&')
00248                        {
00249                             *ptr++ = '&';
00250                             *ptr++ = 'a';
00251                             *ptr++ = 'm';
00252                             *ptr++ = 'p';
00253                             *ptr++ = ';';
00254                        }
00255                      else if (*string == '"')
00256                        {
00257                             *ptr++ = '&';
00258                             *ptr++ = 'q';
00259                             *ptr++ = 'u';
00260                             *ptr++ = 'o';
00261                             *ptr++ = 't';
00262                             *ptr++ = ';';
00263                        }                  
00264                      else if (*string == '\'')
00265                        {
00266                             *ptr++ = '&';
00267                             *ptr++ = '#';
00268                             *ptr++ = '3';
00269                             *ptr++ = '9';
00270                             *ptr++ = ';';
00271                        }
00272                      else
00273                        {
00274                             *ptr++ = *string;
00275                        }
00276                 }
00277               *ptr = '\0';
00278          }
00279 
00280        return(rv);
00281 }
00282 
00283 NS_COM PRUnichar *
00284 nsEscapeHTML2(const PRUnichar *aSourceBuffer, PRInt32 aSourceBufferLen)
00285 {
00286   // if the caller didn't calculate the length
00287   if (aSourceBufferLen == -1) {
00288     aSourceBufferLen = nsCRT::strlen(aSourceBuffer); // ...then I will
00289   }
00290 
00291   /* XXX Hardcoded max entity len. */
00292   PRUnichar *resultBuffer = (PRUnichar *)nsMemory::Alloc(aSourceBufferLen *
00293                             6 * sizeof(PRUnichar) + sizeof(PRUnichar('\0')));
00294   PRUnichar *ptr = resultBuffer;
00295 
00296   if (resultBuffer) {
00297     PRInt32 i;
00298 
00299     for(i = 0; i < aSourceBufferLen; i++) {
00300       if(aSourceBuffer[i] == '<') {
00301         *ptr++ = '&';
00302         *ptr++ = 'l';
00303         *ptr++ = 't';
00304         *ptr++ = ';';
00305       } else if(aSourceBuffer[i] == '>') {
00306         *ptr++ = '&';
00307         *ptr++ = 'g';
00308         *ptr++ = 't';
00309         *ptr++ = ';';
00310       } else if(aSourceBuffer[i] == '&') {
00311         *ptr++ = '&';
00312         *ptr++ = 'a';
00313         *ptr++ = 'm';
00314         *ptr++ = 'p';
00315         *ptr++ = ';';
00316       } else if (aSourceBuffer[i] == '"') {
00317         *ptr++ = '&';
00318         *ptr++ = 'q';
00319         *ptr++ = 'u';
00320         *ptr++ = 'o';
00321         *ptr++ = 't';
00322         *ptr++ = ';';
00323       } else if (aSourceBuffer[i] == '\'') {
00324         *ptr++ = '&';
00325         *ptr++ = '#';
00326         *ptr++ = '3';
00327         *ptr++ = '9';
00328         *ptr++ = ';';
00329       } else {
00330         *ptr++ = aSourceBuffer[i];
00331       }
00332     }
00333     *ptr = 0;
00334   }
00335 
00336   return resultBuffer;
00337 }
00338 
00339 //----------------------------------------------------------------------------------------
00340 
/*
 * Per-character table consulted by NO_NEED_ESC(). Each entry is a bit mask;
 * a set bit means the character does not need escaping in the URL part that
 * the bit stands for. The bit values are those of the esc_* URL-part flags
 * (esc_Scheme = 1, esc_Username = 2, ... esc_Ref = 512), so 1023 means
 * "never needs escaping" and 0 means "always needs escaping".
 *
 * Only indices 0x00-0x80 are listed; the remaining entries are implicitly
 * zero.
 */
const int EscapeChars[256] =
/*      0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F */
{
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,       /* 0x                       */
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,       /* 1x                       */
        0,1023,   0, 512,1023,   0,1023,1023,1023,1023,1023,1023,1023,1023, 953, 784,       /* 2x   !"#$%&'()*+,-./     */
     1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1008, 912,   0,1008,   0, 768,       /* 3x  0123456789:;<=>?     */
     1008,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,       /* 4x  @ABCDEFGHIJKLMNO     */
     1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023, 896, 896, 896, 896,1023,       /* 5x  PQRSTUVWXYZ[\]^_     */
        0,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,       /* 6x  `abcdefghijklmno     */
     1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023, 896,1012, 896,1023,   0,       /* 7x  pqrstuvwxyz{|}~ DEL  */
        0                                                                                   /* 0x80                     */
};

/* Non-zero when character C needs no escaping for the URL part(s) selected
 * by a variable named |flags|, which must be in scope at the point of use.
 * The index is reduced to an unsigned char so that a negative |char| can
 * never index outside the 256-entry table.
 */
#define NO_NEED_ESC(C) (EscapeChars[(unsigned char) (C)] & (flags))
00356 
00357 //----------------------------------------------------------------------------------------
00358 
00359 /* returns an escaped string */
00360 
00361 /* use the following flags to specify which 
00362    part of an URL you want to escape: 
00363 
00364    esc_Scheme        =     1
00365    esc_Username      =     2
00366    esc_Password      =     4
00367    esc_Host          =     8
00368    esc_Directory     =    16
00369    esc_FileBaseName  =    32
00370    esc_FileExtension =    64
00371    esc_Param         =   128
00372    esc_Query         =   256
00373    esc_Ref           =   512
00374 */
00375 
/* By default this function does not escape the '%' character, so
   sequences in the input that are already escaped are not escaped
   a second time. Note that a '%' is left alone whether or not it is
   followed by a valid hex code. Use the following flag to force
   escaping of '%' as well:

   esc_Forced        =  1024
*/
00384 
00385 NS_COM PRBool NS_EscapeURL(const char *part,
00386                            PRInt32 partLen,
00387                            PRUint32 flags,
00388                            nsACString &result)
00389 {
00390     if (!part) {
00391         NS_NOTREACHED("null pointer");
00392         return PR_FALSE;
00393     }
00394 
00395     int i = 0;
00396     static const char hexChars[] = "0123456789ABCDEF";
00397     if (partLen < 0)
00398         partLen = strlen(part);
00399     PRBool forced = (flags & esc_Forced);
00400     PRBool ignoreNonAscii = (flags & esc_OnlyASCII);
00401     PRBool ignoreAscii = (flags & esc_OnlyNonASCII);
00402     PRBool writing = (flags & esc_AlwaysCopy);
00403     PRBool colon = (flags & esc_Colon);
00404 
00405     register const unsigned char* src = (const unsigned char *) part;
00406 
00407     char tempBuffer[100];
00408     unsigned int tempBufferPos = 0;
00409 
00410     PRBool previousIsNonASCII = PR_FALSE;
00411     for (i = 0; i < partLen; i++)
00412     {
00413       unsigned char c = *src++;
00414 
00415       // if the char has not to be escaped or whatever follows % is 
00416       // a valid escaped string, just copy the char.
00417       //
00418       // Also the % will not be escaped until forced
00419       // See bugzilla bug 61269 for details why we changed this
00420       //
00421       // And, we will not escape non-ascii characters if requested.
00422       // On special request we will also escape the colon even when
00423       // not covered by the matrix.
00424       // ignoreAscii is not honored for control characters (C0 and DEL)
00425       //
00426       // And, we should escape the '|' character when it occurs after any
00427       // non-ASCII character as it may be part of a multi-byte character.
00428       if ((NO_NEED_ESC(c) || (c == HEX_ESCAPE && !forced)
00429                           || (c > 0x7f && ignoreNonAscii)
00430                           || (c > 0x1f && c < 0x7f && ignoreAscii))
00431           && !(c == ':' && colon)
00432           && !(previousIsNonASCII && c == '|' && !ignoreNonAscii))
00433       {
00434         if (writing)
00435           tempBuffer[tempBufferPos++] = c;
00436       }
00437       else /* do the escape magic */
00438       {
00439         if (!writing)
00440         {
00441           result.Append(part, i);
00442           writing = PR_TRUE;
00443         }
00444         tempBuffer[tempBufferPos++] = HEX_ESCAPE;
00445         tempBuffer[tempBufferPos++] = hexChars[c >> 4]; /* high nibble */
00446         tempBuffer[tempBufferPos++] = hexChars[c & 0x0f]; /* low nibble */
00447       }
00448 
00449       if (tempBufferPos >= sizeof(tempBuffer) - 4)
00450       {
00451         NS_ASSERTION(writing, "should be writing");
00452         tempBuffer[tempBufferPos] = '\0';
00453         result += tempBuffer;
00454         tempBufferPos = 0;
00455       }
00456 
00457       previousIsNonASCII = (c > 0x7f);
00458     }
00459     if (writing) {
00460       tempBuffer[tempBufferPos] = '\0';
00461       result += tempBuffer;
00462     }
00463     return writing;
00464 }
00465 
00466 #define ISHEX(c) memchr(hexChars, c, sizeof(hexChars)-1)
00467 
00468 NS_COM PRBool NS_UnescapeURL(const char *str, PRInt32 len, PRUint32 flags, nsACString &result)
00469 {
00470     if (!str) {
00471         NS_NOTREACHED("null pointer");
00472         return PR_FALSE;
00473     }
00474 
00475     if (len < 0)
00476         len = strlen(str);
00477 
00478     PRBool ignoreNonAscii = (flags & esc_OnlyASCII);
00479     PRBool ignoreAscii = (flags & esc_OnlyNonASCII);
00480     PRBool writing = (flags & esc_AlwaysCopy);
00481     PRBool skipControl = (flags & esc_SkipControl); 
00482 
00483     static const char hexChars[] = "0123456789ABCDEFabcdef";
00484 
00485     const char *last = str;
00486     const char *p = str;
00487 
00488     for (int i=0; i<len; ++i, ++p) {
00489         //printf("%c [i=%d of len=%d]\n", *p, i, len);
00490         if (*p == HEX_ESCAPE && i < len-2) {
00491             unsigned char *p1 = ((unsigned char *) p) + 1;
00492             unsigned char *p2 = ((unsigned char *) p) + 2;
00493             if (ISHEX(*p1) && ISHEX(*p2) && 
00494                 ((*p1 < '8' && !ignoreAscii) || (*p1 >= '8' && !ignoreNonAscii)) &&
00495                 !(skipControl && 
00496                   (*p1 < '2' || (*p1 == '7' && (*p2 == 'f' || *p2 == 'F'))))) {
00497                 //printf("- p1=%c p2=%c\n", *p1, *p2);
00498                 writing = PR_TRUE;
00499                 if (p > last) {
00500                     //printf("- p=%p, last=%p\n", p, last);
00501                     result.Append(last, p - last);
00502                     last = p;
00503                 }
00504                 char u = (UNHEX(*p1) << 4) + UNHEX(*p2);
00505                 //printf("- u=%c\n", u);
00506                 result.Append(u);
00507                 i += 2;
00508                 p += 2;
00509                 last += 3;
00510             }
00511         }
00512     }
00513     if (writing && last < str + len)
00514         result.Append(last, str + len - last);
00515 
00516     return writing;
00517 }