Back to index

lightning-sunbird  0.9+nobinonly
punycode.c
Go to the documentation of this file.
00001 /*
00002 punycode.c from RFC 3492
00003 http://www.nicemice.net/idn/
00004 Adam M. Costello
00005 http://www.nicemice.net/amc/
00006 
00007 This is ANSI C code (C89) implementing Punycode (RFC 3492).
00008 
00009 
00010 C. Disclaimer and license
00011 
00012     Regarding this entire document or any portion of it (including
00013     the pseudocode and C code), the author makes no guarantees and
00014     is not responsible for any damage resulting from its use.  The
00015     author grants irrevocable permission to anyone to use, modify,
00016     and distribute it in any way that does not diminish the rights
00017     of anyone else to use, modify, and distribute it, provided that
00018     redistributed derivative works do not contain misleading author or
00019     version information.  Derivative works need not be licensed under
00020     similar terms.
00021 */
00022 
00023 #include "punycode.h"
00024 
00025 /**********************************************************/
00026 /* Implementation (would normally go in its own .c file): */
00027 
00028 #include <string.h>
00029 
00030 /*** Bootstring parameters for Punycode ***/
00031 
00032 enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
00033        initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };
00034 
00035 /* basic(cp) tests whether cp is a basic code point: */
00036 #define basic(cp) ((punycode_uint)(cp) < 0x80)
00037 
00038 /* delim(cp) tests whether cp is a delimiter: */
00039 #define delim(cp) ((cp) == delimiter)
00040 
00041 /* decode_digit(cp) returns the numeric value of a basic code */
00042 /* point (for use in representing integers) in the range 0 to */
00043 /* base-1, or base if cp is does not represent a value.       */
00044 
00045 static punycode_uint decode_digit(punycode_uint cp)
00046 {
00047   return  cp - 48 < 10 ? cp - 22 :  cp - 65 < 26 ? cp - 65 :
00048           cp - 97 < 26 ? cp - 97 :  base;
00049 }
00050 
00051 /* encode_digit(d,flag) returns the basic code point whose value      */
00052 /* (when used for representing integers) is d, which needs to be in   */
00053 /* the range 0 to base-1.  The lowercase form is used unless flag is  */
00054 /* nonzero, in which case the uppercase form is used.  The behavior   */
00055 /* is undefined if flag is nonzero and digit d has no uppercase form. */
00056 
00057 static char encode_digit(punycode_uint d, int flag)
00058 {
00059   return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
00060   /*  0..25 map to ASCII a..z or A..Z */
00061   /* 26..35 map to ASCII 0..9         */
00062 }
00063 
00064 /* flagged(bcp) tests whether a basic code point is flagged */
00065 /* (uppercase).  The behavior is undefined if bcp is not a  */
00066 /* basic code point.                                        */
00067 
00068 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
00069 
00070 /* encode_basic(bcp,flag) forces a basic code point to lowercase */
00071 /* if flag is zero, uppercase if flag is nonzero, and returns    */
00072 /* the resulting code point.  The code point is unchanged if it  */
00073 /* is caseless.  The behavior is undefined if bcp is not a basic */
00074 /* code point.                                                   */
00075 
00076 static char encode_basic(punycode_uint bcp, int flag)
00077 {
00078   bcp -= (bcp - 97 < 26) << 5;
00079   return bcp + ((!flag && (bcp - 65 < 26)) << 5);
00080 }
00081 
00082 /*** Platform-specific constants ***/
00083 
00084 /* maxint is the maximum value of a punycode_uint variable: */
00085 static const punycode_uint maxint = (punycode_uint) -1;
00086 /* Because maxint is unsigned, -1 becomes the maximum value. */
00087 
00088 /*** Bias adaptation function ***/
00089 
00090 static punycode_uint adapt(
00091   punycode_uint delta, punycode_uint numpoints, int firsttime )
00092 {
00093   punycode_uint k;
00094 
00095   delta = firsttime ? delta / damp : delta >> 1;
00096   /* delta >> 1 is a faster way of doing delta / 2 */
00097   delta += delta / numpoints;
00098 
00099   for (k = 0;  delta > ((base - tmin) * tmax) / 2;  k += base) {
00100     delta /= base - tmin;
00101   }
00102 
00103   return k + (base - tmin + 1) * delta / (delta + skew);
00104 }
00105 
00106 /*** Main encode function ***/
00107 
00108 enum punycode_status punycode_encode(
00109   punycode_uint input_length,
00110   const punycode_uint input[],
00111   const unsigned char case_flags[],
00112   punycode_uint *output_length,
00113   char output[] )
00114 {
00115   punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;
00116 
00117   /* Initialize the state: */
00118 
00119   n = initial_n;
00120   delta = out = 0;
00121   max_out = *output_length;
00122   bias = initial_bias;
00123 
00124   /* Handle the basic code points: */
00125 
00126   for (j = 0;  j < input_length;  ++j) {
00127     if (basic(input[j])) {
00128       if (max_out - out < 2) return punycode_big_output;
00129       output[out++] =
00130         case_flags ? encode_basic(input[j], case_flags[j]) : (char)input[j];
00131     }
00132     /* else if (input[j] < n) return punycode_bad_input; */
00133     /* (not needed for Punycode with unsigned code points) */
00134   }
00135 
00136   h = b = out;
00137 
00138   /* h is the number of code points that have been handled, b is the  */
00139   /* number of basic code points, and out is the number of characters */
00140   /* that have been output.                                           */
00141 
00142   if (b > 0) output[out++] = delimiter;
00143 
00144   /* Main encoding loop: */
00145 
00146   while (h < input_length) {
00147     /* All non-basic code points < n have been     */
00148     /* handled already.  Find the next larger one: */
00149 
00150     for (m = maxint, j = 0;  j < input_length;  ++j) {
00151       /* if (basic(input[j])) continue; */
00152       /* (not needed for Punycode) */
00153       if (input[j] >= n && input[j] < m) m = input[j];
00154     }
00155 
00156     /* Increase delta enough to advance the decoder's    */
00157     /* <n,i> state to <m,0>, but guard against overflow: */
00158 
00159     if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
00160     delta += (m - n) * (h + 1);
00161     n = m;
00162 
00163     for (j = 0;  j < input_length;  ++j) {
00164       /* Punycode does not need to check whether input[j] is basic: */
00165       if (input[j] < n /* || basic(input[j]) */ ) {
00166         if (++delta == 0) return punycode_overflow;
00167       }
00168 
00169       if (input[j] == n) {
00170         /* Represent delta as a generalized variable-length integer: */
00171 
00172         for (q = delta, k = base;  ;  k += base) {
00173           if (out >= max_out) return punycode_big_output;
00174           t = k <= bias /* + tmin */ ? tmin :     /* +tmin not needed */
00175               k >= bias + tmax ? tmax : k - bias;
00176           if (q < t) break;
00177           output[out++] = encode_digit(t + (q - t) % (base - t), 0);
00178           q = (q - t) / (base - t);
00179         }
00180 
00181         output[out++] = encode_digit(q, case_flags && case_flags[j]);
00182         bias = adapt(delta, h + 1, h == b);
00183         delta = 0;
00184         ++h;
00185       }
00186     }
00187 
00188     ++delta, ++n;
00189   }
00190 
00191   *output_length = out;
00192   return punycode_success;
00193 }
00194 
00195 /*** Main decode function ***/
00196 
00197 enum punycode_status punycode_decode(
00198   punycode_uint input_length,
00199   const char input[],
00200   punycode_uint *output_length,
00201   punycode_uint output[],
00202   unsigned char case_flags[] )
00203 {
00204   punycode_uint n, out, i, max_out, bias,
00205                  b, j, in, oldi, w, k, digit, t;
00206 
00207   /* Initialize the state: */
00208 
00209   n = initial_n;
00210   out = i = 0;
00211   max_out = *output_length;
00212   bias = initial_bias;
00213 
00214   /* Handle the basic code points:  Let b be the number of input code */
00215   /* points before the last delimiter, or 0 if there is none, then    */
00216   /* copy the first b code points to the output.                      */
00217 
00218   for (b = j = 0;  j < input_length;  ++j) if (delim(input[j])) b = j;
00219   if (b > max_out) return punycode_big_output;
00220 
00221   for (j = 0;  j < b;  ++j) {
00222     if (case_flags) case_flags[out] = flagged(input[j]);
00223     if (!basic(input[j])) return punycode_bad_input;
00224     output[out++] = input[j];
00225   }
00226 
00227   /* Main decoding loop:  Start just after the last delimiter if any  */
00228   /* basic code points were copied; start at the beginning otherwise. */
00229 
00230   for (in = b > 0 ? b + 1 : 0;  in < input_length;  ++out) {
00231 
00232     /* in is the index of the next character to be consumed, and */
00233     /* out is the number of code points in the output array.     */
00234 
00235     /* Decode a generalized variable-length integer into delta,  */
00236     /* which gets added to i.  The overflow checking is easier   */
00237     /* if we increase i as we go, then subtract off its starting */
00238     /* value at the end to obtain delta.                         */
00239 
00240     for (oldi = i, w = 1, k = base;  ;  k += base) {
00241       if (in >= input_length) return punycode_bad_input;
00242       digit = decode_digit(input[in++]);
00243       if (digit >= base) return punycode_bad_input;
00244       if (digit > (maxint - i) / w) return punycode_overflow;
00245       i += digit * w;
00246       t = k <= bias /* + tmin */ ? tmin :     /* +tmin not needed */
00247           k >= bias + tmax ? tmax : k - bias;
00248       if (digit < t) break;
00249       if (w > maxint / (base - t)) return punycode_overflow;
00250       w *= (base - t);
00251     }
00252 
00253     bias = adapt(i - oldi, out + 1, oldi == 0);
00254 
00255     /* i was supposed to wrap around from out+1 to 0,   */
00256     /* incrementing n each time, so we'll fix that now: */
00257 
00258     if (i / (out + 1) > maxint - n) return punycode_overflow;
00259     n += i / (out + 1);
00260     i %= (out + 1);
00261 
00262     /* Insert n at position i of the output: */
00263 
00264     /* not needed for Punycode: */
00265     /* if (decode_digit(n) <= base) return punycode_invalid_input; */
00266     if (out >= max_out) return punycode_big_output;
00267 
00268     if (case_flags) {
00269       memmove(case_flags + i + 1, case_flags + i, out - i);
00270       /* Case of last character determines uppercase flag: */
00271       case_flags[i] = flagged(input[in - 1]);
00272     }
00273 
00274     memmove(output + i + 1, output + i, (out - i) * sizeof *output);
00275     output[i++] = n;
00276   }
00277 
00278   *output_length = out;
00279   return punycode_success;
00280 }