Back to index

glibc  2.9
idna.c
Go to the documentation of this file.
00001 /* idna.c     Convert to or from IDN strings.
00002  * Copyright (C) 2002, 2003, 2004  Simon Josefsson
00003  *
00004  * This file is part of GNU Libidn.
00005  *
00006  * GNU Libidn is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * GNU Libidn is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with GNU Libidn; if not, write to the Free Software
00018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019  *
00020  */
00021 
00022 #if HAVE_CONFIG_H
00023 # include "config.h"
00024 #endif
00025 
00026 #include <stdlib.h>
00027 #include <string.h>
00028 #include <stringprep.h>
00029 #include <punycode.h>
00030 
00031 #include "idna.h"
00032 
/* Nonzero if code point C is one of the label separators recognized in
   this file: U+002E, U+3002, U+FF0E or U+FF61.  C is evaluated several
   times, so it must not have side effects.  */
#define DOTP(c) \
  ((c) == 0x002E || (c) == 0x3002 || (c) == 0xFF0E || (c) == 0xFF61)
00035 
00036 /* Core functions */
00037 
00068 int
00069 idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags)
00070 {
00071   size_t len, outlen;
00072   uint32_t *src;            /* XXX don't need to copy data? */
00073   int rc;
00074 
00075   /*
00076    * ToASCII consists of the following steps:
00077    *
00078    * 1. If all code points in the sequence are in the ASCII range (0..7F)
00079    * then skip to step 3.
00080    */
00081 
00082   {
00083     size_t i;
00084     int inasciirange;
00085 
00086     inasciirange = 1;
00087     for (i = 0; i < inlen; i++)
00088       if (in[i] > 0x7F)
00089        inasciirange = 0;
00090     if (inasciirange)
00091       {
00092        src = malloc (sizeof (in[0]) * (inlen + 1));
00093        if (src == NULL)
00094          return IDNA_MALLOC_ERROR;
00095 
00096        memcpy (src, in, sizeof (in[0]) * inlen);
00097        src[inlen] = 0;
00098 
00099        goto step3;
00100       }
00101   }
00102 
00103   /*
00104    * 2. Perform the steps specified in [NAMEPREP] and fail if there is
00105    * an error. The AllowUnassigned flag is used in [NAMEPREP].
00106    */
00107 
00108   {
00109     char *p;
00110 
00111     p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
00112     if (p == NULL)
00113       return IDNA_MALLOC_ERROR;
00114 
00115     len = strlen (p);
00116     do
00117       {
00118        char *newp;
00119 
00120        len = 2 * len + 10;  /* XXX better guess? */
00121        newp = realloc (p, len);
00122        if (newp == NULL)
00123          {
00124            free (p);
00125            return IDNA_MALLOC_ERROR;
00126          }
00127        p = newp;
00128 
00129        if (flags & IDNA_ALLOW_UNASSIGNED)
00130          rc = stringprep_nameprep (p, len);
00131        else
00132          rc = stringprep_nameprep_no_unassigned (p, len);
00133       }
00134     while (rc == STRINGPREP_TOO_SMALL_BUFFER);
00135 
00136     if (rc != STRINGPREP_OK)
00137       {
00138        free (p);
00139        return IDNA_STRINGPREP_ERROR;
00140       }
00141 
00142     src = stringprep_utf8_to_ucs4 (p, -1, NULL);
00143 
00144     free (p);
00145   }
00146 
00147 step3:
00148   /*
00149    * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
00150    *
00151    * (a) Verify the absence of non-LDH ASCII code points; that is,
00152    * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
00153    *
00154    * (b) Verify the absence of leading and trailing hyphen-minus;
00155    * that is, the absence of U+002D at the beginning and end of
00156    * the sequence.
00157    */
00158 
00159   if (flags & IDNA_USE_STD3_ASCII_RULES)
00160     {
00161       size_t i;
00162 
00163       for (i = 0; src[i]; i++)
00164        if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
00165            (src[i] >= 0x3A && src[i] <= 0x40) ||
00166            (src[i] >= 0x5B && src[i] <= 0x60) ||
00167            (src[i] >= 0x7B && src[i] <= 0x7F))
00168          {
00169            free (src);
00170            return IDNA_CONTAINS_NON_LDH;
00171          }
00172 
00173       if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
00174        {
00175          free (src);
00176          return IDNA_CONTAINS_MINUS;
00177        }
00178     }
00179 
00180   /*
00181    * 4. If all code points in the sequence are in the ASCII range
00182    * (0..7F), then skip to step 8.
00183    */
00184 
00185   {
00186     size_t i;
00187     int inasciirange;
00188 
00189     inasciirange = 1;
00190     for (i = 0; src[i]; i++)
00191       {
00192        if (src[i] > 0x7F)
00193          inasciirange = 0;
00194        /* copy string to output buffer if we are about to skip to step8 */
00195        if (i < 64)
00196          out[i] = src[i];
00197       }
00198     if (i < 64)
00199       out[i] = '\0';
00200     if (inasciirange)
00201       goto step8;
00202   }
00203 
00204   /*
00205    * 5. Verify that the sequence does NOT begin with the ACE prefix.
00206    *
00207    */
00208 
00209   {
00210     size_t i;
00211     int match;
00212 
00213     match = 1;
00214     for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
00215       if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
00216        match = 0;
00217     if (match)
00218       {
00219        free (src);
00220        return IDNA_CONTAINS_ACE_PREFIX;
00221       }
00222   }
00223 
00224   /*
00225    * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
00226    * and fail if there is an error.
00227    */
00228   for (len = 0; src[len]; len++)
00229     ;
00230   src[len] = '\0';
00231   outlen = 63 - strlen (IDNA_ACE_PREFIX);
00232   rc = punycode_encode (len, src, NULL,
00233                      &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
00234   if (rc != PUNYCODE_SUCCESS)
00235     {
00236       free (src);
00237       return IDNA_PUNYCODE_ERROR;
00238     }
00239   out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
00240 
00241   /*
00242    * 7. Prepend the ACE prefix.
00243    */
00244 
00245   memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
00246 
00247   /*
00248    * 8. Verify that the number of code points is in the range 1 to 63
00249    * inclusive (0 is excluded).
00250    */
00251 
00252 step8:
00253   free (src);
00254   if (strlen (out) < 1 || strlen (out) > 63)
00255     return IDNA_INVALID_LENGTH;
00256 
00257   return IDNA_SUCCESS;
00258 }
00259 
00260 /* ToUnicode().  May realloc() utf8in. */
00261 static int
00262 idna_to_unicode_internal (char *utf8in,
00263                        uint32_t * out, size_t * outlen, int flags)
00264 {
00265   int rc;
00266   char tmpout[64];
00267   size_t utf8len = strlen (utf8in) + 1;
00268   size_t addlen = 0;
00269 
00270   /*
00271    * ToUnicode consists of the following steps:
00272    *
00273    * 1. If the sequence contains any code points outside the ASCII range
00274    * (0..7F) then proceed to step 2, otherwise skip to step 3.
00275    */
00276 
00277   {
00278     size_t i;
00279     int inasciirange;
00280 
00281     inasciirange = 1;
00282     for (i = 0; utf8in[i]; i++)
00283       if (utf8in[i] & ~0x7F)
00284        inasciirange = 0;
00285     if (inasciirange)
00286       goto step3;
00287   }
00288 
00289   /*
00290    * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
00291    * error. (If step 3 of ToASCII is also performed here, it will not
00292    * affect the overall behavior of ToUnicode, but it is not
00293    * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
00294    */
00295   do
00296     {
00297       char *newp = realloc (utf8in, utf8len + addlen);
00298       if (newp == NULL)
00299        {
00300          free (utf8in);
00301          return IDNA_MALLOC_ERROR;
00302        }
00303       utf8in = newp;
00304       if (flags & IDNA_ALLOW_UNASSIGNED)
00305        rc = stringprep_nameprep (utf8in, utf8len + addlen);
00306       else
00307        rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen);
00308       addlen += 1;
00309     }
00310   while (rc == STRINGPREP_TOO_SMALL_BUFFER);
00311 
00312   if (rc != STRINGPREP_OK)
00313     {
00314       free (utf8in);
00315       return IDNA_STRINGPREP_ERROR;
00316     }
00317 
00318   /* 3. Verify that the sequence begins with the ACE prefix, and save a
00319    * copy of the sequence.
00320    */
00321 
00322 step3:
00323   if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0)
00324     {
00325       free (utf8in);
00326       return IDNA_NO_ACE_PREFIX;
00327     }
00328 
00329   /* 4. Remove the ACE prefix.
00330    */
00331 
00332   memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
00333           strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
00334 
00335   /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
00336    * and fail if there is an error. Save a copy of the result of
00337    * this step.
00338    */
00339 
00340   (*outlen)--;                     /* reserve one for the zero */
00341 
00342   rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
00343   if (rc != PUNYCODE_SUCCESS)
00344     {
00345       free (utf8in);
00346       return IDNA_PUNYCODE_ERROR;
00347     }
00348 
00349   out[*outlen] = 0;         /* add zero */
00350 
00351   /* 6. Apply ToASCII.
00352    */
00353 
00354   rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
00355   if (rc != IDNA_SUCCESS)
00356     {
00357       free (utf8in);
00358       return rc;
00359     }
00360 
00361   /* 7. Verify that the result of step 6 matches the saved copy from
00362    * step 3, using a case-insensitive ASCII comparison.
00363    */
00364 
00365   if (strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
00366     {
00367       free (utf8in);
00368       return IDNA_ROUNDTRIP_VERIFY_ERROR;
00369     }
00370 
00371   /* 8. Return the saved copy from step 5.
00372    */
00373 
00374   free (utf8in);
00375   return IDNA_SUCCESS;
00376 }
00377 
00413 int
00414 idna_to_unicode_44i (const uint32_t * in, size_t inlen,
00415                    uint32_t * out, size_t * outlen, int flags)
00416 {
00417   int rc;
00418   size_t outlensave = *outlen;
00419   char *p;
00420 
00421   p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
00422   if (p == NULL)
00423     return IDNA_MALLOC_ERROR;
00424 
00425   rc = idna_to_unicode_internal (p, out, outlen, flags);
00426   if (rc != IDNA_SUCCESS)
00427     {
00428       memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
00429                                     inlen : outlensave));
00430       *outlen = inlen;
00431     }
00432 
00433   /* p is freed in idna_to_unicode_internal.  */
00434 
00435   return rc;
00436 }
00437 
00438 /* Wrappers that handle several labels */
00439 
00452 int
00453 idna_to_ascii_4z (const uint32_t * input, char **output, int flags)
00454 {
00455   const uint32_t *start = input;
00456   const uint32_t *end = input;
00457   char buf[64];
00458   char *out = NULL;
00459   int rc;
00460 
00461   /* 1) Whenever dots are used as label separators, the following
00462      characters MUST be recognized as dots: U+002E (full stop),
00463      U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
00464      U+FF61 (halfwidth ideographic full stop). */
00465 
00466   if (input[0] == 0)
00467     {
00468       /* Handle implicit zero-length root label. */
00469       *output = malloc (1);
00470       if (!*output)
00471        return IDNA_MALLOC_ERROR;
00472       strcpy (*output, "");
00473       return IDNA_SUCCESS;
00474     }
00475 
00476   if (DOTP (input[0]) && input[1] == 0)
00477     {
00478       /* Handle explicit zero-length root label. */
00479       *output = malloc (2);
00480       if (!*output)
00481        return IDNA_MALLOC_ERROR;
00482       strcpy (*output, ".");
00483       return IDNA_SUCCESS;
00484     }
00485 
00486   *output = NULL;
00487   do
00488     {
00489       end = start;
00490 
00491       for (; *end && !DOTP (*end); end++)
00492        ;
00493 
00494       if (*end == '\0' && start == end)
00495        {
00496          /* Handle explicit zero-length root label. */
00497          buf[0] = '\0';
00498        }
00499       else
00500        {
00501          rc = idna_to_ascii_4i (start, end - start, buf, flags);
00502          if (rc != IDNA_SUCCESS)
00503            return rc;
00504        }
00505 
00506       if (out)
00507        {
00508          char *newp = realloc (out, strlen (out) + 1 + strlen (buf) + 1);
00509          if (!newp)
00510            {
00511              free (out);
00512              return IDNA_MALLOC_ERROR;
00513            }
00514          out = newp;
00515          strcat (out, ".");
00516          strcat (out, buf);
00517        }
00518       else
00519        {
00520          out = (char *) malloc (strlen (buf) + 1);
00521          if (!out)
00522            return IDNA_MALLOC_ERROR;
00523          strcpy (out, buf);
00524        }
00525 
00526       start = end + 1;
00527     }
00528   while (*end);
00529 
00530   *output = out;
00531 
00532   return IDNA_SUCCESS;
00533 }
00534 
00547 int
00548 idna_to_ascii_8z (const char *input, char **output, int flags)
00549 {
00550   uint32_t *ucs4;
00551   size_t ucs4len;
00552   int rc;
00553 
00554   ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
00555   if (!ucs4)
00556     return IDNA_ICONV_ERROR;
00557 
00558   rc = idna_to_ascii_4z (ucs4, output, flags);
00559 
00560   free (ucs4);
00561 
00562   return rc;
00563 
00564 }
00565 
00578 int
00579 idna_to_ascii_lz (const char *input, char **output, int flags)
00580 {
00581   char *utf8;
00582   int rc;
00583 
00584   utf8 = stringprep_locale_to_utf8 (input);
00585   if (!utf8)
00586     return IDNA_ICONV_ERROR;
00587 
00588   rc = idna_to_ascii_8z (utf8, output, flags);
00589 
00590   free (utf8);
00591 
00592   return rc;
00593 }
00594 
00608 int
00609 idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags)
00610 {
00611   const uint32_t *start = input;
00612   const uint32_t *end = input;
00613   uint32_t *buf;
00614   size_t buflen;
00615   uint32_t *out = NULL;
00616   size_t outlen = 0;
00617   int rc;
00618 
00619   *output = NULL;
00620 
00621   do
00622     {
00623       end = start;
00624 
00625       for (; *end && !DOTP (*end); end++)
00626        ;
00627 
00628       buflen = end - start;
00629       buf = malloc (sizeof (buf[0]) * (buflen + 1));
00630       if (!buf)
00631        return IDNA_MALLOC_ERROR;
00632 
00633       rc = idna_to_unicode_44i (start, end - start, buf, &buflen, flags);
00634       /* don't check rc as per specification! */
00635 
00636       if (out)
00637        {
00638          uint32_t *newp = realloc (out,
00639                                 sizeof (out[0])
00640                                 * (outlen + 1 + buflen + 1));
00641          if (!newp)
00642            {
00643              free (buf);
00644              free (out);
00645              return IDNA_MALLOC_ERROR;
00646            }
00647          out = newp;
00648          out[outlen++] = 0x002E;   /* '.' (full stop) */
00649          memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
00650          outlen += buflen;
00651          out[outlen] = 0x0;
00652          free (buf);
00653        }
00654       else
00655        {
00656          out = buf;
00657          outlen = buflen;
00658          out[outlen] = 0x0;
00659        }
00660 
00661       start = end + 1;
00662     }
00663   while (*end);
00664 
00665   *output = out;
00666 
00667   return IDNA_SUCCESS;
00668 }
00669 
00683 int
00684 idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags)
00685 {
00686   uint32_t *ucs4;
00687   size_t ucs4len;
00688   int rc;
00689 
00690   ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
00691   if (!ucs4)
00692     return IDNA_ICONV_ERROR;
00693 
00694   rc = idna_to_unicode_4z4z (ucs4, output, flags);
00695   free (ucs4);
00696 
00697   return rc;
00698 }
00699 
00713 int
00714 idna_to_unicode_8z8z (const char *input, char **output, int flags)
00715 {
00716   uint32_t *ucs4;
00717   int rc;
00718 
00719   rc = idna_to_unicode_8z4z (input, &ucs4, flags);
00720   *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
00721   free (ucs4);
00722 
00723   if (!*output)
00724     return IDNA_ICONV_ERROR;
00725 
00726   return rc;
00727 }
00728 
00743 int
00744 idna_to_unicode_8zlz (const char *input, char **output, int flags)
00745 {
00746   char *utf8;
00747   int rc;
00748 
00749   rc = idna_to_unicode_8z8z (input, &utf8, flags);
00750   *output = stringprep_utf8_to_locale (utf8);
00751   free (utf8);
00752 
00753   if (!*output)
00754     return IDNA_ICONV_ERROR;
00755 
00756   return rc;
00757 }
00758 
00774 int
00775 idna_to_unicode_lzlz (const char *input, char **output, int flags)
00776 {
00777   char *utf8;
00778   int rc;
00779 
00780   utf8 = stringprep_locale_to_utf8 (input);
00781   if (!utf8)
00782     return IDNA_ICONV_ERROR;
00783 
00784   rc = idna_to_unicode_8zlz (utf8, output, flags);
00785   free (utf8);
00786 
00787   return rc;
00788 }
00789