Back to index

glibc  2.9
tst-rxspencer.c
Go to the documentation of this file.
00001 /* Regular expression tests.
00002    Copyright (C) 2003, 2005 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 #include <sys/types.h>
00022 #include <mcheck.h>
00023 #include <regex.h>
00024 #include <stdio.h>
00025 #include <stdlib.h>
00026 #include <string.h>
00027 #include <locale.h>
00028 #include <getopt.h>
00029 
/* Expand the test-file escape letters in STR in place: 'N' becomes a
   newline, 'T' a tab, 'S' a space and 'Z' a NUL byte.  Scanning resumes
   immediately after each replaced character.  */
static void
replace_special_chars (char *str)
{
  char *hit = strpbrk (str, "NTSZ");

  while (hit != NULL)
    {
      if (*hit == 'N')
        *hit = '\n';
      else if (*hit == 'T')
        *hit = '\t';
      else if (*hit == 'S')
        *hit = ' ';
      else if (*hit == 'Z')
        *hit = '\0';

      /* Look for the next escape letter after the one just handled.  */
      hit = strpbrk (hit + 1, "NTSZ");
    }
}
00042 
/* Rewrite the word-boundary bracket classes in STR in place: every
   "[[:<:]]" becomes "\<" and every "[[:>:]]" becomes "\>".  Any other
   text starting with "[[:" is left untouched.  The string only ever
   shrinks, so no extra room is needed.  */
static void
glibc_re_syntax (char *str)
{
  /* One past the terminating NUL, so the tail moves include the NUL.  */
  char *tail = str + strlen (str) + 1;
  char *cur = str;

  while ((cur = strstr (cur, "[[:")) != NULL)
    {
      int is_boundary = ((cur[3] == '<' || cur[3] == '>')
                         && strncmp (cur + 4, ":]]", 3) == 0);

      if (!is_boundary)
        {
          /* Some other class; step over the "[[:" prefix.  */
          cur += 3;
          continue;
        }

      /* Seven bytes collapse into two: backslash plus '<' or '>'.  */
      cur[0] = '\\';
      cur[1] = cur[3];
      memmove (cur + 2, cur + 7, tail - (cur + 7));
      tail -= 5;
      cur += 2;
    }
}
00061 
/* Store at DST the two-byte UTF-8 sequence that stands in for the ASCII
   letter C and return the position just past what was written.  The
   mapping is
     a/A -> a with acute (U+00E1 / U+00C1)
     b/B -> c with caron (U+010D / U+010C)
     c/C -> d with caron (U+010F / U+010E)
     d/D -> e with acute (U+00E9 / U+00C9).
   For any other C nothing is written and DST is returned unchanged.  */
static char *
mb_replace (char *dst, const char c)
{
  static const struct
  {
    char ascii;
    char utf8[2];
  } map[] =
    {
      { 'a', { '\xc3', '\xa1' } },
      { 'A', { '\xc3', '\x81' } },
      { 'b', { '\xc4', '\x8d' } },
      { 'B', { '\xc4', '\x8c' } },
      { 'c', { '\xc4', '\x8f' } },
      { 'C', { '\xc4', '\x8e' } },
      { 'd', { '\xc3', '\xa9' } },
      { 'D', { '\xc3', '\x89' } }
    };
  size_t k;

  for (k = 0; k < sizeof (map) / sizeof (map[0]); ++k)
    if (map[k].ascii == c)
      {
        *dst++ = map[k].utf8[0];
        *dst++ = map[k].utf8[1];
        break;
      }

  return dst;
}
00106 
/* Return a newly malloc'ed copy of STR in which every character that
   occurs in LETTERS has been passed through mb_replace; all other
   characters are copied verbatim.  Returns NULL when STR is NULL or
   when the allocation fails.  The caller frees the result.  */
static char *
mb_frob_string (const char *str, const char *letters)
{
  char *out, *wp;
  size_t i;

  if (str == NULL)
    return NULL;

  /* Every input byte turns into at most two output bytes.  */
  out = malloc (2 * strlen (str) + 1);
  if (out == NULL)
    return NULL;

  wp = out;
  for (i = 0; str[i] != '\0'; ++i)
    {
      const char ch = str[i];

      if (strchr (letters, ch) != NULL)
        wp = mb_replace (wp, ch);
      else
        *wp++ = ch;
    }
  *wp = '\0';

  return out;
}
00128 
/* Pattern-aware variant of mb_frob_string.  Characters from LETTERS are
   passed through mb_replace, except
     - a character that directly follows an unescaped backslash, and
     - anything between "[:" and ":]", "[." and ".]" or "[=" and "=]".
   Returns a newly malloc'ed string that the caller frees, or NULL when
   STR is NULL or the allocation fails.  */

static char *
mb_frob_pattern (const char *str, const char *letters)
{
  char *out, *wp;
  size_t i;
  int in_class = 0;     /* Inside a [: :], [. .] or [= =] construct.  */
  int escaped = 0;      /* Previous character was an unescaped backslash.  */

  if (str == NULL)
    return NULL;

  /* Every input byte turns into at most two output bytes.  */
  out = malloc (2 * strlen (str) + 1);
  if (out == NULL)
    return NULL;

  wp = out;
  for (i = 0; str[i] != '\0'; ++i)
    {
      const char ch = str[i];

      if (ch == '\\')
        {
          /* A backslash toggles the state, so "\\\\" escapes itself.  */
          escaped = !escaped;
          *wp++ = ch;
          continue;
        }

      if (escaped)
        {
          /* Escaped character: copy verbatim, never replace.  */
          escaped = 0;
          *wp++ = ch;
          continue;
        }

      if (!in_class && strchr (letters, ch) != NULL)
        {
          wp = mb_replace (wp, ch);
          continue;
        }

      if (in_class)
        {
          /* IN_CLASS can only be set after an earlier '[', so I is at
             least 1 here and str[i - 1] is a valid access.  */
          if (ch == ']' && strchr (":.=", str[i - 1]) != NULL)
            in_class = 0;
        }
      else if (ch == '[' && strchr (":.=", str[i + 1]) != NULL)
        /* strchr also matches the terminating NUL, so a trailing '['
           sets the flag too; the loop ends right after, so this has no
           visible effect.  */
        in_class = 1;

      *wp++ = ch;
    }
  *wp = '\0';

  return out;
}
00172 
/* Check sub-match IDX of RM, obtained by matching against STRING,
   against the expectation MATCH:
     "-"      the sub-expression must not have matched;
     "@TEXT"  the match must be empty and STRING must continue with TEXT
              at that position (a bare "@" requires the position to be
              the end of STRING);
     other    the matched substring must equal MATCH exactly.
   Returns 0 when the expectation holds.  Otherwise prints a diagnostic
   prefixed with FAIL and returns 1.  */
static int
check_match (regmatch_t *rm, int idx, const char *string,
             const char *match, const char *fail)
{
  const regmatch_t *m = &rm[idx];
  regoff_t span;

  if (strcmp (match, "-") == 0)
    {
      if (m->rm_so == -1 && m->rm_eo == -1)
        return 0;
      printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
      return 1;
    }

  if (m->rm_so == -1 || m->rm_eo == -1)
    {
      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
      return 1;
    }

  if (match[0] == '@')
    {
      const char *follow = match + 1;
      size_t cmp_len = strlen (follow);

      /* With no text after '@', compare one byte against the NUL so
         that only the end of STRING qualifies.  */
      if (cmp_len == 0)
        cmp_len = 1;

      if (m->rm_so != m->rm_eo)
        {
          printf ("%s rm[%d] not empty\n", fail, idx);
          return 1;
        }

      if (strncmp (string + m->rm_so, follow, cmp_len) != 0)
        {
          printf ("%s rm[%d] not matching %s\n", fail, idx, match);
          return 1;
        }
      return 0;
    }

  span = m->rm_eo - m->rm_so;
  if ((size_t) span == strlen (match)
      && strncmp (string + m->rm_so, match, span) == 0)
    return 0;

  printf ("%s rm[%d] not matching %s\n", fail, idx, match);
  return 1;
}
00217 
/* Run one test case: compile PATTERN with CFLAGS and, unless this is a
   compile-error test, execute it on STRING with EFLAGS.

   EFLAGS == -1 marks a compile-error test.  STRING then holds the name
   of the expected regcomp error without its "REG_" prefix (for example
   "EBRACK"); "EMPTY" is tolerated even when compilation succeeds.

   Otherwise EXPECT describes the overall match (NULL means the pattern
   must not match) and MATCHES, if not NULL, is a comma separated list
   of expectations for sub-matches 1 to 9; entries missing from the list
   are treated as "-".  Each expectation uses the syntax understood by
   check_match.  MATCHES is split in place while it is examined, but
   every comma is put back afterwards.  With REG_NOSUB in CFLAGS no
   match positions are checked.

   Returns 0 on success; on failure prints a diagnostic prefixed with
   FAIL and returns 1.  */
static int
test (const char *pattern, int cflags, const char *string, int eflags,
      char *expect, char *matches, const char *fail)
{
  static const struct
  {
    reg_errcode_t code;
    const char *name;
  } codes[] =
    {
      { REG_NOERROR, "NOERROR" }, { REG_NOMATCH, "NOMATCH" },
      { REG_BADPAT, "BADPAT" }, { REG_ECOLLATE, "ECOLLATE" },
      { REG_ECTYPE, "ECTYPE" }, { REG_EESCAPE, "EESCAPE" },
      { REG_ESUBREG, "ESUBREG" }, { REG_EBRACK, "EBRACK" },
      { REG_EPAREN, "EPAREN" }, { REG_EBRACE, "EBRACE" },
      { REG_BADBR, "BADBR" }, { REG_ERANGE, "ERANGE" },
      { REG_ESPACE, "ESPACE" }, { REG_BADRPT, "BADRPT" }
    };
  regex_t re;
  regmatch_t rm[10];
  int rc, ret;
  size_t k;

  rc = regcomp (&re, pattern, cflags);
  if (rc != 0)
    {
      char buf[500];

      if (eflags != -1)
        {
          /* Compilation was supposed to work.  */
          regerror (rc, &re, buf, sizeof (buf));
          printf ("%s regcomp failed: %s\n", fail, buf);
          return 1;
        }

      /* A failure was expected; check that it is the right one.  */
      for (k = 0; k < sizeof (codes) / sizeof (codes[0]); ++k)
        {
          if (rc != codes[k].code)
            continue;
          if (strcmp (string, codes[k].name) == 0)
            return 0;
          printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
                  fail, codes[k].name, string);
          return 1;
        }

      printf ("%s regcomp return value REG_%d\n", fail, rc);
      return 1;
    }

  if (eflags == -1)
    {
      regfree (&re);

      /* The test case file relies on the rxspencer implementation
         returning REG_EMPTY for empty expressions.  This implementation
         accepts them, so that particular expectation is ignored.  */
      if (strcmp (string, "EMPTY") == 0)
        return 0;

      printf ("%s regcomp unexpectedly succeeded\n", fail);
      return 1;
    }

  /* The compiled pattern is not needed once RM has been filled in.  */
  rc = regexec (&re, string, 10, rm, eflags);
  regfree (&re);

  if (rc != 0)
    {
      if (expect == NULL)
        return 0;
      printf ("%s regexec failed\n", fail);
      return 1;
    }

  if (expect == NULL)
    {
      printf ("%s regexec unexpectedly succeeded\n", fail);
      return 1;
    }

  if (cflags & REG_NOSUB)
    return 0;

  ret = check_match (rm, 0, string, expect, fail);
  if (matches == NULL)
    return ret;

  for (rc = 1; rc < 10 && ret == 0; ++rc)
    {
      /* Temporarily cut the list at the next comma so that the current
         entry is a string of its own.  */
      char *comma = matches != NULL ? strchr (matches, ',') : NULL;

      if (comma != NULL)
        *comma = '\0';

      ret = check_match (rm, rc, string, matches != NULL ? matches : "-",
                         fail);

      if (comma != NULL)
        {
          *comma = ',';
          matches = comma + 1;
        }
      else
        /* List exhausted; remaining sub-matches must be unset.  */
        matches = NULL;
    }

  return ret;
}
00321 
/* Run one test case after rewriting the characters listed in LETTERS
   into multi-byte UTF-8 sequences: PATTERN goes through
   mb_frob_pattern, and STRING, EXPECT and MATCHES go through
   mb_frob_string.  The converted copies are handed to test and freed
   afterwards.  FAIL is the prefix used for diagnostics.

   Returns what test returns, or 1 after printing FAIL and the current
   errno message when one of the conversions could not be allocated.  */
static int
mb_test (const char *pattern, int cflags, const char *string, int eflags,
         char *expect, const char *matches, const char *letters,
         const char *fail)
{
  char *pattern_mb = mb_frob_pattern (pattern, letters);
  /* In a compile-error test (EFLAGS == -1) STRING is the name of the
     expected error code rather than subject text, so it is used as
     is.  */
  const char *string_mb
    = eflags == -1 ? string : mb_frob_string (string, letters);
  char *expect_mb = mb_frob_string (expect, letters);
  char *matches_mb = mb_frob_string (matches, letters);
  int ret = 0;

  /* EXPECT and MATCHES may legitimately be NULL, in which case their
     converted copies are NULL too; only a NULL result for a non-NULL
     input is a failure.  */
  if (!pattern_mb || !string_mb
      || (expect && !expect_mb) || (matches && !matches_mb))
    {
      /* End the line like every other diagnostic, so the message does
         not run into the next test line echoed by the caller.  */
      printf ("%s %m\n", fail);
      ret = 1;
    }
  else
    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
                matches_mb, fail);

  free (matches_mb);
  free (expect_mb);
  /* STRING_MB aliases the caller's STRING in compile-error tests.  */
  if (string_mb != string)
    free ((char *) string_mb);
  free (pattern_mb);
  return ret;
}
00351 
/* Run the multi-byte variants of one test case.  Each of the 15
   non-empty subsets of the letter pairs a/A, b/B, c/C and d/D is
   considered in turn.  A subset is skipped when one of its pairs occurs
   in neither PATTERN nor STRING; otherwise the test is repeated through
   mb_test with exactly those letters replaced.  Returns the bitwise OR
   of all mb_test results, so 0 means every variant passed.  */
static int
mb_tests (const char *pattern, int cflags, const char *string, int eflags,
          char *expect, const char *matches)
{
  /* Lower/upper case pairs, indexed by bit number of the subset mask.  */
  static const char pairs[] = "aAbBcCdD";
  char letters[9], fail[20];
  int ret = 0;
  int mask;

  /* These tests cannot work with xdigit: a-d and A-D are hexadecimal
     digits, their accented replacements are not.  */
  if (strstr (pattern, "[:xdigit:]") != NULL)
    return 0;

  /* XXX: regex ATM handles only single byte equivalence classes.  */
  if (strstr (pattern, "[[=b=]]") != NULL)
    return 0;

  for (mask = 1; mask < 16; ++mask)
    {
      char *out = letters;
      int bit;

      for (bit = 0; bit < 4; ++bit)
        {
          char lower, upper;

          if ((mask & (1 << bit)) == 0)
            continue;

          lower = pairs[2 * bit];
          upper = pairs[2 * bit + 1];

          /* A selected pair that appears nowhere makes this subset
             pointless; give up on it.  */
          if (strchr (pattern, lower) == NULL
              && strchr (string, lower) == NULL
              && strchr (pattern, upper) == NULL
              && strchr (string, upper) == NULL)
            break;

          *out++ = lower;
          *out++ = upper;
        }

      /* Inner loop left early: some selected pair is unused.  */
      if (bit < 4)
        continue;

      *out = '\0';
      sprintf (fail, "UTF-8 %s FAIL", letters);
      ret |= mb_test (pattern, cflags, string, eflags, expect, matches,
                      letters, fail);
    }

  return ret;
}
00407 
/* Test driver.  Usage: PROGRAM [--utf8] TESTFILE.

   Every line of TESTFILE that is neither empty nor starts with '#' is
   echoed and then split at tabs into
     PATTERN  FLAGS  STRING  [EXPECT  [MATCHES]]
   where a PATTERN or STRING field of exactly "" stands for the empty
   string.  FLAGS is a string of single-character options:
     -  ignored                        b  basic instead of extended syntax
     &  try both extended and basic    C  regcomp is expected to fail
     i  REG_ICASE                      s  REG_NOSUB
     n  REG_NEWLINE                    ^  REG_NOTBOL
     $  REG_NOTEOL                     m, p, #  unsupported, line skipped
   Each case runs in the C locale.  With --utf8, a case that passed is
   repeated in the cs_CZ.UTF-8 locale, first unchanged and then through
   mb_tests.  Returns 0 when everything passed and 1 otherwise.  */
int
main (int argc, char **argv)
{
  static int test_utf8 = 0;
  static const struct option options[] =
    {
      {"utf8", no_argument, &test_utf8, 1},
      {NULL, 0, NULL, 0 }
    };
  int status = 0;
  char *line = NULL;
  size_t line_cap = 0;
  ssize_t nread;
  FILE *input;

  mtrace ();

  /* The only option merely sets a flag; nothing to do per option.  */
  while (getopt_long (argc, argv, "", options, NULL) >= 0)
    continue;

  if (argc != optind + 1)
    {
      fprintf (stderr, "Missing test filename\n");
      return 1;
    }

  input = fopen (argv[optind], "r");
  if (input == NULL)
    {
      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
      return 1;
    }

  while ((nread = getline (&line, &line_cap, input)) > 0)
    {
      char *pattern, *flagstr, *string, *expect, *matches;
      const char *flag;
      int cflags = REG_EXTENDED, eflags = 0, try_bre_ere = 0;
      int unsupported = 0;
      int bre_cflags;

      /* Drop the trailing newline, if any.  */
      if (line[nread - 1] == '\n')
        line[nread - 1] = '\0';

      /* Skip comments and empty lines.  */
      if (*line == '#' || *line == '\0')
        continue;

      puts (line);
      fflush (stdout);

      pattern = strtok (line, "\t");
      if (pattern == NULL)
        continue;
      if (strcmp (pattern, "\"\"") == 0)
        pattern += 2;

      flagstr = strtok (NULL, "\t");
      if (flagstr == NULL)
        continue;

      string = strtok (NULL, "\t");
      if (string == NULL)
        continue;
      if (strcmp (string, "\"\"") == 0)
        string += 2;

      for (flag = flagstr; *flag != '\0'; ++flag)
        switch (*flag)
          {
          case 'b':
            cflags &= ~REG_EXTENDED;
            break;
          case '&':
            try_bre_ere = 1;
            break;
          case 'C':
            eflags = -1;
            break;
          case 'i':
            cflags |= REG_ICASE;
            break;
          case 's':
            cflags |= REG_NOSUB;
            break;
          case 'n':
            cflags |= REG_NEWLINE;
            break;
          case '^':
            eflags |= REG_NOTBOL;
            break;
          case '$':
            eflags |= REG_NOTEOL;
            break;
          case 'm':
          case 'p':
          case '#':
            /* Not supported.  */
            unsupported = 1;
            break;
          case '-':
          default:
            break;
          }

      if (unsupported)
        continue;

      replace_special_chars (pattern);
      glibc_re_syntax (pattern);
      /* In a compile-error test STRING is an error name, not text.  */
      if (eflags != -1)
        replace_special_chars (string);

      expect = strtok (NULL, "\t");
      matches = NULL;
      if (expect != NULL)
        {
          replace_special_chars (expect);
          matches = strtok (NULL, "\t");
          if (matches != NULL)
            replace_special_chars (matches);
        }

      if (setlocale (LC_ALL, "C") == NULL)
        {
          puts ("setlocale C failed");
          status = 1;
        }

      bre_cflags = cflags & ~REG_EXTENDED;

      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags, expect,
                       matches, "FAIL")))
        {
          status = 1;
          continue;
        }

      if (!test_utf8)
        continue;

      if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
        {
          puts ("setlocale cs_CZ.UTF-8 failed");
          status = 1;
          continue;
        }

      if (test (pattern, cflags, string, eflags, expect, matches,
                "UTF-8 FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags, expect,
                       matches, "UTF-8 FAIL")))
        {
          status = 1;
          continue;
        }

      if (mb_tests (pattern, cflags, string, eflags, expect, matches)
          || (try_bre_ere
              && mb_tests (pattern, bre_cflags, string, eflags, expect,
                           matches)))
        status = 1;
    }

  free (line);
  fclose (input);
  return status;
}