Back to index

php5  5.3.10
pcredemo.c
Go to the documentation of this file.
00001 /*************************************************
00002 *           PCRE DEMONSTRATION PROGRAM           *
00003 *************************************************/
00004 
00005 /* This is a demonstration program to illustrate the most straightforward ways
00006 of calling the PCRE regular expression library from a C program. See the
00007 pcresample documentation for a short discussion ("man pcresample" if you have
00008 the PCRE man pages installed).
00009 
00010 In Unix-like environments, if PCRE is installed in your standard system
00011 libraries, you should be able to compile this program using this command:
00012 
00013 gcc -Wall pcredemo.c -lpcre -o pcredemo
00014 
00015 If PCRE is not installed in a standard place, it is likely to be installed with
00016 support for the pkg-config mechanism. If you have pkg-config, you can compile
00017 this program using this command:
00018 
00019 gcc -Wall pcredemo.c `pkg-config --cflags --libs libpcre` -o pcredemo
00020 
00021 If you do not have pkg-config, you may have to use this:
00022 
00023 gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \
00024   -R/usr/local/lib -lpcre -o pcredemo
00025 
00026 Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
00027 library files for PCRE are installed on your system. Only some operating
00028 systems (e.g. Solaris) use the -R option.
00029 
00030 Building under Windows:
00031 
00032 If you want to statically link this program against a non-dll .a file, you must
00033 define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and
00034 pcre_free() exported functions will be declared __declspec(dllimport), with
00035 unwanted results. So in this environment, uncomment the following line. */
00036 
00037 /* #define PCRE_STATIC */
00038 
00039 #include <stdio.h>
00040 #include <string.h>
00041 #include <pcre.h>
00042 
00043 #define OVECCOUNT 30    /* should be a multiple of 3 */
00044 
00045 
00046 int main(int argc, char **argv)
00047 {
00048 pcre *re;
00049 const char *error;
00050 char *pattern;
00051 char *subject;
00052 unsigned char *name_table;
00053 unsigned int option_bits;
00054 int erroffset;
00055 int find_all;
00056 int crlf_is_newline;
00057 int namecount;
00058 int name_entry_size;
00059 int ovector[OVECCOUNT];
00060 int subject_length;
00061 int rc, i;
00062 int utf8;
00063 
00064 
00065 /**************************************************************************
00066 * First, sort out the command line. There is only one possible option at  *
00067 * the moment, "-g" to request repeated matching to find all occurrences,  *
00068 * like Perl's /g option. We set the variable find_all to a non-zero value *
00069 * if the -g option is present. Apart from that, there must be exactly two *
00070 * arguments.                                                              *
00071 **************************************************************************/
00072 
00073 find_all = 0;
00074 for (i = 1; i < argc; i++)
00075   {
00076   if (strcmp(argv[i], "-g") == 0) find_all = 1;
00077     else break;
00078   }
00079 
00080 /* After the options, we require exactly two arguments, which are the pattern,
00081 and the subject string. */
00082 
00083 if (argc - i != 2)
00084   {
00085   printf("Two arguments required: a regex and a subject string\n");
00086   return 1;
00087   }
00088 
00089 pattern = argv[i];
00090 subject = argv[i+1];
00091 subject_length = (int)strlen(subject);
00092 
00093 
00094 /*************************************************************************
00095 * Now we are going to compile the regular expression pattern, and handle *
00096 * and errors that are detected.                                          *
00097 *************************************************************************/
00098 
00099 re = pcre_compile(
00100   pattern,              /* the pattern */
00101   0,                    /* default options */
00102   &error,               /* for error message */
00103   &erroffset,           /* for error offset */
00104   NULL);                /* use default character tables */
00105 
00106 /* Compilation failed: print the error message and exit */
00107 
00108 if (re == NULL)
00109   {
00110   printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
00111   return 1;
00112   }
00113 
00114 
00115 /*************************************************************************
00116 * If the compilation succeeded, we call PCRE again, in order to do a     *
00117 * pattern match against the subject string. This does just ONE match. If *
00118 * further matching is needed, it will be done below.                     *
00119 *************************************************************************/
00120 
00121 rc = pcre_exec(
00122   re,                   /* the compiled pattern */
00123   NULL,                 /* no extra data - we didn't study the pattern */
00124   subject,              /* the subject string */
00125   subject_length,       /* the length of the subject */
00126   0,                    /* start at offset 0 in the subject */
00127   0,                    /* default options */
00128   ovector,              /* output vector for substring information */
00129   OVECCOUNT);           /* number of elements in the output vector */
00130 
00131 /* Matching failed: handle error cases */
00132 
00133 if (rc < 0)
00134   {
00135   switch(rc)
00136     {
00137     case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
00138     /*
00139     Handle other special cases if you like
00140     */
00141     default: printf("Matching error %d\n", rc); break;
00142     }
00143   pcre_free(re);     /* Release memory used for the compiled pattern */
00144   return 1;
00145   }
00146 
00147 /* Match succeded */
00148 
00149 printf("\nMatch succeeded at offset %d\n", ovector[0]);
00150 
00151 
00152 /*************************************************************************
00153 * We have found the first match within the subject string. If the output *
00154 * vector wasn't big enough, say so. Then output any substrings that were *
00155 * captured.                                                              *
00156 *************************************************************************/
00157 
00158 /* The output vector wasn't big enough */
00159 
00160 if (rc == 0)
00161   {
00162   rc = OVECCOUNT/3;
00163   printf("ovector only has room for %d captured substrings\n", rc - 1);
00164   }
00165 
00166 /* Show substrings stored in the output vector by number. Obviously, in a real
00167 application you might want to do things other than print them. */
00168 
00169 for (i = 0; i < rc; i++)
00170   {
00171   char *substring_start = subject + ovector[2*i];
00172   int substring_length = ovector[2*i+1] - ovector[2*i];
00173   printf("%2d: %.*s\n", i, substring_length, substring_start);
00174   }
00175 
00176 
00177 /**************************************************************************
00178 * That concludes the basic part of this demonstration program. We have    *
00179 * compiled a pattern, and performed a single match. The code that follows *
00180 * shows first how to access named substrings, and then how to code for    *
00181 * repeated matches on the same subject.                                   *
00182 **************************************************************************/
00183 
00184 /* See if there are any named substrings, and if so, show them by name. First
00185 we have to extract the count of named parentheses from the pattern. */
00186 
00187 (void)pcre_fullinfo(
00188   re,                   /* the compiled pattern */
00189   NULL,                 /* no extra data - we didn't study the pattern */
00190   PCRE_INFO_NAMECOUNT,  /* number of named substrings */
00191   &namecount);          /* where to put the answer */
00192 
00193 if (namecount <= 0) printf("No named substrings\n"); else
00194   {
00195   unsigned char *tabptr;
00196   printf("Named substrings\n");
00197 
00198   /* Before we can access the substrings, we must extract the table for
00199   translating names to numbers, and the size of each entry in the table. */
00200 
00201   (void)pcre_fullinfo(
00202     re,                       /* the compiled pattern */
00203     NULL,                     /* no extra data - we didn't study the pattern */
00204     PCRE_INFO_NAMETABLE,      /* address of the table */
00205     &name_table);             /* where to put the answer */
00206 
00207   (void)pcre_fullinfo(
00208     re,                       /* the compiled pattern */
00209     NULL,                     /* no extra data - we didn't study the pattern */
00210     PCRE_INFO_NAMEENTRYSIZE,  /* size of each entry in the table */
00211     &name_entry_size);        /* where to put the answer */
00212 
00213   /* Now we can scan the table and, for each entry, print the number, the name,
00214   and the substring itself. */
00215 
00216   tabptr = name_table;
00217   for (i = 0; i < namecount; i++)
00218     {
00219     int n = (tabptr[0] << 8) | tabptr[1];
00220     printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
00221       ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
00222     tabptr += name_entry_size;
00223     }
00224   }
00225 
00226 
00227 /*************************************************************************
00228 * If the "-g" option was given on the command line, we want to continue  *
00229 * to search for additional matches in the subject string, in a similar   *
00230 * way to the /g option in Perl. This turns out to be trickier than you   *
00231 * might think because of the possibility of matching an empty string.    *
00232 * What happens is as follows:                                            *
00233 *                                                                        *
00234 * If the previous match was NOT for an empty string, we can just start   *
00235 * the next match at the end of the previous one.                         *
00236 *                                                                        *
00237 * If the previous match WAS for an empty string, we can't do that, as it *
00238 * would lead to an infinite loop. Instead, a special call of pcre_exec() *
00239 * is made with the PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED flags set.    *
00240 * The first of these tells PCRE that an empty string at the start of the *
00241 * subject is not a valid match; other possibilities must be tried. The   *
00242 * second flag restricts PCRE to one match attempt at the initial string  *
00243 * position. If this match succeeds, an alternative to the empty string   *
00244 * match has been found, and we can print it and proceed round the loop,  *
00245 * advancing by the length of whatever was found. If this match does not  *
00246 * succeed, we still stay in the loop, advancing by just one character.   *
00247 * In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be *
00248 * more than one byte.                                                    *
00249 *                                                                        *
00250 * However, there is a complication concerned with newlines. When the     *
00251 * newline convention is such that CRLF is a valid newline, we want must  *
00252 * advance by two characters rather than one. The newline convention can  *
00253 * be set in the regex by (*CR), etc.; if not, we must find the default.  *
00254 *************************************************************************/
00255 
00256 if (!find_all)     /* Check for -g */
00257   {
00258   pcre_free(re);   /* Release the memory used for the compiled pattern */
00259   return 0;        /* Finish unless -g was given */
00260   }
00261 
00262 /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
00263 sequence. First, find the options with which the regex was compiled; extract
00264 the UTF-8 state, and mask off all but the newline options. */
00265 
00266 (void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits);
00267 utf8 = option_bits & PCRE_UTF8;
00268 option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF|
00269                PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF;
00270 
00271 /* If no newline options were set, find the default newline convention from the
00272 build configuration. */
00273 
00274 if (option_bits == 0)
00275   {
00276   int d;
00277   (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
00278   /* Note that these values are always the ASCII ones, even in
00279   EBCDIC environments. CR = 13, NL = 10. */
00280   option_bits = (d == 13)? PCRE_NEWLINE_CR :
00281           (d == 10)? PCRE_NEWLINE_LF :
00282           (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF :
00283           (d == -2)? PCRE_NEWLINE_ANYCRLF :
00284           (d == -1)? PCRE_NEWLINE_ANY : 0;
00285   }
00286 
00287 /* See if CRLF is a valid newline sequence. */
00288 
00289 crlf_is_newline =
00290      option_bits == PCRE_NEWLINE_ANY ||
00291      option_bits == PCRE_NEWLINE_CRLF ||
00292      option_bits == PCRE_NEWLINE_ANYCRLF;
00293 
00294 /* Loop for second and subsequent matches */
00295 
00296 for (;;)
00297   {
00298   int options = 0;                 /* Normally no options */
00299   int start_offset = ovector[1];   /* Start at end of previous match */
00300 
00301   /* If the previous match was for an empty string, we are finished if we are
00302   at the end of the subject. Otherwise, arrange to run another match at the
00303   same point to see if a non-empty match can be found. */
00304 
00305   if (ovector[0] == ovector[1])
00306     {
00307     if (ovector[0] == subject_length) break;
00308     options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
00309     }
00310 
00311   /* Run the next matching operation */
00312 
00313   rc = pcre_exec(
00314     re,                   /* the compiled pattern */
00315     NULL,                 /* no extra data - we didn't study the pattern */
00316     subject,              /* the subject string */
00317     subject_length,       /* the length of the subject */
00318     start_offset,         /* starting offset in the subject */
00319     options,              /* options */
00320     ovector,              /* output vector for substring information */
00321     OVECCOUNT);           /* number of elements in the output vector */
00322 
00323   /* This time, a result of NOMATCH isn't an error. If the value in "options"
00324   is zero, it just means we have found all possible matches, so the loop ends.
00325   Otherwise, it means we have failed to find a non-empty-string match at a
00326   point where there was a previous empty-string match. In this case, we do what
00327   Perl does: advance the matching position by one character, and continue. We
00328   do this by setting the "end of previous match" offset, because that is picked
00329   up at the top of the loop as the point at which to start again.
00330 
00331   There are two complications: (a) When CRLF is a valid newline sequence, and
00332   the current position is just before it, advance by an extra byte. (b)
00333   Otherwise we must ensure that we skip an entire UTF-8 character if we are in
00334   UTF-8 mode. */
00335 
00336   if (rc == PCRE_ERROR_NOMATCH)
00337     {
00338     if (options == 0) break;                    /* All matches found */
00339     ovector[1] = start_offset + 1;              /* Advance one byte */
00340     if (crlf_is_newline &&                      /* If CRLF is newline & */
00341         start_offset < subject_length - 1 &&    /* we are at CRLF, */
00342         subject[start_offset] == '\r' &&
00343         subject[start_offset + 1] == '\n')
00344       ovector[1] += 1;                          /* Advance by one more. */
00345     else if (utf8)                              /* Otherwise, ensure we */
00346       {                                         /* advance a whole UTF-8 */
00347       while (ovector[1] < subject_length)       /* character. */
00348         {
00349         if ((subject[ovector[1]] & 0xc0) != 0x80) break;
00350         ovector[1] += 1;
00351         }
00352       }
00353     continue;    /* Go round the loop again */
00354     }
00355 
00356   /* Other matching errors are not recoverable. */
00357 
00358   if (rc < 0)
00359     {
00360     printf("Matching error %d\n", rc);
00361     pcre_free(re);    /* Release memory used for the compiled pattern */
00362     return 1;
00363     }
00364 
00365   /* Match succeded */
00366 
00367   printf("\nMatch succeeded again at offset %d\n", ovector[0]);
00368 
00369   /* The match succeeded, but the output vector wasn't big enough. */
00370 
00371   if (rc == 0)
00372     {
00373     rc = OVECCOUNT/3;
00374     printf("ovector only has room for %d captured substrings\n", rc - 1);
00375     }
00376 
00377   /* As before, show substrings stored in the output vector by number, and then
00378   also any named substrings. */
00379 
00380   for (i = 0; i < rc; i++)
00381     {
00382     char *substring_start = subject + ovector[2*i];
00383     int substring_length = ovector[2*i+1] - ovector[2*i];
00384     printf("%2d: %.*s\n", i, substring_length, substring_start);
00385     }
00386 
00387   if (namecount <= 0) printf("No named substrings\n"); else
00388     {
00389     unsigned char *tabptr = name_table;
00390     printf("Named substrings\n");
00391     for (i = 0; i < namecount; i++)
00392       {
00393       int n = (tabptr[0] << 8) | tabptr[1];
00394       printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
00395         ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
00396       tabptr += name_entry_size;
00397       }
00398     }
00399   }      /* End of loop to find second and subsequent matches */
00400 
00401 printf("\n");
00402 pcre_free(re);       /* Release memory used for the compiled pattern */
00403 return 0;
00404 }
00405 
00406 /* End of pcredemo.c */