Back to index

opendkim  2.6.4
dkim-mailparse.c
Go to the documentation of this file.
00001 /*
00002 **  Copyright (c) 2005, 2007, 2008 Sendmail, Inc. and its suppliers.
00003 **    All rights reserved.
00004 **
00005 **  Copyright (c) 2009, 2010 The OpenDKIM Project.  All rights reserved.
00006 */
00007 
00008 #ifndef lint
00009 static char dkim_mailparse_c_id[] = "@(#)$Id: dkim-mailparse.c,v 1.5.34.1 2010/10/27 21:43:08 cm-msk Exp $";
00010 #endif /* !lint */
00011 
00012 /* system includes */
00013 #include <sys/types.h>
00014 #include <ctype.h>
00015 #include <string.h>
00016 #include <limits.h>
00017 #include <stdio.h>
00018 
00019 /* libopendkim includes */
00020 #include "dkim-mailparse.h"
00021 
/* types */
typedef unsigned long cmap_elem_type;

/* symbolic names */
#define DKIM_MAILPARSE_OK 		0 	/* success */
#define DKIM_MAILPARSE_ERR_PUNBALANCED	1	/* unbalanced parentheses */
#define DKIM_MAILPARSE_ERR_QUNBALANCED	2	/* unbalanced quotes */
#define DKIM_MAILPARSE_ERR_SUNBALANCED	3	/* unbalanced sq. brackets */

/*
**  A bitmap for the "specials" character class: one bit for each possible
**  unsigned char value, packed into an array of cmap_elem_type.
**
**  	CMAP_NBITS -- number of bits held by one array element
**  	CMAP_NELEMS -- number of elements needed to cover every character
**  	CMAP_INDEX(i) -- array element that holds the bit for character "i"
**  	CMAP_BIT(i) -- mask selecting the bit for character "i" in its element
**  	CMAP_TST(ar, c) -- non-zero iff character "c" is set in bitmap "ar"
**  	CMAP_SET(ar, c) -- set character "c" in bitmap "ar"
**
**  CMAP_BIT shifts an unsigned one of the element type: shifting a signed
**  "1L" into the top bit (which happens for '?', a member of SPECIALS) is
**  undefined behaviour.
*/

#define	CMAP_NBITS	(sizeof(cmap_elem_type) * CHAR_BIT)
#define	CMAP_NELEMS	((1 + UCHAR_MAX) / CMAP_NBITS)
#define	CMAP_INDEX(i)	((unsigned char)(i) / CMAP_NBITS)
#define	CMAP_BIT(i)	((cmap_elem_type) 1 << ((unsigned char)(i) % CMAP_NBITS))
#define	CMAP_TST(ar, c)	((ar)[CMAP_INDEX(c)] &  CMAP_BIT(c))
#define	CMAP_SET(ar, c)	((ar)[CMAP_INDEX(c)] |= CMAP_BIT(c))

/* characters that terminate an atom and form one-character tokens */
static unsigned char const SPECIALS[] = "<>@,;:\\\"/[]?=";
00040 
00041 #ifdef DKIM_MAILPARSE_TEST
/*
**  DKIM_MAIL_UNESCAPE -- remove escape characters from a string
**
**  Parameters:
**  	s -- the string to be unescaped (modified in place; may be NULL)
**
**  Return value:
**  	s.
**
**  Notes:
**  	Each backslash is dropped and the character after it is kept
**  	literally.  A lone backslash at the very end of the string is
**  	simply dropped.
*/

static char *
dkim_mail_unescape(char *s)
{
	char *w;			/* write position */
	char const *r;			/* read position */
	char const *p;			/* next backslash at or after r */
	char const *e;			/* terminating NUL */

	if (s == NULL)
		return NULL;

	r = w = s;
	e = s + strlen(s);

	/*
	**  Search only the unread tail [r, e).  Using the full string
	**  length here would scan beyond the terminator once r has
	**  advanced past the start of the string.
	*/

	while ((p = memchr(r, '\\', (size_t) (e - r))) != NULL)
	{
		/* slide the literal run in front of the backslash down to w */
		if (p > r)
		{
			if (r != w)
				memmove(w, r, (size_t) (p - r));
			w += p - r;
		}

		if (p[1] == '\0')
		{
			/* trailing backslash with nothing to escape: drop it */
			r = p + 1;
		}
		else
		{
			/* keep the escaped character, drop the backslash */
			*w++ = p[1];
			r = p + 2;
		}
	}

	/* nothing was removed unless the reader got ahead of the writer */
	if (r > w)
	{
		if (e > r)
		{
			memmove(w, r, (size_t) (e - r));
			w += e - r;
		}
		*w = '\0';
	}

	return s;
}
00096 #endif /* DKIM_MAILPARSE_TEST */
00097 
/*
**  DKIM_MAIL_MATCHING_PAREN -- return the location of the delimiter that
**                              closes an already-opened construct
**
**  Parameters:
**  	s -- start of string to be processed; the scan begins at nesting
**  	     depth one, i.e. "s" is the first character after the opening
**  	     delimiter
**  	e -- end of string to be processed (one past the last character)
**  	open_paren -- open parenthesis character; each occurrence adds a
**  	              nesting level (callers in this file pass '\0' for
**  	              quotes and square brackets)
**  	close_paren -- close parenthesis character
**
**  Return value:
**  	Location of the close parenthesis character that balances the
**  	construct.  For example, given "yy(zz)ww)aaaa", would return the
**  	location of the second ")".  There may be more beyond that, but at
**  	that point everything is balanced.  If the input runs out first,
**  	"e" is returned.
**
**  Notes:
**  	A backslash escapes the character that follows it, so an escaped
**  	delimiter is not counted.
*/

static u_char *
dkim_mail_matching_paren(u_char *s, u_char *e, int open_paren, int close_paren)
{
	int depth = 1;

	for (; s < e; s++)
	{
		if (*s == close_paren)
		{
			if (--depth == 0)
				break;
		}
		else if (*s == open_paren)
		{
			depth++;
		}
		else if (*s == '\\')
		{
			/*
			**  Skip the escaped character, but only when it lies
			**  inside [s, e); otherwise the loop increment would
			**  carry the result beyond "e".
			*/

			if (s + 1 < e && s[1] != '\0')
				s++;
		}
	}

	return s;
}
00140 
00141 /*
00142 **  DKIM_MAIL_FIRST_SPECIAL -- find the first "special" character
00143 **
00144 **  Parameters:
00145 **     p -- input string
00146 **     e -- end of input string
00147 **     special_out -- pointer to the first special character found
00148 **
00149 **  Return value:
00150 **     0 on success, or an DKIM_MAILPARSE_ERR_* on failure.
00151 */
00152 
00153 static int
00154 dkim_mail_first_special(u_char *p, u_char *e, u_char **special_out)
00155 {
00156        size_t        i;
00157        cmap_elem_type       is_special[CMAP_NELEMS] = { 0 };
00158        u_char        *at_ptr = NULL;
00159 
00160        /* set up special finder */
00161        for (i = 0; SPECIALS[i] != '\0'; i++)
00162               CMAP_SET(is_special, SPECIALS[i]);
00163 
00164        for (; p < e && *p != '\0'; p++)
00165        {
00166               /* skip white space between tokens */
00167               while (p < e && (*p == '(' ||
00168                                (isascii(*p) && isspace(*p))))
00169               {
00170                      if (*p != '(')
00171                      {
00172                             p++;
00173                      }
00174                      else
00175                      {
00176                             p = dkim_mail_matching_paren(p + 1, e,
00177                                                          '(', ')');
00178                             if (*p == '\0')
00179                                    return DKIM_MAILPARSE_ERR_PUNBALANCED;
00180                             else
00181                                    p++;
00182                      }
00183               }
00184 
00185               if (*p == '\0')
00186                      break;
00187 
00188               if (*p == '"')
00189               {
00190                      p = dkim_mail_matching_paren(p + 1, e, '\0', '"');
00191                      if (*p == '\0')
00192                             return DKIM_MAILPARSE_ERR_QUNBALANCED;
00193               }
00194               else if (*p == '[')
00195               {
00196                      p = dkim_mail_matching_paren(p + 1, e, '\0', ']');
00197                      if (*p == '\0')
00198                             return DKIM_MAILPARSE_ERR_SUNBALANCED;
00199               }
00200               else if (CMAP_TST(is_special, *p))
00201               {
00202                      if (*p == '<')
00203                      {
00204                             *special_out = p;
00205                             return 0;
00206                      }
00207                      else if (*p == ':' || *p == ';' || *p == ',')
00208                      {
00209                             if (at_ptr != NULL)
00210                                    *special_out = at_ptr;
00211                             else
00212                                    *special_out = p;
00213                             return 0; 
00214                      }
00215                      else if (*p == '@')
00216                      {
00217                             at_ptr = p;
00218                      }
00219               }
00220               else
00221               {
00222                      while (*p != '\0' &&
00223                             !CMAP_TST(is_special, *p) &&
00224                             (!isascii(*p) ||
00225                              !isspace((unsigned char) *p)) &&
00226                             *p != '(')
00227                             p++;
00228                      p--;
00229               }
00230        }
00231 
00232        *special_out = p;
00233        return 0;
00234 }
00235 
00236 /*
00237 **  DKIM_MAIL_TOKEN -- find the next token
00238 **
00239 **  Parameters:
00240 **     s -- start of input string
00241 **     e -- end of input string
00242 **     type_out -- type of token (returned)
00243 **     start_out -- start of token (returned)
00244 **     end_out -- start of token (returned)
00245 **     uncommented_whitespace -- set to TRUE if uncommented whitespace is
00246 **                               discovered (returned)
00247 **
00248 **  Return value:
00249 **     0 on success, or an DKIM_MAILPARSE_ERR_* on failure.
00250 */
00251 
00252 static int
00253 dkim_mail_token(u_char *s, u_char *e, int *type_out, u_char **start_out,
00254                 u_char **end_out, int *uncommented_whitespace)
00255 {
00256        u_char *p;
00257        int err = 0;
00258        size_t i;
00259        int token_type;
00260        cmap_elem_type is_special[CMAP_NELEMS] = { 0 };
00261        u_char *token_start, *token_end;
00262 
00263        *start_out = NULL;
00264        *end_out   = NULL;
00265        *type_out  = 0;
00266 
00267        err = 0;
00268 
00269        /* set up special finder */
00270        for (i = 0; SPECIALS[i] != '\0'; i++)
00271               CMAP_SET(is_special, SPECIALS[i]);
00272 
00273        p = s;
00274 
00275        /* skip white space between tokens */
00276        while (p < e && (*p == '(' ||
00277                         (isascii((unsigned char) *p) &&
00278                          isspace((unsigned char) *p))))
00279        {
00280               if (*p != '(')
00281               {
00282                      *uncommented_whitespace = 1;
00283                      p++;
00284               }
00285               else
00286               {
00287                      p = dkim_mail_matching_paren(p + 1, e, '(', ')');
00288                      if (*p == '\0')
00289                             return DKIM_MAILPARSE_ERR_PUNBALANCED;
00290                      else
00291                             p++;
00292               }
00293        }
00294 
00295        if (p >= e || *p == '\0')
00296               return 0;
00297 
00298        /* our new token starts here */
00299        token_start = p;
00300 
00301        /* fill in the token contents and type */
00302        if (*p == '"')
00303        {
00304               token_end = dkim_mail_matching_paren(p + 1, e, '\0', '"');
00305               token_type = '"';
00306               if (*token_end != '\0')
00307                      token_end++;
00308               else
00309                      err = DKIM_MAILPARSE_ERR_QUNBALANCED;
00310        }
00311        else if (*p == '[')
00312        {
00313               token_end = p = dkim_mail_matching_paren(p + 1, e, '\0', ']');
00314               token_type = '[';
00315               if (*token_end != '\0')
00316                      token_end++;
00317               else
00318                      err = DKIM_MAILPARSE_ERR_SUNBALANCED;
00319        }
00320        else if (CMAP_TST(is_special, *p))
00321        {
00322               token_end  = p + 1;
00323               token_type = *p;
00324        }
00325        else
00326        {
00327               while (p < e && *p != '\0' && !CMAP_TST(is_special, *p) &&
00328                      (!isascii(*p) || !isspace((unsigned char) *p)) &&
00329                      *p != '(')
00330                      p++;
00331 
00332               token_end = p;
00333               token_type = 'x';
00334        }
00335 
00336        *start_out = token_start;
00337        *end_out   = token_end;
00338        *type_out  = token_type;
00339 
00340        return err;
00341 }
00342 
00343 /*
00344 **  DKIM_MAIL_PARSE -- extract the local-part and hostname from a mail
00345 **                     header field, e.g. "From:"
00346 **
00347 **  Parameters:
00348 **     line -- input line
00349 **     user_out -- pointer to "local-part" (returned)
00350 **     domain_out -- pointer to hostname (returned)
00351 **
00352 **  Return value:
00353 **     0 on success, or an DKIM_MAILPARSE_ERR_* on failure.
00354 **
00355 **  Notes:
00356 **     Input string is modified.
00357 */
00358 
00359 int
00360 dkim_mail_parse(unsigned char *line, unsigned char **user_out,
00361                 unsigned char **domain_out)
00362 {
00363        int type;
00364        int ws;
00365        int err;
00366        u_char *e, *special;
00367        u_char *tok_s, *tok_e;
00368        u_char *w;
00369 
00370        *user_out = NULL;
00371        *domain_out = NULL;
00372 
00373        err = 0;
00374        w = line;
00375        e = line + strlen((char *) line);
00376        ws = 0;
00377 
00378        for (;;)
00379        {
00380               err = dkim_mail_first_special(line, e, &special);
00381               if (err != 0)
00382                      return err;
00383               
00384               /* given the construct we're looking at, do the right thing */
00385               switch (*special)
00386               {
00387                 case '<':
00388                      /* display name <address> */
00389                      line = special + 1;
00390                      for (;;)
00391                      {
00392                             err = dkim_mail_token(line, e, &type, &tok_s,
00393                                                   &tok_e, &ws);
00394                             if (err != 0)
00395                                    return err;
00396 
00397                             if (type == '>' || type == '\0')
00398                             {
00399                                    *w = '\0';
00400                                    return 0;
00401                             }
00402                             else if (type == '@')
00403                             {
00404                                    *w++ = '\0';
00405                                    *domain_out = w;
00406                             }
00407                             else if (type == ',' || type == ':')
00408                             {
00409                                    /* source route punctuation */
00410                                    *user_out = NULL;
00411                                    *domain_out = NULL;
00412                             }
00413                             else
00414                             {
00415                                    if (*user_out == NULL)
00416                                           *user_out = w;
00417                                    memmove(w, tok_s, tok_e - tok_s);
00418                                    w += tok_e - tok_s;
00419                             }
00420                             line = tok_e;
00421                      }
00422 
00423                 case ';':
00424                 case ':':
00425                 case ',':
00426                      /* skip a group name or result */
00427                      line = special + 1;
00428                      break;
00429 
00430                 default:
00431                      /* (display name) addr(display name)ess */
00432                      ws = 0;
00433                      for (;;)
00434                      {
00435                             err = dkim_mail_token(line, e, &type, &tok_s,
00436                                                   &tok_e, &ws);
00437                             if (err != 0)
00438                                    return err;
00439 
00440                             if (type == '\0' ||  type == ',' || type == ';')
00441                             {
00442                                    *w = '\0';
00443                                    break;
00444                             }
00445                             else if (type == '@')
00446                             {
00447                                    *w++ = '\0';
00448                                    *domain_out = w;
00449                                    ws = 0;
00450                             }
00451                             else
00452                             {
00453 
00454                                    if (*user_out == NULL)
00455                                           *user_out = w;
00456                                    else if (type == 'x' && ws == 1)
00457                                           *w++ = ' ';
00458 
00459                                    memmove(w, tok_s, tok_e - tok_s);
00460                                    w += tok_e - tok_s;
00461 
00462                                    ws = 0;
00463                             }
00464 
00465                             line = tok_e;
00466                      }
00467                      return 0;
00468               }
00469        }
00470 }
00471 
00472 #ifdef DKIM_MAILPARSE_TEST
/*
**  MAIN -- test driver: parse the header value given as the only
**          command line argument and print the result
**
**  Parameters:
**  	argc, argv -- the usual
**
**  Return value:
**  	64 (the sysexits.h EX_USAGE value) when not invoked with exactly
**  	one argument; otherwise 0, including when the parse itself reports
**  	an error (the error code is printed instead).
*/

int
main(int argc, char **argv)
{
	int err;
	unsigned char *domain, *user;

	if (argc != 2)
	{
		fprintf(stderr, "Usage: %s mailheader\n", argv[0]);

		/* returning from main() avoids needing exit() declared */
		return 64;
	}

	/* argv[1] is parsed and rewritten in place */
	err = dkim_mail_parse((unsigned char *) argv[1], &user, &domain);

	if (err)
	{
		printf("error %d\n", err);
	}
	else
	{
		printf("user: '%s'\ndomain: '%s'\n",
		       user ? dkim_mail_unescape((char *) user) : "null",
		       domain ? dkim_mail_unescape((char *) domain) : "null");
	}

	return 0;
}
00500 #endif /* DKIM_MAILPARSE_TEST */