Back to index

openldap  2.4.31
phonetic.c
Go to the documentation of this file.
00001 /* phonetic.c - routines to do phonetic matching */
00002 /* $OpenLDAP$ */
00003 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
00004  *
00005  * Copyright 1998-2012 The OpenLDAP Foundation.
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted only as authorized by the OpenLDAP
00010  * Public License.
00011  *
00012  * A copy of this license is available in the file LICENSE in the
00013  * top-level directory of the distribution or, alternatively, at
00014  * <http://www.OpenLDAP.org/license.html>.
00015  */
00016 /* Portions Copyright (c) 1995 Regents of the University of Michigan.
00017  * All rights reserved.
00018  *
00019  * Redistribution and use in source and binary forms are permitted
00020  * provided that this notice is preserved and that due credit is given
00021  * to the University of Michigan at Ann Arbor. The name of the University
00022  * may not be used to endorse or promote products derived from this
00023  * software without specific prior written permission. This software
00024  * is provided ``as is'' without express or implied warranty.
00025  */
00026 
00027 #include "portable.h"
00028 
00029 #include <stdio.h>
00030 
00031 #include <ac/ctype.h>
00032 #include <ac/string.h>
00033 #include <ac/socket.h>
00034 #include <ac/time.h>
00035 
00036 #include "slap.h"
00037 
00038 #if !defined(SLAPD_METAPHONE) && !defined(SLAPD_PHONETIC)
00039 #define SLAPD_METAPHONE
00040 #endif
00041 
00042 #define iswordbreak(x)  (!isascii(x) || isspace((unsigned char) (x)) || \
00043                       ispunct((unsigned char) (x)) || \
00044                       isdigit((unsigned char) (x)) || (x) == '\0')
00045 
00046 #if 0
00047 static char *
00048 first_word( char *s )
00049 {
00050        if ( s == NULL ) {
00051               return( NULL );
00052        }
00053 
00054        while ( iswordbreak( *s ) ) {
00055               if ( *s == '\0' ) {
00056                      return( NULL );
00057               } else {
00058                      s++;
00059               }
00060        }
00061 
00062        return( s );
00063 }
00064 
00065 static char *
00066 next_word( char *s )
00067 {
00068        if ( s == NULL ) {
00069               return( NULL );
00070        }
00071 
00072        while ( ! iswordbreak( *s ) ) {
00073               s++;
00074        }
00075 
00076        while ( iswordbreak( *s ) ) {
00077               if ( *s == '\0' ) {
00078                      return( NULL );
00079               } else {
00080                      s++;
00081               }
00082        }
00083 
00084        return( s );
00085 }
00086 
00087 static char *
00088 word_dup( char *w )
00089 {
00090        char   *s, *ret;
00091        char   save;
00092 
00093        for ( s = w; !iswordbreak( *s ); s++ )
00094               ;      /* NULL */
00095        save = *s;
00096        *s = '\0';
00097        ret = ch_strdup( w );
00098        *s = save;
00099 
00100        return( ret );
00101 }
00102 #endif /* 0 */
00103 
00104 #ifndef MAXPHONEMELEN
00105 #define MAXPHONEMELEN       4
00106 #endif
00107 
00108 #if defined(SLAPD_PHONETIC)
00109 
00110 /* lifted from isode-8.0 */
00111 char *
00112 phonetic( char *s )
00113 {
00114         char  code, adjacent, ch;
00115        char   *p;
00116         int   i;
00117        char   phoneme[MAXPHONEMELEN + 1];
00118 
00119         p = s;
00120         if (  p == NULL || *p == '\0' ) {
00121                 return( NULL );
00122         }
00123 
00124         adjacent = '0';
00125        phoneme[0] = TOUPPER((unsigned char)*p);
00126 
00127        phoneme[1]  = '\0';
00128         for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) {
00129               ch = TOUPPER ((unsigned char)*p);
00130 
00131                 code = '0';
00132 
00133                 switch (ch) {
00134                 case 'B':
00135                 case 'F':
00136               case 'P':
00137                 case 'V':
00138                         code = (adjacent != '1') ? '1' : '0';
00139                         break;
00140                 case 'S':
00141                 case 'C':
00142                 case 'G':
00143                 case 'J':
00144                 case 'K':
00145                 case 'Q':
00146                 case 'X':
00147                 case 'Z':
00148                         code = (adjacent != '2') ? '2' : '0';
00149                         break;
00150                 case 'D':
00151                 case 'T':
00152                         code = (adjacent != '3') ? '3' : '0';
00153                         break;
00154                 case 'L':
00155                         code = (adjacent != '4') ? '4' : '0';
00156                         break;
00157                 case 'M':
00158                 case 'N':
00159                         code = (adjacent != '5') ? '5' : '0';
00160                         break;
00161                 case 'R':
00162                         code = (adjacent != '6') ? '6' : '0';
00163                         break;
00164                 default:
00165                         adjacent = '0';
00166                 }
00167 
00168                 if ( i == 0 ) {
00169                      adjacent = code;
00170                      i++;
00171               } else if ( code != '0' ) {
00172                      if ( i == MAXPHONEMELEN )
00173                             break;
00174                         adjacent = phoneme[i] = code;
00175                         i++;
00176                 }
00177         }
00178 
00179        if ( i > 0 )
00180               phoneme[i] = '\0';
00181 
00182         return( ch_strdup( phoneme ) );
00183 }
00184 
00185 #elif defined(SLAPD_METAPHONE)
00186 
00187 /*
00188  * Metaphone was originally developed by Lawrence Philips and
00189  * published in the "Computer Language" magazine in 1990.
00190  */
00191 /*
00192  * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
00193  * author Gary A. Parker, with changes by Bernard Tiffany of the
00194  * University of Michigan, and more changes by Tim Howes of the
00195  * University of Michigan.
00196  */
00197 
00198 /* Character coding array */
00199 static const char  vsvfn[26] = {
00200           1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
00201        /* A   B  C   D  E  F  G   H  I  J  K  L  M  */
00202           2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
00203        /* N  O  P  Q  R  S  T  U  V  W  X  Y  Z  */
00204 
00205 /* Macros to access character coding array */
00206 #define vowel(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 1)  /* AEIOU */
00207 #define same(x)         ((x) != '\0' && vsvfn[(x) - 'A'] & 2)  /* FJLMNR */
00208 #define varson(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 4)  /* CGPST */
00209 #define frontv(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 8)  /* EIY */
00210 #define noghf(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */
00211 
00212 char *
00213 phonetic( char *Word )
00214 {
00215        char           *n, *n_start, *n_end;      /* pointers to string */
00216        char           *metaph_end; /* pointers to metaph */
00217        char            ntrans[40]; /* word with uppercase letters */
00218        int             KSflag;     /* state flag for X -> KS */
00219        char          buf[MAXPHONEMELEN + 2];
00220        char          *Metaph;
00221 
00222        /*
00223         * Copy Word to internal buffer, dropping non-alphabetic characters
00224         * and converting to upper case
00225         */
00226 
00227        for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) &&
00228            n < n_end; Word++) {
00229               if (isalpha((unsigned char)*Word))
00230                      *n++ = TOUPPER((unsigned char)*Word);
00231        }
00232        Metaph = buf;
00233        *Metaph = '\0';
00234        if (n == ntrans + 4) {
00235               return( ch_strdup( buf ) );        /* Return if null */
00236        }
00237        n_end = n;           /* Set n_end to end of string */
00238 
00239        /* ntrans[0] will always be == 0 */
00240        ntrans[0] = '\0';
00241        ntrans[1] = '\0';
00242        ntrans[2] = '\0';
00243        ntrans[3] = '\0';
00244        *n++ = 0;
00245        *n++ = 0;
00246        *n++ = 0;
00247        *n = 0;                     /* Pad with nulls */
00248        n = ntrans + 4;             /* Assign pointer to start */
00249 
00250        /* Check for PN, KN, GN, AE, WR, WH, and X at start */
00251        switch (*n) {
00252        case 'P':
00253        case 'K':
00254        case 'G':
00255               /* 'PN', 'KN', 'GN' becomes 'N' */
00256               if (*(n + 1) == 'N')
00257                      *n++ = 0;
00258               break;
00259        case 'A':
00260               /* 'AE' becomes 'E' */
00261               if (*(n + 1) == 'E')
00262                      *n++ = 0;
00263               break;
00264        case 'W':
00265               /* 'WR' becomes 'R', and 'WH' to 'H' */
00266               if (*(n + 1) == 'R')
00267                      *n++ = 0;
00268               else if (*(n + 1) == 'H') {
00269                      *(n + 1) = *n;
00270                      *n++ = 0;
00271               }
00272               break;
00273        case 'X':
00274               /* 'X' becomes 'S' */
00275               *n = 'S';
00276               break;
00277        }
00278 
00279        /*
00280         * Now, loop step through string, stopping at end of string or when
00281         * the computed 'metaph' is MAXPHONEMELEN characters long
00282         */
00283 
00284        KSflag = 0;          /* state flag for KS translation */
00285        for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
00286             n <= n_end && Metaph < metaph_end; n++) {
00287               if (KSflag) {
00288                      KSflag = 0;
00289                      *Metaph++ = 'S';
00290               } else {
00291                      /* Drop duplicates except for CC */
00292                      if (*(n - 1) == *n && *n != 'C')
00293                             continue;
00294                      /* Check for F J L M N R or first letter vowel */
00295                      if (same(*n) || (n == n_start && vowel(*n)))
00296                             *Metaph++ = *n;
00297                      else
00298                             switch (*n) {
00299                             case 'B':
00300 
00301                                    /*
00302                                     * B unless in -MB
00303                                     */
00304                                    if (n == (n_end - 1) && *(n - 1) != 'M')
00305                                           *Metaph++ = *n;
00306                                    break;
00307                             case 'C':
00308 
00309                                    /*
00310                                     * X if in -CIA-, -CH- else S if in
00311                                     * -CI-, -CE-, -CY- else dropped if
00312                                     * in -SCI-, -SCE-, -SCY- else K
00313                                     */
00314                                    if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
00315                                           if (*(n + 1) == 'I' && *(n + 2) == 'A')
00316                                                  *Metaph++ = 'X';
00317                                           else if (frontv(*(n + 1)))
00318                                                  *Metaph++ = 'S';
00319                                           else if (*(n + 1) == 'H')
00320                                                  *Metaph++ = ((n == n_start && !vowel(*(n + 2)))
00321                                                   || *(n - 1) == 'S')
00322                                                      ? (char) 'K' : (char) 'X';
00323                                           else
00324                                                  *Metaph++ = 'K';
00325                                    }
00326                                    break;
00327                             case 'D':
00328 
00329                                    /*
00330                                     * J if in DGE or DGI or DGY else T
00331                                     */
00332                                    *Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2)))
00333                                        ? (char) 'J' : (char) 'T';
00334                                    break;
00335                             case 'G':
00336 
00337                                    /*
00338                                     * F if in -GH and not B--GH, D--GH,
00339                                     * -H--GH, -H---GH else dropped if
00340                                     * -GNED, -GN, -DGE-, -DGI-, -DGY-
00341                                     * else J if in -GE-, -GI-, -GY- and
00342                                     * not GG else K
00343                                     */
00344                                    if ((*(n + 1) != 'J' || vowel(*(n + 2))) &&
00345                                        (*(n + 1) != 'N' || ((n + 1) < n_end &&
00346                                                          (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
00347                                        (*(n - 1) != 'D' || !frontv(*(n + 1))))
00348                                           *Metaph++ = (frontv(*(n + 1)) &&
00349                                                       *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
00350                                    else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
00351                                            *(n - 4) != 'H')
00352                                           *Metaph++ = 'F';
00353                                    break;
00354                             case 'H':
00355 
00356                                    /*
00357                                     * H if before a vowel and not after
00358                                     * C, G, P, S, T else dropped
00359                                     */
00360                                    if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
00361                                                     vowel(*(n + 1))))
00362                                           *Metaph++ = 'H';
00363                                    break;
00364                             case 'K':
00365 
00366                                    /*
00367                                     * dropped if after C else K
00368                                     */
00369                                    if (*(n - 1) != 'C')
00370                                           *Metaph++ = 'K';
00371                                    break;
00372                             case 'P':
00373 
00374                                    /*
00375                                     * F if before H, else P
00376                                     */
00377                                    *Metaph++ = *(n + 1) == 'H' ?
00378                                        (char) 'F' : (char) 'P';
00379                                    break;
00380                             case 'Q':
00381 
00382                                    /*
00383                                     * K
00384                                     */
00385                                    *Metaph++ = 'K';
00386                                    break;
00387                             case 'S':
00388 
00389                                    /*
00390                                     * X in -SH-, -SIO- or -SIA- else S
00391                                     */
00392                                    *Metaph++ = (*(n + 1) == 'H' ||
00393                                                (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
00394                                                    *(n + 2) == 'A')))
00395                                        ? (char) 'X' : (char) 'S';
00396                                    break;
00397                             case 'T':
00398 
00399                                    /*
00400                                     * X in -TIA- or -TIO- else 0 (zero)
00401                                     * before H else dropped if in -TCH-
00402                                     * else T
00403                                     */
00404                                    if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
00405                                                     *(n + 2) == 'A'))
00406                                           *Metaph++ = 'X';
00407                                    else if (*(n + 1) == 'H')
00408                                           *Metaph++ = '0';
00409                                    else if (*(n + 1) != 'C' || *(n + 2) != 'H')
00410                                           *Metaph++ = 'T';
00411                                    break;
00412                             case 'V':
00413 
00414                                    /*
00415                                     * F
00416                                     */
00417                                    *Metaph++ = 'F';
00418                                    break;
00419                             case 'W':
00420 
00421                                    /*
00422                                     * W after a vowel, else dropped
00423                                     */
00424                             case 'Y':
00425 
00426                                    /*
00427                                     * Y unless followed by a vowel
00428                                     */
00429                                    if (vowel(*(n + 1)))
00430                                           *Metaph++ = *n;
00431                                    break;
00432                             case 'X':
00433 
00434                                    /*
00435                                     * KS
00436                                     */
00437                                    if (n == n_start)
00438                                           *Metaph++ = 'S';
00439                                    else {
00440                                           *Metaph++ = 'K';     /* Insert K, then S */
00441                                           KSflag = 1;
00442                                    }
00443                                    break;
00444                             case 'Z':
00445 
00446                                    /*
00447                                     * S
00448                                     */
00449                                    *Metaph++ = 'S';
00450                                    break;
00451                             }
00452               }
00453        }
00454 
00455        *Metaph = 0;         /* Null terminate */
00456        return( ch_strdup( buf ) );
00457 }
00458 
00459 #endif /* SLAPD_METAPHONE */