Back to index

php5  5.3.10
metaphone.c
Go to the documentation of this file.
00001 /*
00002    +----------------------------------------------------------------------+
00003    | PHP Version 5                                                        |
00004    +----------------------------------------------------------------------+
00005    | Copyright (c) 1997-2012 The PHP Group                                |
00006    +----------------------------------------------------------------------+
00007    | This source file is subject to version 3.01 of the PHP license,      |
00008    | that is bundled with this package in the file LICENSE, and is        |
00009    | available through the world-wide-web at the following url:           |
00010    | http://www.php.net/license/3_01.txt                                  |
00011    | If you did not receive a copy of the PHP license and are unable to   |
00012    | obtain it through the world-wide-web, please send a note to          |
00013    | license@php.net so we can mail you a copy immediately.               |
00014    +----------------------------------------------------------------------+
00015    | Author: Thies C. Arntzen <thies@thieso.net>                          |
00016    +----------------------------------------------------------------------+
00017 */
00018 
00019 /* $Id: metaphone.c 321634 2012-01-01 13:15:04Z felipe $ */
00020 
00021 /*
00022        Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com> 
00023 */
00024 
00025 #include "php.h"
00026 #include "php_metaphone.h"
00027 
00028 static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
00029 
00030 /* {{{ proto string metaphone(string text[, int phones])
00031    Break english phrases down into their phonemes */
00032 PHP_FUNCTION(metaphone)
00033 {
00034        char *str;
00035        char *result = 0;
00036        int str_len;
00037        long phones = 0;
00038 
00039        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len,
00040                                                    &phones) == FAILURE) {
00041               return;
00042        }
00043 
00044        if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) {
00045               RETVAL_STRING(result, 0);
00046        } else {
00047               if (result) {
00048                      efree(result);
00049               }
00050               RETURN_FALSE;
00051        }
00052 }
00053 /* }}} */
00054 
00055 /* 
00056    this is now the original code by Michael G Schwern:
00057    I've changed it just slightly (use emalloc, 
00058    get rid of includes etc) 
00059        - thies - 13.09.1999
00060 */
00061 
00062 /*-----------------------------  */
00063 /* this used to be "metaphone.h" */
00064 /*-----------------------------  */
00065 
/* Special encodings: single output characters that stand for a two-letter sound */
#define SH 'X'	/* emitted for the 'sh' sound */
#define TH '0'	/* emitted for the 'th' sound (the digit zero) */
00069 
00070 /*-----------------------------  */
00071 /* end of "metaphone.h"          */
00072 /*-----------------------------  */
00073 
00074 /*----------------------------- */
00075 /* this used to be "metachar.h" */
00076 /*----------------------------- */
00077 
00078 /* Metachar.h ... little bits about characters for metaphone */
00079 /*-- Character encoding array & accessing macros --*/
00080 /* Stolen directly out of the book... */
/* Per-letter flag bytes, indexed by (uppercase letter - 'A').
 * Bit 1 = vowel, 2 = passed through unchanged, 4 = forms a diphthong with a
 * following H, 8 = softens a preceding C/G, 16 = stops GH from becoming F.
 * The table is read-only and private to this file. */
static const char _codes[26] =
{
	1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
/*  a  b   c  d   e  f  g  h   i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z */
};


/* Flag byte for character c, or 0 for anything that is not A-Z / a-z.
 * The range is tested explicitly rather than with isalpha(): isalpha() is
 * locale dependent and may accept bytes outside A-Z, which would index
 * past the end of the 26-entry table.
 * c is evaluated more than once, so it must be free of side effects. */
#define ENCODE(c) ((toupper(c) >= 'A' && toupper(c) <= 'Z') ? _codes[toupper(c) - 'A'] : 0)

#define isvowel(c)  (ENCODE(c) & 1)		/* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)		/* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)		/* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)		/* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)	/* BDH */
00103 
00104 /*----------------------------- */
00105 /* end of "metachar.h"          */
00106 /*----------------------------- */
00107 
00108 /* I suppose I could have been using a character pointer instead of
00109  * accessing the array directly... */
00110 
/* All of these read `word` (the input bytes) and `w_idx` (the current
 * position) from the scope in which they are expanded. Each yields the
 * letter upper-cased. */

/* Look at the next letter in the word */
#define Next_Letter (toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper(word[w_idx]))
/* Go N letters back; yields '\0' when that would be before the start of the
 * word. n is parenthesised so an expression argument indexes correctly. */
#define Look_Back_Letter(n) (w_idx >= (n) ? toupper(word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' when the current letter is the first one */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter   (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
											   : '\0')
/* Look n letters down via Lookahead(), which stops at the terminator.
 * Lookahead() takes and returns plain char, so the pointer is cast going in
 * and the result is converted to unsigned char before toupper(): passing a
 * negative char value to toupper() is undefined. */
#define Look_Ahead_Letter(n) (toupper((unsigned char) Lookahead((char *) word + w_idx, n)))
00123 
00124 
00125 /* Allows us to safely look ahead an arbitrary # of letters */
00126 /* I probably could have just used strlen... */
/* Return the character `how_far` positions past `word`, or the terminating
 * '\0' when the string ends before that. A how_far of zero or less returns
 * the first character. */
static char Lookahead(char *word, int how_far)
{
	char *cursor = word;
	int remaining = how_far;

	/* Step forward one character at a time, never past the terminator. */
	while (remaining > 0 && *cursor != '\0') {
		cursor++;
		remaining--;
	}

	return *cursor;
}
00139 
00140 
/* Output-buffer helpers. They use `phoned_word`, `p_idx` (write position) and
 * `max_buffer_len` (current capacity) from the scope where they are expanded.
 * They expand to plain brace blocks, so they also work without a trailing
 * semicolon. */

/* Append one character to the phoned word.
 * The buffer size isn't known in advance. One way to solve this is to just
 * re-allocate: once the write position reaches the capacity, the buffer is
 * grown by an extra 2 characters (this could be one though; or more too). */
#define Phonize(c)	{ \
		if (max_buffer_len <= p_idx) { \
			*phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
			max_buffer_len += 2; \
		} \
		(*phoned_word)[p_idx++] = c; \
	}

/* Slap a null character on the end of the phoned word, re-allocating with
 * one extra byte first when the write position sits exactly at capacity. */
#define End_Phoned_Word	{ \
		if (max_buffer_len == p_idx) { \
			*phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
		} \
		(*phoned_word)[p_idx] = '\0'; \
	}

/* How long is the phoned word so far? */
#define Phone_Len	(p_idx)

/* True when c is a 'break' in the word, i.e. not a letter */
#define Isbreak(c)	(!isalpha(c))
00164 
00165 /* {{{ metaphone
00166  */
00167 static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional)
00168 {
00169        int w_idx = 0;                            /* point in the phonization we're at. */
00170        int p_idx = 0;                            /* end of the phoned phrase */
00171        int max_buffer_len = 0;            /* maximum length of the destination buffer */
00172 
00173 /*-- Parameter checks --*/
00174        /* Negative phoneme length is meaningless */
00175 
00176        if (max_phonemes < 0)
00177               return -1;
00178 
00179        /* Empty/null string is meaningless */
00180        /* Overly paranoid */
00181        /* assert(word != NULL && word[0] != '\0'); */
00182 
00183        if (word == NULL)
00184               return -1;
00185 
00186 /*-- Allocate memory for our phoned_phrase --*/
00187        if (max_phonemes == 0) {    /* Assume largest possible */
00188               max_buffer_len = word_len;
00189               *phoned_word = safe_emalloc(sizeof(char), word_len, 1);
00190        } else {
00191               max_buffer_len = max_phonemes;
00192               *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1);
00193        }
00194 
00195 
00196 /*-- The first phoneme has to be processed specially. --*/
00197        /* Find our first letter */
00198        for (; !isalpha(Curr_Letter); w_idx++) {
00199               /* On the off chance we were given nothing but crap... */
00200               if (Curr_Letter == '\0') {
00201                      End_Phoned_Word
00202                             return SUCCESS;      /* For testing */
00203               }
00204        }
00205 
00206        switch (Curr_Letter) {
00207               /* AE becomes E */
00208        case 'A':
00209               if (Next_Letter == 'E') {
00210                      Phonize('E');
00211                      w_idx += 2;
00212               }
00213               /* Remember, preserve vowels at the beginning */
00214               else {
00215                      Phonize('A');
00216                      w_idx++;
00217               }
00218               break;
00219               /* [GKP]N becomes N */
00220        case 'G':
00221        case 'K':
00222        case 'P':
00223               if (Next_Letter == 'N') {
00224                      Phonize('N');
00225                      w_idx += 2;
00226               }
00227               break;
00228               /* WH becomes W, 
00229                  WR becomes R 
00230                  W if followed by a vowel */
00231        case 'W':
00232               if (Next_Letter == 'R') {
00233                      Phonize(Next_Letter);
00234                      w_idx += 2;
00235               } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
00236                      Phonize('W');
00237                      w_idx += 2;
00238               }
00239               /* else ignore */
00240               break;
00241               /* X becomes S */
00242        case 'X':
00243               Phonize('S');
00244               w_idx++;
00245               break;
00246               /* Vowels are kept */
00247               /* We did A already
00248                  case 'A':
00249                  case 'a':
00250                */
00251        case 'E':
00252        case 'I':
00253        case 'O':
00254        case 'U':
00255               Phonize(Curr_Letter);
00256               w_idx++;
00257               break;
00258        default:
00259               /* do nothing */
00260               break;
00261        }
00262 
00263 
00264 
00265        /* On to the metaphoning */
00266        for (; Curr_Letter != '\0' &&
00267                (max_phonemes == 0 || Phone_Len < max_phonemes);
00268                w_idx++) {
00269               /* How many letters to skip because an eariler encoding handled     
00270                * multiple letters */
00271               unsigned short int skip_letter = 0;
00272 
00273 
00274               /* THOUGHT:  It would be nice if, rather than having things like...
00275                * well, SCI.  For SCI you encode the S, then have to remember
00276                * to skip the C.  So the phonome SCI invades both S and C.  It would
00277                * be better, IMHO, to skip the C from the S part of the encoding.
00278                * Hell, I'm trying it.
00279                */
00280 
00281               /* Ignore non-alphas */
00282               if (!isalpha(Curr_Letter))
00283                      continue;
00284 
00285               /* Drop duplicates, except CC */
00286               if (Curr_Letter == Prev_Letter &&
00287                      Curr_Letter != 'C')
00288                      continue;
00289 
00290               switch (Curr_Letter) {
00291                      /* B -> B unless in MB */
00292               case 'B':
00293                      if (Prev_Letter != 'M')
00294                             Phonize('B');
00295                      break;
00296                      /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
00297                       * (SCHW is handled in S)
00298                       *  S if -CI-, -CE- or -CY-
00299                       *  dropped if -SCI-, SCE-, -SCY- (handed in S)
00300                       *  else K
00301                       */
00302               case 'C':
00303                      if (MAKESOFT(Next_Letter)) {       /* C[IEY] */
00304                             if (After_Next_Letter == 'A' &&
00305                                    Next_Letter == 'I') {       /* CIA */
00306                                    Phonize(SH);
00307                             }
00308                             /* SC[IEY] */
00309                             else if (Prev_Letter == 'S') {
00310                                    /* Dropped */
00311                             } else {
00312                                    Phonize('S');
00313                             }
00314                      } else if (Next_Letter == 'H') {
00315                             if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) {    /* Christ, School */
00316                                    Phonize('K');
00317                             } else {
00318                                    Phonize(SH);
00319                             }
00320                             skip_letter++;
00321                      } else {
00322                             Phonize('K');
00323                      }
00324                      break;
00325                      /* J if in -DGE-, -DGI- or -DGY-
00326                       * else T
00327                       */
00328               case 'D':
00329                      if (Next_Letter == 'G' &&
00330                             MAKESOFT(After_Next_Letter)) {
00331                             Phonize('J');
00332                             skip_letter++;
00333                      } else
00334                             Phonize('T');
00335                      break;
00336                      /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
00337                       * else dropped if -GNED, -GN, 
00338                       * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
00339                       * else J if in -GE-, -GI, -GY and not GG
00340                       * else K
00341                       */
00342               case 'G':
00343                      if (Next_Letter == 'H') {
00344                             if (!(NOGHTOF(Look_Back_Letter(3)) ||
00345                                      Look_Back_Letter(4) == 'H')) {
00346                                    Phonize('F');
00347                                    skip_letter++;
00348                             } else {
00349                                    /* silent */
00350                             }
00351                      } else if (Next_Letter == 'N') {
00352                             if (Isbreak(After_Next_Letter) ||
00353                                    (After_Next_Letter == 'E' &&
00354                                     Look_Ahead_Letter(3) == 'D')) {
00355                                    /* dropped */
00356                             } else
00357                                    Phonize('K');
00358                      } else if (MAKESOFT(Next_Letter) &&
00359                                       Prev_Letter != 'G') {
00360                             Phonize('J');
00361                      } else {
00362                             Phonize('K');
00363                      }
00364                      break;
00365                      /* H if before a vowel and not after C,G,P,S,T */
00366               case 'H':
00367                      if (isvowel(Next_Letter) &&
00368                             !AFFECTH(Prev_Letter))
00369                             Phonize('H');
00370                      break;
00371                      /* dropped if after C
00372                       * else K
00373                       */
00374               case 'K':
00375                      if (Prev_Letter != 'C')
00376                             Phonize('K');
00377                      break;
00378                      /* F if before H
00379                       * else P
00380                       */
00381               case 'P':
00382                      if (Next_Letter == 'H') {
00383                             Phonize('F');
00384                      } else {
00385                             Phonize('P');
00386                      }
00387                      break;
00388                      /* K
00389                       */
00390               case 'Q':
00391                      Phonize('K');
00392                      break;
00393                      /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
00394                       * else S
00395                       */
00396               case 'S':
00397                      if (Next_Letter == 'I' &&
00398                             (After_Next_Letter == 'O' ||
00399                              After_Next_Letter == 'A')) {
00400                             Phonize(SH);
00401                      } else if (Next_Letter == 'H') {
00402                             Phonize(SH);
00403                             skip_letter++;
00404                      } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
00405                             Phonize(SH);
00406                             skip_letter += 2;
00407                      } else {
00408                             Phonize('S');
00409                      }
00410                      break;
00411                      /* 'sh' in -TIA- or -TIO-
00412                       * else 'th' before H
00413                       * else T
00414                       */
00415               case 'T':
00416                      if (Next_Letter == 'I' &&
00417                             (After_Next_Letter == 'O' ||
00418                              After_Next_Letter == 'A')) {
00419                             Phonize(SH);
00420                      } else if (Next_Letter == 'H') {
00421                             Phonize(TH);
00422                             skip_letter++;
00423                      } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
00424                             Phonize('T');
00425                      }
00426                      break;
00427                      /* F */
00428               case 'V':
00429                      Phonize('F');
00430                      break;
00431                      /* W before a vowel, else dropped */
00432               case 'W':
00433                      if (isvowel(Next_Letter))
00434                             Phonize('W');
00435                      break;
00436                      /* KS */
00437               case 'X':
00438                      Phonize('K');
00439                      Phonize('S');
00440                      break;
00441                      /* Y if followed by a vowel */
00442               case 'Y':
00443                      if (isvowel(Next_Letter))
00444                             Phonize('Y');
00445                      break;
00446                      /* S */
00447               case 'Z':
00448                      Phonize('S');
00449                      break;
00450                      /* No transformation */
00451               case 'F':
00452               case 'J':
00453               case 'L':
00454               case 'M':
00455               case 'N':
00456               case 'R':
00457                      Phonize(Curr_Letter);
00458                      break;
00459               default:
00460                      /* nothing */
00461                      break;
00462               }                                         /* END SWITCH */
00463 
00464               w_idx += skip_letter;
00465        }                                                /* END FOR */
00466 
00467        End_Phoned_Word;
00468 
00469        return 0;
00470 }                                                       /* END metaphone */
00471 /* }}} */
00472 
00473 /*
00474  * Local variables:
00475  * tab-width: 4
00476  * c-basic-offset: 4
00477  * End:
00478  * vim600: sw=4 ts=4 fdm=marker
00479  * vim<600: sw=4 ts=4
00480  */