Back to index

im-sdk  12.3.91
ParsePreedit.c
Go to the documentation of this file.
00001 #include  <stdio.h>
00002 #include  <string.h>
00003 #include  <stdlib.h>
00004 #include  "GeneType.h"
00005 #include  "PyBasic.h"
00006 #include  "ciku.h"
00007 
00008 VOID  DecompPeIntArray(JINT *pnOutPreedit, CHAR* szDspPreedit);
00009 JINT  MakeOPE(JINT nHalfMatchLen, JINT nIUVetc, JINT nYJCode);
00010 VOID  ParseRawInputStr(char* szPreedit, int* pnOutPreedit);
00011 JINT  GetNextUnit(char* szPreedit, JINT nPos, char* szNextUnit);
00012 JINT  MatchUnitByYinjie(char* szNextUnit);
00013 JINT  GetFirstYinJie(int* pnYJArray);
00014 
00015 /*
00016 **  Decompose pnOutPreedit Array to a szString.
00017 */
00018 VOID DecompPeIntArray(JINT *pnOutPreedit, CHAR* szDspPreedit)
00019 {
00020        JINT   i, j, tmp;
00021        JINT   nHalfMatchLen, nIUVetc, nYJCode;
00022        CHAR   szHalfMatch[7];
00023 
00024        i = 0;
00025        tmp = 1;
00026 
00027        for (i = 0; i < 256; i++)
00028               szDspPreedit[i] = '\0';
00029 
00030        for (i = 0; pnOutPreedit[i] != 0; )
00031        {
00032               tmp = pnOutPreedit[i];
00033               nYJCode = tmp & 0x01FF;
00034               nIUVetc = (tmp >> 9) & 0x07;
00035               nHalfMatchLen = (tmp >> 12) & 0x0F;
00036 
00037               if (nIUVetc != 0)
00038               {
00039                      if (nIUVetc == 4)
00040                      {
00041                             strcat (szDspPreedit, "\'");
00042                             i++;
00043                      }
00044                      else
00045                      {
00046                             /*
00047                             **  Add a space before [iuv] except that this is
00048                             **  the first CHAR in Preedit string or a ['ivu]
00049                             **  ahead it.
00050                             **  MakeOPE(0, 4, 0) is 0000 100 000000000 = 0x0800 [']
00051                             **  MakeOPE(0, 1, 0) is 0000 001 000000000 = 0x0200 [i]
00052                             **  MakeOPE(0, 2, 0) is 0000 010 000000000 = 0x0400 [u]
00053                             **  MakeOPE(0, 3, 0) is 0000 011 000000000 = 0x0600 [v]
00054                             */
00055                             if ( (i > 0) && (pnOutPreedit[i-1] != 0x0800)    \
00056                                         && (pnOutPreedit[i-1] != 0x0200)     \
00057                                         && (pnOutPreedit[i-1] != 0x0400)     \
00058                                         && (pnOutPreedit[i-1] != 0x0600) )
00059                                    strcat (szDspPreedit, " ");
00060 
00061                             if (nIUVetc == 1)
00062                                    strcat (szDspPreedit, "i");
00063                             else  if (nIUVetc == 2)
00064                                    strcat (szDspPreedit, "u");
00065                             else  if (nIUVetc == 3)
00066                                    strcat (szDspPreedit, "v");
00067                             i++;
00068                      }
00069               }
00070               else if ((nIUVetc == 0) && (nHalfMatchLen == 6))
00071               {
00072                      /*
00073                      ** A valid Yinjie code. Add a SPACE before
00074                      ** Yinjie string expect that there is a ['] ahead or it is the first.
00075                      ** MakeOPE(0, 4, 0) is 0000 100 000000000 = 0x0800
00076                      */
00077                      if ((i > 0) && (pnOutPreedit[i-1] != 0x0800))
00078                             strcat (szDspPreedit, " ");
00079 
00080                      if (nYJCode >= 450)
00081                             strcat (szDspPreedit, SHENGMUSTR[nYJCode - 450]);
00082                      else if (nYJCode < 450)
00083                             strcat (szDspPreedit, YINJIESTR_CSZ[nYJCode]);
00084 
00085                      i++;
00086               }
00087               else if ((nHalfMatchLen > 0) && (nHalfMatchLen < 6) && (nIUVetc == 0))
00088               {
00089                      for (j = 0; j < nHalfMatchLen; j++)
00090                             szHalfMatch[j] = (char)pnOutPreedit[i + j + 1];
00091                      szHalfMatch[j] = '\0';
00092                      if ((i > 0) && (pnOutPreedit[i-1] != 0x0800))
00093                             strcat (szDspPreedit, " ");
00094                      strcat (szDspPreedit, szHalfMatch);
00095                      i += (j + 1);
00096               }
00097               else
00098               {
00099                      i++;
00100                      fprintf (stderr, "Error in DecompPeIntArray()\n");
00101               }
00102        }
00103 }
00104 
00105 
00106 JINT MakeOPE(JINT nHalfMatchLen, JINT nIUVetc, JINT nYJCode)
00107 {
00108        return (nYJCode + (nIUVetc << 9) + (nHalfMatchLen << 12));
00109 }
00110 
00111 
00112 /*
00113 **  szPreedit[] contains only [a]~[z] and [']. No other chars included.
00114 **  pnOutPreedit points to a array that contains the parsed result in the
00115 **  following format:
00116 **     |  4Bits   |  3Bits   |           9Bits      |
00117 **     ===========++++++++++++=====================
00118 **     4Bits: nHalfMatchLen ==> Indicates that the following integer is
00119 **            HalfMatched Pinyin string. 0x61[a] ~ 0x7A[z].
00120 **            Min is 1, Max is 5.  If it is 7, indicates 9Bits is YJCode.
00121 **     3Bits: 000 ==> Default
00122 **            001 ==> [i]
00123 **            010 ==> [u]
00124 **            011 ==> [v]
00125 **            100 ==> [']
00126 **     9Bits: 0 ~ 511       Yinjie Code or Shengmu code
00127 **            Same as notes before function MatchUnitByYinjie().
00128 */
00129 
00130 VOID ParseRawInputStr(char* szPreedit, int* pnOutPreedit)
00131 {
00132        CHAR   szNextUnit[7];
00133        JINT   nPELen;
00134        JINT   nCurOff, nOPEOff, nRef, nPy;
00135        JINT   nMatchFlag,  nMatchSMLen,  nMatchYMLenFit,  nMatchYMLenMax,  nMatchYinjie;
00136        JINT   nMatchFlag2, nMatchSMLen2, nMatchYMLenFit2, nMatchYMLenMax2, nMatchYinjie2;
00137        JINT   i, j, tmp, tmp1, tmp2;
00138        CHAR   cEnd, cNext;
00139 
00140        JINT   nHalfMatchLen, nIUVetc, nYJCode;
00141 
00142        nHalfMatchLen = nIUVetc = nYJCode = 0;
00143        nCurOff = 0;
00144        nRef = nPy = 0;
00145        nOPEOff = 0;
00146 
00147        nPELen = (int)strlen(szPreedit);
00148 
00149        while (nPELen > nCurOff)
00150        {
00151               for (i = 0; i < 7; i++)
00152                      szNextUnit[i] = '\0';
00153 
00154               tmp = GetNextUnit(szPreedit, nCurOff, szNextUnit);
00155               nRef = (tmp >> 8) & 0x00FF;
00156               nPy = tmp & 0x00FF;
00157 
00158               /* a ['] is the front of szPreedit[nCurOff] */
00159               if (nRef == 1)
00160               {
00161                      pnOutPreedit[nOPEOff] = MakeOPE (0, 4, 0);
00162                      nOPEOff ++;
00163                      nCurOff ++;
00164               }
00165 
00166               if (nPy == 0)
00167                      nCurOff = nPELen;    /* break;  OR: nCurOff = nPELen; */
00168               else                        /* nPy > 0 */
00169               {
00170                      tmp = MatchUnitByYinjie (szNextUnit);
00171                      nMatchYMLenFit = (tmp >> 16) & 0x07;
00172                      nMatchFlag = (tmp >> 13) & 0x07;
00173                      nMatchSMLen = (tmp >> 12) & 0x01;
00174                      nMatchYMLenMax = (tmp >> 9) & 0x07;
00175                      nMatchYinjie = tmp & 0x01FF;
00176 
00177                      tmp1 = nCurOff + nMatchSMLen + nMatchYMLenMax + 1;
00178 
00179                      if (nMatchFlag == 1)
00180                      {
00181                             /* The first Char is [i][u][v] */
00182                             if (szNextUnit[0] == 'i')
00183                                    tmp = 1;
00184                             else if (szNextUnit[0] == 'u')
00185                                    tmp = 2;
00186                             else if (szNextUnit[0] == 'v')
00187                                    tmp = 3;
00188 
00189                             pnOutPreedit[nOPEOff] = MakeOPE(0, tmp, 0);
00190                             nOPEOff ++;
00191                             nCurOff ++;
00192                      }
00193                      /*
00194                      **   A Bug Fixed: ORG: zhonw => zhon w // NEW: zhonw => zh o n w
00195                      **   97-10-5
00196                      */
00197                      else if ( (nMatchFlag == 2) && (tmp1 == nPELen) )
00198                      {
00199                             nHalfMatchLen = nMatchSMLen + 1 + nMatchYMLenMax;
00200                             pnOutPreedit[nOPEOff] = MakeOPE (nHalfMatchLen, 0, 0);
00201                             nOPEOff ++;
00202                             for (j = 0; j < (nMatchSMLen + nMatchYMLenMax + 1); j++ )
00203                             {
00204                                    pnOutPreedit[nOPEOff] = (int)szNextUnit[j];
00205                                    nOPEOff ++;
00206                                    nCurOff ++;
00207                             }
00208                      }
00209                      else if (nMatchFlag == 0)
00210                      {
00211                             cEnd = szPreedit[nCurOff + nMatchSMLen + nMatchYMLenFit];
00212                             cNext = szPreedit[nCurOff + nMatchSMLen + 1 + nMatchYMLenFit];
00213 
00214                             /*
00215                             ** G[aeou]  //       N[aeiouv]  //  R[aeiou]
00216                             */
00217                             if( ((cEnd == 'g') && ((cNext == 'a') || (cNext == 'e') ||     \
00218                                                  (cNext == 'o') || (cNext == 'u') ))       \
00219                              || ((cEnd == 'n') && ((cNext == 'a') || (cNext == 'e') ||     \
00220                                                  (cNext == 'i') || (cNext == 'o') ||       \
00221                                                  (cNext == 'u') || (cNext == 'v') ))       \
00222                              || ((cEnd == 'r') && ((cNext == 'a') || (cNext == 'e') ||     \
00223                                                  (cNext == 'i') || (cNext == 'o') ||       \
00224                                                  (cNext == 'u') )) )
00225                             {
00226                                    for (i = 0; i < (nMatchSMLen + nMatchYMLenFit); i++)
00227                                           szNextUnit[i] = szPreedit[nCurOff + i];
00228                                    for (i = (nMatchSMLen + nMatchYMLenFit); i < 7; i++)
00229                                           szNextUnit[i] = '\0';
00230 
00231                                    tmp2 = MatchUnitByYinjie (szNextUnit);
00232                                    nMatchYMLenFit2 = (tmp2 >> 16) & 0x07;
00233                                    nMatchFlag2 = (tmp2 >> 13) & 0x07;
00234                                    nMatchSMLen2 = (tmp2 >> 12) & 0x01;
00235                                    nMatchYMLenMax2 = (tmp2 >> 9) & 0x07;
00236                                    nMatchYinjie2 = tmp2 & 0x01FF;
00237 
00238                                    /*
00239                                    ** The following condition may be adjusted to get more
00240                                    ** precision and general. Thus, following syntax is equal.
00241                                    ** eran <==> er'an  ana <==> an'a  IS IT BEST????
00242                                    */
00243                                    if (nMatchFlag2 == 0)
00244                                    {
00245                                           nMatchYinjie = nMatchYinjie2;
00246                                           nMatchSMLen = nMatchSMLen2;
00247                                           nMatchYMLenFit = nMatchYMLenFit2;
00248                                    }
00249                             }
00250 
00251                             pnOutPreedit[nOPEOff] = MakeOPE (6, 0, nMatchYinjie);
00252                             nOPEOff ++;
00253                             nCurOff += (nMatchSMLen + 1 + nMatchYMLenFit);
00254                      }
00255                      else
00256                      {
00257                             /* nMatchFlag = 3, or 2 in middle */
00258                             /* Setting nHalfMatchLen to 6 to avoid MakeOPE(0, 0, 0) */
00259 
00260                             pnOutPreedit[nOPEOff] = MakeOPE (6, 0, nMatchYinjie);
00261                             nOPEOff ++;
00262                             nCurOff += (nMatchSMLen + 1 + nMatchYMLenFit);
00263                      }
00264               }
00265 
00266        } /* End of While() */
00267        pnOutPreedit[nOPEOff] = 0;
00268 }
00269 
00270 
00271 /*
00272 **  szPreedit[] consisted of [a]~[z] ['] and [\0]
00273 **  nOff indicates the start offset to get next unit
00274 **  szNextUnit points to the buffer to store next unit
00275 **  The return JINT indicates the chars have been read from szPreedit.
00276 **                |  8Bits  |       8Bits |
00277 **                ==========-----------
00278 **                =: 00000001       ==> a ['] in the front of szPreedit[nOff]
00279 **                -: 00000xxx       ==> 0-1-2-3-4-5-6 Length of this unit[a]~[z]
00280 */
00281 
00282 JINT GetNextUnit(char* szPreedit, JINT nOff, char* szNextUnit)
00283 {
00284        JINT   nRef, nPy;
00285        JINT   i;
00286 
00287        nRef = nPy = 0;
00288        i = nOff;
00289 
00290        /* Skip ['] at position szPreedit[nOff] */
00291        if (szPreedit[i] == '\'')
00292        {
00293               nRef++;
00294               i++;
00295        }
00296 
00297        while( (szPreedit[i] != '\0') && (szPreedit[i] != '\'') && (nPy <= 6) )
00298        {
00299               szNextUnit[nPy] = szPreedit[i];
00300               i++;
00301               nPy++;
00302        }
00303        if (nPy == 7)
00304               nPy = 6;
00305 
00306        return ((nRef << 8) + nPy);
00307 }
00308 
00309 
00310 /*
00311 **  Lookup the SHENGMUSTR[] and YINJIESTR_CSZ[] to determine the max longest YINJIE
00312 **  which match szNextUnit from header. return the value which indicate
00313 **  the YINJIE.
00314 **  RETURN JINT          Range 0~414:   YINJIESTR_CSZ[return]
00315 **                Range > 450:   SHENGMUSTR[return - 450]
00316 **                     Notes:   return-450 don't contains ['a' 'e' 'm' 'n' 'o']
00317 **
00318 **                |<= High ......................  Low =>|
00319 **                |  3Bits  |       1Bit  |  3Bits       |  9Bits  |
00320 **                 0 - 8   9Bits:  0~414  & 450 + Shengmu
00321 **                               0x01FF indicates invalid.
00322 **                 9 - 11  3Bits:  1-2-3-4  Length of Matched Yunmu chars
00323 **                12      1Bit:   0-1 Length of Shengmu, 1 => [ch][sh][zh]
00324 **                13 - 15  4Bits:  000  ==> Full Matched
00325 **                               001  ==> Invalid Char [i][u][v]
00326 **                               010  ==> Half Matched. ex, "zho".    Length is 3
00327 **                               011  ==> Match ShengMu only
00328 **
00329 **  Question:     How about if the first CHAR in szNextUnit is [i][u][v]?
00330 **                How to convert [u] and [v] automatically?
00331 **                How to process half match question? for example, "zho"?
00332 */
00333 
00334 /*
00335 **                How to make this function match to mode "mohu pinyin"?
00336 **                          include:  l/n, n/ng/, f/h,  zh ch sh/ z c s
00337 **                NO, MOHU is only considered during "YinjieToHzstr".
00338 */
00339 JINT MatchUnitByYinjie(char* szNextUnit)
00340 {
00341        JINT   i, j;
00342        CHAR   cFirstCh;
00343        JINT   nShengmuLen, nFromYJStr, nToYJStr, nShengmuIndex;
00344        JINT   nMatchYinjie, nMatchYMLenFit, nMatchSMLen, nMatchFlag, nMatchYMLenMax;
00345        JINT   tmp;
00346 
00347        cFirstCh = szNextUnit[0];
00348 
00349        nMatchYinjie = nMatchYMLenFit = nMatchYMLenMax = nMatchSMLen = nMatchFlag = 0;
00350 
00351        if ((cFirstCh == 'i') || (cFirstCh == 'u') || (cFirstCh == 'v'))
00352        {
00353               nMatchSMLen = 0;     /* Indicates 1 */
00354               nMatchYMLenFit = 0;
00355               nMatchYMLenMax = 0;
00356               nMatchFlag = 1;             /* Invalid Char [i][u][v] */
00357               nMatchYinjie = 0x1FF;       /* Invalid Shenmu */
00358        }
00359        else if ((cFirstCh >= 'a') && (cFirstCh <= 'z'))
00360        {
00361               nMatchSMLen = 0;
00362               nMatchYMLenMax = 0;
00363               nMatchYMLenFit = 0;
00364 
00365               if ((cFirstCh == 'c') && (szNextUnit[1] == 'h'))
00366               {
00367                      nFromYJStr = INDEXSMTOYINJIE[3];
00368                      nToYJStr = INDEXSMTOYINJIE[4];
00369                      nMatchSMLen = 1;
00370                      nShengmuIndex = 3;   /* [ch], same as INDEXMAGIC[(int)('i' - 'a')] */
00371               }
00372               else if ((cFirstCh == 's') && (szNextUnit[1] == 'h'))
00373               {
00374                      nFromYJStr = INDEXSMTOYINJIE[19];
00375                      nToYJStr = INDEXSMTOYINJIE[20];
00376                      nMatchSMLen = 1;
00377                      nShengmuIndex = 19;  /* [sh], same as INDEXMAGIC[(int)('u' - 'a')] */
00378               }
00379               else if ((cFirstCh == 'z') && (szNextUnit[1] == 'h'))
00380               {
00381                      nFromYJStr = 395; /*INDEXSMTOYINJIE[25]; */
00382                      nToYJStr = 415; /*INDEXSMTOYINJIE[26]; */
00383                      nMatchSMLen = 1;
00384                      nShengmuIndex = 25;  /* [zh], same as INDEXMAGIC[(int)('v' - 'a')] */
00385               }
00386               else
00387               {
00388                      nFromYJStr = INDEXSMTOYINJIE[ INDEXMAGIC[(int)(cFirstCh - 'a')] ];
00389                      nToYJStr = INDEXSMTOYINJIE[ INDEXMAGIC[(int)(cFirstCh - 'a')] + 1];
00390                      nMatchSMLen = 0;
00391                      nShengmuIndex = INDEXMAGIC[(int)(cFirstCh - 'a')];
00392               }
00393 
00394               nShengmuLen = nMatchSMLen + 1;
00395 
00396               nMatchYinjie = 450 + nShengmuIndex;
00397               nMatchFlag = 3;                           /* Match ShengMu Only */
00398 
00399               for (i = nFromYJStr; i < nToYJStr; i++)
00400               {
00401                      for (j = nShengmuLen; ((szNextUnit[j] == YINJIESTR_CSZ[i][j]) && (YINJIESTR_CSZ[i][j] != '\0')); )
00402                      {
00403                             j += 1;
00404                      /* Null Loop Here. It is Right */
00405                      }
00406 
00407                      /*
00408                      ** [cu] => [cuan],  [su] => [suan],  [zu] => [zuan] ??
00409                      ** Get the Max Matchment Length among YINJIESTR_CSZ[i],
00410                      ** where, [i] is between nFromYJStr and nToYJStr.
00411                      ** Example, [su] and [suan] is both valid Yinjie, but [sua] is invalid.
00412                      ** How to process if user input only [sua]?
00413                      */
00414                      /*
00415                      ** !!!!NOTE!!!! The following condition cannot be changed to
00416                      ** !!!!NOTE!!!! ((j - nShengmuLen) >= nMatchYMLenMax)  ......
00417                      */
00418                      if ( ((j - nShengmuLen) > nMatchYMLenMax) && ((j - nShengmuLen) > 0) )
00419                      {
00420                             nMatchYMLenMax = j - nShengmuLen;
00421                             if (YINJIESTR_CSZ[i][j] != '\0')
00422                             {
00423                                    nMatchFlag = 2;             /* Half Matchment. [cua] */
00424                             }
00425                             else
00426                             {
00427                                    nMatchYinjie = i;
00428                                    nMatchFlag = 0;             /* Full Matchment */
00429                                    nMatchYMLenFit = nMatchYMLenMax;
00430                             }
00431                      }
00432               }
00433        }
00434        else
00435        {
00436               nMatchYinjie = nMatchYMLenFit = nMatchYMLenMax = nMatchSMLen = nMatchFlag = 0;
00437        }
00438 
00439        tmp = nMatchYinjie + (nMatchYMLenMax << 9) + (nMatchSMLen << 12) + (nMatchFlag << 13) + (nMatchYMLenFit << 16);
00440 
00441        return (tmp);
00442 }
00443 
00444 
00445 /*
00446 **  Return YinJie Code if the first is a valid YinJie. Else return -1
00447 **  Valid YinJie Code is [0 ~ 414,  and [450 ~ 475]
00448 **  If there is a ['] before the next valid Yinjie, (0x00010000 + nYJCode) is returned
00449 */
00450 JINT GetFirstYinJie(int* pnYJArray)
00451 {
00452        JINT   nHalfMatchLen, nIUVetc, nYJCode;
00453        JINT   nTmp, nTmp1;
00454 
00455        nTmp = pnYJArray[0];
00456        nTmp1 = pnYJArray[1];
00457 
00458        if (nTmp != 0)
00459        {
00460               nYJCode = nTmp & 0x01FF;
00461               nIUVetc = (nTmp >> 9) & 0x07;
00462               nHalfMatchLen = (nTmp >> 12) & 0x0F;
00463 
00464               if ((nIUVetc == 0) && (nHalfMatchLen == 6))
00465                      return nYJCode;
00466               else if (nIUVetc == 4)             /* ['] */
00467               {
00468                      nYJCode = nTmp1 & 0x01FF;
00469                      nIUVetc = (nTmp1 >> 9) & 0x07;
00470                      nHalfMatchLen = (nTmp1 >> 12) & 0x0F;
00471 
00472                      if ((nIUVetc == 0) && (nHalfMatchLen == 6))
00473                             return (nYJCode + 0x00010000);
00474                      else
00475                             return -1;
00476               }
00477               else
00478                      return -1;
00479        }
00480        else
00481               return -1;
00482 }
00483 
00484