Back to index

courier  0.68.2
unicode_wordbreak.c
Go to the documentation of this file.
00001 /*
00002 ** Copyright 2011 Double Precision, Inc.
00003 ** See COPYING for distribution information.
00004 **
00005 */
00006 
00007 #include      "unicode_config.h"
00008 #include      "unicode.h"
00009 
00010 #include <unistd.h>
00011 #include <stdint.h>
00012 #include <stdlib.h>
00013 #include <string.h>
00014 #include <errno.h>
00015 
00016 #include "wordbreaktab_internal.h"
00017 #include "wordbreaktab.h"
00018 
00019 struct unicode_wb_info {
00020        int (*cb_func)(int, void *);
00021        void *cb_arg;
00022 
00023        uint8_t prevclass;
00024        size_t wb4_cnt;
00025 
00026        size_t wb4_extra_cnt;
00027 
00028        int (*next_handler)(unicode_wb_info_t, uint8_t);
00029        int (*end_handler)(unicode_wb_info_t);
00030 };
00031 
00032 static int sot(unicode_wb_info_t i, uint8_t cl);
00033 static int wb4(unicode_wb_info_t i);
00034 static int wb1and2_done(unicode_wb_info_t i, uint8_t cl);
00035 
00036 static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl);
00037 static int seen_wb67_end_handler(unicode_wb_info_t i);
00038 static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
00039 
00040 static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl);
00041 static int seen_wb1112_end_handler(unicode_wb_info_t i);
00042 static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
00043 
00044 unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
00045                               void *cb_arg)
00046 {
00047        unicode_wb_info_t i=calloc(1, sizeof(struct unicode_wb_info));
00048 
00049        if (!i)
00050               return NULL;
00051 
00052        i->next_handler=sot;
00053        i->cb_func=cb_func;
00054        i->cb_arg=cb_arg;
00055        return i;
00056 }
00057 
00058 int unicode_wb_end(unicode_wb_info_t i)
00059 {
00060        int rc;
00061 
00062        if (i->end_handler)
00063               rc=(*i->end_handler)(i);
00064        else
00065               rc=wb4(i);
00066 
00067        free(i);
00068        return rc;
00069 }
00070 
00071 int unicode_wb_next_cnt(unicode_wb_info_t i,
00072                      const unicode_char *chars,
00073                      size_t cnt)
00074 {
00075        int rc;
00076 
00077        while (cnt)
00078        {
00079               rc=unicode_wb_next(i, *chars++);
00080               --cnt;
00081               if (rc)
00082                      return rc;
00083        }
00084        return 0;
00085 }
00086 
00087 int unicode_wb_next(unicode_wb_info_t i, unicode_char ch)
00088 {
00089        return (*i->next_handler)
00090               (i, unicode_tab_lookup(ch,
00091                                    unicode_indextab,
00092                                    sizeof(unicode_indextab)
00093                                    / sizeof(unicode_indextab[0]),
00094                                    unicode_rangetab,
00095                                    unicode_classtab,
00096                                    UNICODE_WB_OTHER));
00097 }
00098 
00099 static int wb4(unicode_wb_info_t i)
00100 {
00101        int rc=0;
00102 
00103        while (i->wb4_cnt > 0)
00104        {
00105               --i->wb4_cnt;
00106 
00107               if (rc == 0)
00108                      rc=(*i->cb_func)(0, i->cb_arg);
00109        }
00110        return rc;
00111 }
00112 
00113 static int result(unicode_wb_info_t i, int flag)
00114 {
00115        int rc=wb4(i);
00116 
00117        if (rc == 0)
00118               rc=(*i->cb_func)(flag, i->cb_arg);
00119 
00120        return rc;
00121 }
00122 
/*
** Install the next-character and end-of-input handlers on the handle that
** is named `i` in the enclosing scope. Each argument is parenthesized in
** the expansion so that any expression may be passed safely.
*/
#define SET_HANDLER(next,end) (i->next_handler=(next), i->end_handler=(end))
00124 
00125 static int sot(unicode_wb_info_t i, uint8_t cl)
00126 {
00127        i->prevclass=cl;
00128        SET_HANDLER(wb1and2_done, NULL);
00129 
00130        return result(i, 1); /* WB1 */
00131 }
00132 
00133 static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)
00134 {
00135        uint8_t prevclass=i->prevclass;
00136 
00137        i->prevclass=cl;
00138 
00139        if (prevclass == UNICODE_WB_CR && cl == UNICODE_WB_LF)
00140               return result(i, 0); /* WB3 */
00141 
00142        switch (prevclass) {
00143        case UNICODE_WB_CR:
00144        case UNICODE_WB_LF:
00145        case UNICODE_WB_Newline:
00146               return result(i, 1); /* WB3a */
00147        }
00148 
00149        switch (cl) {
00150        case UNICODE_WB_CR:
00151        case UNICODE_WB_LF:
00152        case UNICODE_WB_Newline:
00153               return result(i, 1); /* WB3b */
00154        }
00155 
00156        if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
00157        {
00158               i->prevclass=prevclass;
00159               ++i->wb4_cnt;
00160               return 0; /* WB4 */
00161        }
00162 
00163        if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_ALetter)
00164        {
00165               return result(i, 0); /* WB5 */
00166        }
00167 
00168        if (prevclass == UNICODE_WB_ALetter &&
00169            (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet))
00170        {
00171               i->wb4_extra_cnt=0;
00172               SET_HANDLER(seen_wb67_handler, seen_wb67_end_handler);
00173               return 0;
00174        }
00175 
00176        return wb67_done(i, prevclass, cl);
00177 }
00178 
00179 /*
00180 **              ALetter     (MidLetter | MidNumLet )     ?
00181 **
00182 **                                  prevclass            cl
00183 **
00184 ** Seen ALetter (MidLetter | MidNumLet), with the second character's status
00185 ** not returned yet.
00186 */
00187 
00188 static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
00189 {
00190        int rc;
00191        uint8_t prevclass;
00192        size_t extra_cnt;
00193 
00194        if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
00195        {
00196               ++i->wb4_extra_cnt;
00197               return 0;
00198        }
00199 
00200        extra_cnt=i->wb4_extra_cnt;
00201 
00202        /*
00203        ** Reset the handler to the default, then check WB6
00204        */
00205 
00206        SET_HANDLER(wb1and2_done, NULL);
00207 
00208        if (cl == UNICODE_WB_ALetter)
00209        {
00210               rc=result(i, 0); /* WB6 */
00211               i->wb4_cnt=extra_cnt;
00212 
00213               if (rc == 0)
00214                      rc=result(i, 0); /* WB7 */
00215 
00216               i->prevclass=cl;
00217                      
00218               return rc;
00219        }
00220 
00221        prevclass=i->prevclass; /* This was the second character */
00222 
00223        /*
00224        ** Process the second character, starting with WB7
00225        */
00226 
00227        rc=wb67_done(i, UNICODE_WB_ALetter, prevclass);
00228 
00229        i->prevclass=prevclass;
00230        i->wb4_cnt=extra_cnt;
00231 
00232        if (rc == 0)
00233               rc=(*i->next_handler)(i, cl);
00234        /* Process the current char now */
00235 
00236        return rc;
00237 }
00238 
00239 /*
00240 ** Seen ALetter (MidLetter | MidNumLet), with the second character's status
00241 ** not returned yet, and now sot.
00242 */
00243 
00244 static int seen_wb67_end_handler(unicode_wb_info_t i)
00245 {
00246        int rc;
00247        size_t extra_cnt=i->wb4_extra_cnt;
00248 
00249        /*
00250        ** Process the second character, starting with WB7.
00251        */
00252 
00253        rc=wb67_done(i, UNICODE_WB_ALetter, i->prevclass);
00254        i->wb4_cnt=extra_cnt;
00255        if (rc == 0)
00256               rc=wb4(i);
00257        return rc;
00258 }
00259 
00260 
00261 static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
00262 {
00263        if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_Numeric)
00264               return result(i, 0); /* WB8 */
00265 
00266        if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_Numeric)
00267               return result(i, 0); /* WB9 */
00268 
00269        if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_ALetter)
00270               return result(i, 0); /* WB10 */
00271 
00272 
00273        if (prevclass == UNICODE_WB_Numeric &&
00274            (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet))
00275        {
00276               i->wb4_extra_cnt=0;
00277               SET_HANDLER(seen_wb1112_handler, seen_wb1112_end_handler);
00278               return 0;
00279        }
00280 
00281        return wb1112_done(i, prevclass, cl);
00282 }
00283 
00284 /*
00285 **              Numeric     (MidNum | MidNumLet )     ?
00286 **
00287 **                               prevclass            cl
00288 **
00289 ** Seen Numeric (MidNum | MidNumLet), with the second character's status
00290 ** not returned yet.
00291 */
00292 
00293 static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl)
00294 {
00295        int rc;
00296        uint8_t prevclass;
00297        size_t extra_cnt;
00298 
00299        if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
00300        {
00301               ++i->wb4_extra_cnt;
00302               return 0;
00303        }
00304 
00305        extra_cnt=i->wb4_extra_cnt;
00306 
00307        /*
00308        ** Reset the handler to the default, then check WB6
00309        */
00310 
00311        SET_HANDLER(wb1and2_done, NULL);
00312 
00313        if (cl == UNICODE_WB_Numeric)
00314        {
00315               rc=result(i, 0); /* WB11 */
00316               i->wb4_cnt=extra_cnt;
00317 
00318               if (rc == 0)
00319                      rc=result(i, 0); /* WB12 */
00320 
00321               i->prevclass=cl;
00322                      
00323               return rc;
00324        }
00325 
00326        prevclass=i->prevclass; /* This was the second character */
00327 
00328        /*
00329        ** Process the second character, starting with WB7
00330        */
00331 
00332        rc=wb1112_done(i, UNICODE_WB_Numeric, prevclass);
00333 
00334        i->prevclass=prevclass;
00335        i->wb4_cnt=extra_cnt;
00336 
00337        if (rc == 0)
00338               rc=(*i->next_handler)(i, cl);
00339        /* Process the current char now */
00340 
00341        return rc;
00342 }
00343 
00344 /*
00345 ** Seen Numeric (MidNum | MidNumLet), with the second character's status
00346 ** not returned yet, and now sot.
00347 */
00348 
00349 static int seen_wb1112_end_handler(unicode_wb_info_t i)
00350 {
00351        int rc;
00352        size_t extra_cnt=i->wb4_extra_cnt;
00353 
00354        /*
00355        ** Process the second character, starting with WB11.
00356        */
00357 
00358        rc=wb1112_done(i, UNICODE_WB_Numeric, i->prevclass);
00359        i->wb4_cnt=extra_cnt;
00360        if (rc == 0)
00361               rc=wb4(i);
00362        return rc;
00363 }
00364 
00365 static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
00366 {
00367        if (prevclass == UNICODE_WB_Katakana &&
00368            cl == UNICODE_WB_Katakana)
00369               return result(i, 0); /* WB13 */
00370 
00371        switch (prevclass) {
00372        case UNICODE_WB_ALetter:
00373        case UNICODE_WB_Numeric:
00374        case UNICODE_WB_Katakana:
00375        case UNICODE_WB_ExtendNumLet:
00376               if (cl == UNICODE_WB_ExtendNumLet)
00377                      return result(i, 0); /* WB13a */
00378        }
00379 
00380        if (prevclass == UNICODE_WB_ExtendNumLet)
00381               switch (cl) {
00382               case UNICODE_WB_ALetter:
00383               case UNICODE_WB_Numeric:
00384               case UNICODE_WB_Katakana:
00385                      return result(i, 0); /* WB13b */
00386               }
00387 
00388        return result(i, 1); /* WB14 */
00389 }
00390 
00391 /* --------------------------------------------------------------------- */
00392 
00393 struct unicode_wbscan_info {
00394        unicode_wb_info_t wb_handle;
00395 
00396        int found;
00397        size_t cnt;
00398 };
00399 
00400 static int unicode_wbscan_callback(int, void *);
00401 
00402 unicode_wbscan_info_t unicode_wbscan_init()
00403 {
00404        unicode_wbscan_info_t i=calloc(1, sizeof(struct unicode_wbscan_info));
00405 
00406        if (!i)
00407               return NULL;
00408 
00409        if ((i->wb_handle=unicode_wb_init(unicode_wbscan_callback, i)) == NULL)
00410        {
00411               free(i);
00412               return NULL;
00413        }
00414 
00415        return i;
00416 }
00417 
00418 int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch)
00419 {
00420        if (!i->found)
00421               unicode_wb_next(i->wb_handle, ch);
00422 
00423        return i->found;
00424 }
00425 
00426 size_t unicode_wbscan_end(unicode_wbscan_info_t i)
00427 {
00428        size_t n;
00429 
00430        unicode_wb_end(i->wb_handle);
00431 
00432        n=i->cnt;
00433        free(i);
00434        return n;
00435 }
00436 
00437 static int unicode_wbscan_callback(int flag, void *arg)
00438 {
00439        unicode_wbscan_info_t i=(unicode_wbscan_info_t)arg;
00440 
00441        if (flag && i->cnt > 0)
00442               i->found=1;
00443 
00444        if (!i->found)
00445               ++i->cnt;
00446        return 0;
00447 }
00448