Back to index

courier  0.68.2
unicode_linebreak.c
Go to the documentation of this file.
00001 /*
00002 ** Copyright 2011 Double Precision, Inc.
00003 ** See COPYING for distribution information.
00004 **
00005 */
00006 
00007 #include      "unicode_config.h"
00008 #include      "unicode.h"
00009 
00010 #include <unistd.h>
00011 #include <stdint.h>
00012 #include <stdlib.h>
00013 #include <string.h>
00014 #include <errno.h>
00015 
00016 #include "linebreaktab_internal.h"
00017 
00018 #include "linebreaktab.h"
00019 
/*
** Pseudo linebreak class meaning "start of text": stored as the "previous
** class" by unicode_lb_reset(), before any character has been processed.
*/
#define UNICODE_LB_SOT	0xFF
00021 
/*
** State of one linebreaking pass. Allocated by unicode_lb_init() and
** released by unicode_lb_end().
*/
struct unicode_lb_info {
	/* Callback that receives each linebreak value, with its argument */
	int (*cb_func)(int, void *);
	void *cb_arg;

	/* UNICODE_LB_OPT_* flags, see unicode_lb_set_opts() */
	int opts;

	/*
	** LB25 lookahead: class of the (OP|HY) character whose result is
	** being withheld, and the number of CM characters seen after it.
	*/
	uint8_t savedclass;
	size_t savedcmcnt;

	/*
	** Linebreak class of the previous character, and of the previous
	** character that was not SP.
	*/
	uint8_t prevclass;
	uint8_t prevclass_nsp;

	/* Current handlers for the next character, and for end of input */
	int (*next_handler)(struct unicode_lb_info *, uint8_t);
	int (*end_handler)(struct unicode_lb_info *);
};
00037 
00038 
00039 /* http://www.unicode.org/reports/tr14/#Algorithm */
00040 
00041 static int next_def(unicode_lb_info_t, uint8_t);
00042 static int end_def(unicode_lb_info_t);
00043 
00044 static int next_lb25_seenophy(unicode_lb_info_t, uint8_t);
00045 static int end_lb25_seenophy(unicode_lb_info_t);
00046 
00047 static int next_lb25_seennu(unicode_lb_info_t, uint8_t);
00048 
00049 static int next_lb25_seennuclcp(unicode_lb_info_t, uint8_t);
00050 
00051 static void unicode_lb_reset(unicode_lb_info_t i)
00052 {
00053        i->prevclass=i->prevclass_nsp=UNICODE_LB_SOT;
00054        i->next_handler=next_def;
00055        i->end_handler=end_def;
00056 }
00057 
00058 unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
00059                               void *cb_arg)
00060 {
00061        unicode_lb_info_t i=calloc(1, sizeof(struct unicode_lb_info));
00062 
00063        i->cb_func=cb_func;
00064        i->cb_arg=cb_arg;
00065 
00066        unicode_lb_reset(i);
00067        return i;
00068 }
00069 
00070 int unicode_lb_end(unicode_lb_info_t i)
00071 {
00072        int rc=(*i->end_handler)(i);
00073 
00074        free(i);
00075        return rc;
00076 }
00077 
00078 void unicode_lb_set_opts(unicode_lb_info_t i, int opts)
00079 {
00080        i->opts=opts;
00081 }
00082 
00083 /* Default end handler has nothing to do */
00084 
00085 static int end_def(unicode_lb_info_t i)
00086 {
00087        /* LB3 N/A */
00088        return 0;
00089 }
/*
** Report linebreak value x to the callback. Expects a handle variable
** named "i" in the enclosing scope; evaluates to the callback's return value.
*/
#define RESULT(x) ((*i->cb_func)((x), i->cb_arg))
00091 
00092 int unicode_lb_next_cnt(unicode_lb_info_t i,
00093                      const unicode_char *chars,
00094                      size_t cnt)
00095 {
00096        while (cnt)
00097        {
00098               int rc=unicode_lb_next(i, *chars);
00099 
00100               if (rc)
00101                      return rc;
00102 
00103               ++chars;
00104               --cnt;
00105        }
00106        return 0;
00107 }
00108 
00109 int unicode_lb_lookup(unicode_char ch)
00110 {
00111        return unicode_tab_lookup(ch,
00112                               unicode_indextab,
00113                               sizeof(unicode_indextab)
00114                               / sizeof(unicode_indextab[0]),
00115                               unicode_rangetab,
00116                               unicode_classtab,
00117                               UNICODE_LB_AL /* XX, LB1 */);
00118 }
00119 
00120 int unicode_lb_next(unicode_lb_info_t i,
00121                   unicode_char ch)
00122 {
00123        return (*i->next_handler)(i, (i->opts & UNICODE_LB_OPT_DASHWJ) &&
00124                               (ch == 0x2012 || ch == 0x2013)
00125                               ? UNICODE_LB_WJ:unicode_lb_lookup(ch));
00126 }
00127 
00128 static int next_def_nolb25(unicode_lb_info_t i,
00129                         uint8_t uclass,
00130                         int nolb25);
00131 
00132 /*
00133 ** Default logic for next unicode char.
00134 */
00135 static int next_def(unicode_lb_info_t i,
00136                   uint8_t uclass)
00137 {
00138        return next_def_nolb25(i, uclass, 0);
00139 }
00140 
00141 static int next_def_nolb25(unicode_lb_info_t i,
00142                         uint8_t uclass,
00143 
00144                         /* Flag -- recursively invoked after discarding LB25 */
00145                         int nolb25)
00146 {
00147 
00148        /* Retrieve the previous unicode character's linebreak class. */
00149 
00150        uint8_t prevclass=i->prevclass;
00151        uint8_t prevclass_nsp=i->prevclass_nsp;
00152 
00153        /* Save this unicode char's linebreak class, for the next goaround */
00154        i->prevclass=uclass;
00155 
00156        if (uclass != UNICODE_LB_SP)
00157               i->prevclass_nsp=uclass;
00158 
00159        if (uclass == UNICODE_LB_NU)
00160               i->next_handler=next_lb25_seennu; /* LB25 */
00161 
00162        if (prevclass == UNICODE_LB_SOT)
00163        {
00164               if (uclass == UNICODE_LB_CM) /* LB9 */
00165                      i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
00166 
00167               return RESULT(UNICODE_LB_NONE); /* LB2 */
00168        }
00169 
00170        if (prevclass == UNICODE_LB_CR && uclass == UNICODE_LB_LF)
00171               return RESULT(UNICODE_LB_NONE); /* LB5 */
00172 
00173        switch (prevclass) {
00174        case UNICODE_LB_BK:
00175        case UNICODE_LB_CR:
00176        case UNICODE_LB_LF:
00177        case UNICODE_LB_NL:
00178 
00179               if (uclass == UNICODE_LB_CM)
00180               {
00181                      i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
00182                      /* LB9 */
00183               }
00184 
00185               return RESULT(UNICODE_LB_MANDATORY); /* LB4, LB5 */
00186 
00187        case UNICODE_LB_SP:
00188        case UNICODE_LB_ZW:
00189               if (uclass == UNICODE_LB_CM)
00190                      i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
00191               /* LB10 */
00192               break;
00193        default:
00194               break;
00195        }
00196 
00197        switch (uclass) {
00198 
00199               /* LB6: */
00200        case UNICODE_LB_BK:
00201        case UNICODE_LB_CR:
00202        case UNICODE_LB_LF:
00203        case UNICODE_LB_NL:
00204 
00205               /* LB7: */
00206        case UNICODE_LB_SP:
00207        case UNICODE_LB_ZW:
00208 
00209               return RESULT(UNICODE_LB_NONE);
00210        default:
00211               break;
00212        }
00213 
00214        if (prevclass_nsp == UNICODE_LB_ZW)
00215               return RESULT(UNICODE_LB_ALLOWED); /* LB8 */
00216 
00217        if (uclass == UNICODE_LB_CM)
00218        {
00219               i->prevclass=prevclass;
00220               i->prevclass_nsp=prevclass_nsp;
00221               return RESULT(UNICODE_LB_NONE); /* LB9 */
00222        }
00223 
00224        if (prevclass == UNICODE_LB_WJ || uclass == UNICODE_LB_WJ)
00225               return RESULT(UNICODE_LB_NONE); /* LB11 */
00226 
00227        if (prevclass == UNICODE_LB_GL)
00228               return RESULT(UNICODE_LB_NONE); /* LB12 */
00229 
00230        if (uclass == UNICODE_LB_GL &&
00231            prevclass != UNICODE_LB_SP &&
00232            prevclass != UNICODE_LB_BA &&
00233            prevclass != UNICODE_LB_HY)
00234               return RESULT(UNICODE_LB_NONE); /* LB12a */
00235 
00236 
00237        switch (uclass) {
00238        case UNICODE_LB_SY:
00239               if (i->opts & UNICODE_LB_OPT_SYBREAK)
00240               {
00241                      if (prevclass == UNICODE_LB_SP)
00242                             return RESULT(UNICODE_LB_ALLOWED);
00243               }
00244 
00245        case UNICODE_LB_CL:
00246        case UNICODE_LB_CP:
00247        case UNICODE_LB_EX:
00248        case UNICODE_LB_IS:
00249               return RESULT(UNICODE_LB_NONE); /* LB13 */
00250        default:
00251               break;
00252        }
00253 
00254        if ((i->opts & UNICODE_LB_OPT_SYBREAK) && prevclass == UNICODE_LB_SY)
00255               switch (uclass) {
00256               case UNICODE_LB_EX:
00257               case UNICODE_LB_AL:
00258               case UNICODE_LB_ID:
00259                      return RESULT(UNICODE_LB_NONE);
00260               }
00261 
00262        if (prevclass_nsp == UNICODE_LB_OP)
00263               return RESULT(UNICODE_LB_NONE); /* LB14 */
00264 
00265        if (prevclass_nsp == UNICODE_LB_QU && uclass == UNICODE_LB_OP)
00266               return RESULT(UNICODE_LB_NONE); /* LB15 */
00267 
00268        if ((prevclass_nsp == UNICODE_LB_CL || prevclass_nsp == UNICODE_LB_CP)
00269            && uclass == UNICODE_LB_NS)
00270               return RESULT(UNICODE_LB_NONE); /* LB16 */
00271 
00272        if (prevclass_nsp == UNICODE_LB_B2 && uclass == UNICODE_LB_B2)
00273               return RESULT(UNICODE_LB_NONE); /* LB17 */
00274 
00275        if (prevclass == UNICODE_LB_SP)
00276               return RESULT(UNICODE_LB_ALLOWED); /* LB18 */
00277 
00278        if (uclass == UNICODE_LB_QU || prevclass == UNICODE_LB_QU)
00279               return RESULT(UNICODE_LB_NONE); /* LB19 */
00280 
00281        if (uclass == UNICODE_LB_CB || prevclass == UNICODE_LB_CB)
00282               return RESULT(UNICODE_LB_ALLOWED); /* LB20 */
00283 
00284        /* LB21: */
00285 
00286        switch (uclass) {
00287        case UNICODE_LB_BA:
00288        case UNICODE_LB_HY:
00289        case UNICODE_LB_NS:
00290               return RESULT(UNICODE_LB_NONE);
00291        default:
00292               break;
00293        }
00294 
00295        if (prevclass == UNICODE_LB_BB)
00296               return RESULT(UNICODE_LB_NONE);
00297 
00298        if (uclass == UNICODE_LB_IN)
00299               switch (prevclass) {
00300               case UNICODE_LB_AL:
00301               case UNICODE_LB_ID:
00302               case UNICODE_LB_IN:
00303               case UNICODE_LB_NU:
00304                      return RESULT(UNICODE_LB_NONE); /* LB22 */
00305               default:
00306                      break;
00307               }
00308 
00309 
00310        if (prevclass == UNICODE_LB_ID && uclass == UNICODE_LB_PO)
00311               return RESULT(UNICODE_LB_NONE); /* LB23 */
00312        if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_NU)
00313               return RESULT(UNICODE_LB_NONE); /* LB23 */
00314 
00315        if (prevclass == UNICODE_LB_NU && uclass == UNICODE_LB_AL)
00316               return RESULT(UNICODE_LB_NONE); /* LB23 */
00317 
00318 
00319        if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_ID)
00320               return RESULT(UNICODE_LB_NONE); /* LB24 */
00321        if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_AL)
00322               return RESULT(UNICODE_LB_NONE); /* LB24 */
00323        if (prevclass == UNICODE_LB_PO && uclass == UNICODE_LB_AL)
00324               return RESULT(UNICODE_LB_NONE); /* LB24 */
00325 
00326        if ((i->opts & UNICODE_LB_OPT_PRBREAK) && uclass == UNICODE_LB_PR)
00327               switch (prevclass) {
00328               case UNICODE_LB_PR:
00329               case UNICODE_LB_AL:
00330               case UNICODE_LB_ID:
00331                      return RESULT(UNICODE_LB_NONE);
00332               }
00333               
00334        if (!nolb25 &&
00335            (prevclass == UNICODE_LB_PR || prevclass == UNICODE_LB_PO))
00336        {
00337               if (uclass == UNICODE_LB_NU)
00338                      return RESULT(UNICODE_LB_NONE); /* LB25 */
00339 
00340               if (uclass == UNICODE_LB_OP || uclass == UNICODE_LB_HY)
00341               {
00342                      i->prevclass=prevclass;
00343                      i->prevclass_nsp=prevclass_nsp;
00344 
00345                      i->savedclass=uclass;
00346                      i->savedcmcnt=0;
00347                      i->next_handler=next_lb25_seenophy;
00348                      i->end_handler=end_lb25_seenophy;
00349                      return 0;
00350               }
00351        }
00352 
00353        if ((prevclass == UNICODE_LB_OP || prevclass == UNICODE_LB_HY) &&
00354            uclass == UNICODE_LB_NU)
00355               return RESULT(UNICODE_LB_NONE); /* LB25 */
00356 
00357        /*****/
00358 
00359        if (prevclass == UNICODE_LB_JL)
00360               switch (uclass) {
00361               case UNICODE_LB_JL:
00362               case UNICODE_LB_JV:
00363               case UNICODE_LB_H2:
00364               case UNICODE_LB_H3:
00365                      return RESULT(UNICODE_LB_NONE); /* LB26 */
00366               default:
00367                      break;
00368               }
00369 
00370        if ((prevclass == UNICODE_LB_JV ||
00371             prevclass == UNICODE_LB_H2) &&
00372            (uclass == UNICODE_LB_JV ||
00373             uclass == UNICODE_LB_JT))
00374               return RESULT(UNICODE_LB_NONE); /* LB26 */
00375 
00376        if ((prevclass == UNICODE_LB_JT ||
00377             prevclass == UNICODE_LB_H3) &&
00378            uclass == UNICODE_LB_JT)
00379               return RESULT(UNICODE_LB_NONE); /* LB26 */
00380 
00381 
00382        switch (prevclass) {
00383        case UNICODE_LB_JL:
00384        case UNICODE_LB_JV:
00385        case UNICODE_LB_JT:
00386        case UNICODE_LB_H2:
00387        case UNICODE_LB_H3:
00388               if (uclass == UNICODE_LB_IN || uclass == UNICODE_LB_PO)
00389                      return RESULT(UNICODE_LB_NONE); /* LB27 */
00390        default:
00391               break;
00392        }
00393 
00394        switch (uclass) {
00395        case UNICODE_LB_JL:
00396        case UNICODE_LB_JV:
00397        case UNICODE_LB_JT:
00398        case UNICODE_LB_H2:
00399        case UNICODE_LB_H3:
00400               if (prevclass == UNICODE_LB_PR)
00401                      return RESULT(UNICODE_LB_NONE); /* LB27 */
00402        default:
00403               break;
00404        }
00405 
00406        if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_AL)
00407               return RESULT(UNICODE_LB_NONE); /* LB28 */
00408 
00409        if (prevclass == UNICODE_LB_IS && uclass == UNICODE_LB_AL)
00410               return RESULT(UNICODE_LB_NONE); /* LB29 */
00411 
00412        if ((prevclass == UNICODE_LB_AL || prevclass == UNICODE_LB_NU) &&
00413            uclass == UNICODE_LB_OP)
00414               return RESULT(UNICODE_LB_NONE); /* LB30 */
00415 
00416        if ((uclass == UNICODE_LB_AL || uclass == UNICODE_LB_NU) &&
00417            prevclass == UNICODE_LB_CP)
00418               return RESULT(UNICODE_LB_NONE); /* LB30 */
00419 
00420        return RESULT(UNICODE_LB_ALLOWED); /* LB31 */
00421 }
00422 
00423 /*
00424 ** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
00425 ** character, but NU did not follow. Backtrack.
00426 */
00427 
00428 static int unwind_lb25_seenophy(unicode_lb_info_t i)
00429 {
00430        int rc;
00431 
00432        /*uint8_t class=i->savedclass;*/
00433        int nolb25_flag=1;
00434 
00435        i->next_handler=next_def;
00436        i->end_handler=end_def;
00437 
00438        do
00439        {
00440               rc=next_def_nolb25(i, i->savedclass, nolb25_flag);
00441 
00442               if (rc)
00443                      return rc;
00444 
00445               /*class=UNICODE_LB_CM;*/
00446               nolb25_flag=0;
00447        } while (i->savedcmcnt--);
00448        return 0;
00449 }
00450 
00451 /*
00452 ** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
00453 ** character. If there's now a NU, we found the modified LB25 regexp.
00454 */
00455 
00456 static int next_lb25_seenophy(unicode_lb_info_t i,
00457                            uint8_t uclass)
00458 {
00459        int rc;
00460 
00461        if (uclass == UNICODE_LB_CM)
00462        {
00463               ++i->savedcmcnt; /* Keep track of CMs, and try again */
00464               return 0;
00465        }
00466 
00467        if (uclass != UNICODE_LB_NU)
00468        {
00469               rc=unwind_lb25_seenophy(i);
00470 
00471               if (rc)
00472                      return rc;
00473 
00474               return next_def_nolb25(i, uclass, 0);
00475        }
00476 
00477        do
00478        {
00479               rc=RESULT(UNICODE_LB_NONE); /* (OP|HY) feedback */
00480 
00481               if (rc)
00482                      return rc;
00483        } while (i->savedcmcnt--);
00484 
00485        i->next_handler=next_lb25_seennu;
00486        i->end_handler=end_def;
00487        i->prevclass=i->prevclass_nsp=uclass;
00488        return RESULT(UNICODE_LB_NONE);
00489 }
00490 
00491 /*
00492 ** Seen (PR|PO)(OP|HY), and now The End. Unwind, and give up.
00493 */
00494 
00495 static int end_lb25_seenophy(unicode_lb_info_t i)
00496 {
00497        int rc=unwind_lb25_seenophy(i);
00498 
00499        if (rc == 0)
00500               rc=end_def(i);
00501        return rc;
00502 }
00503 
00504 /*
00505 ** Seen an NU, modified LB25 regexp.
00506 */
00507 static int next_lb25_seennu(unicode_lb_info_t i, uint8_t uclass)
00508 {
00509        if (uclass == UNICODE_LB_NU || uclass == UNICODE_LB_SY ||
00510            uclass == UNICODE_LB_IS)
00511        {
00512               i->prevclass=i->prevclass_nsp=uclass;
00513               return RESULT(UNICODE_LB_NONE);
00514        }
00515 
00516        if (uclass == UNICODE_LB_CM)
00517               return RESULT(UNICODE_LB_NONE); /* LB9 */
00518 
00519        if (uclass == UNICODE_LB_CL || uclass == UNICODE_LB_CP)
00520        {
00521               i->prevclass=i->prevclass_nsp=uclass;
00522               i->next_handler=next_lb25_seennuclcp;
00523               i->end_handler=end_def;
00524               return RESULT(UNICODE_LB_NONE);
00525        }
00526 
00527        i->next_handler=next_def;
00528        i->end_handler=end_def;
00529 
00530        if (uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO)
00531        {
00532               i->prevclass=i->prevclass_nsp=uclass;
00533               return RESULT(UNICODE_LB_NONE);
00534        }
00535 
00536        return next_def(i, uclass); /* Not a prefix, process normally */
00537 }
00538 
00539 /*
00540 ** Seen CL|CP, in the modified LB25 regexp.
00541 */
00542 static int next_lb25_seennuclcp(unicode_lb_info_t i, uint8_t uclass)
00543 {
00544        if (uclass == UNICODE_LB_CM)
00545               return RESULT(UNICODE_LB_NONE); /* LB9 */
00546 
00547        i->next_handler=next_def;
00548        i->end_handler=end_def;
00549 
00550        if (uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO)
00551        {
00552               i->prevclass=i->prevclass_nsp=uclass;
00553 
00554               return RESULT(UNICODE_LB_NONE);
00555        }
00556 
00557        return next_def(i, uclass);
00558 }
00559 
00560 /******************/
00561 
00562 struct unicode_lbc_info {
00563        unicode_lb_info_t handle;
00564 
00565        struct unicode_buf buf;
00566 
00567        size_t buf_ptr;
00568 
00569        int (*cb_func)(int, unicode_char, void *);
00570        void *cb_arg;
00571 };
00572 
00573 static int unicode_lbc_callback(int value, void *ptr)
00574 {
00575        unicode_lbc_info_t h=(unicode_lbc_info_t)ptr;
00576 
00577        if (h->buf_ptr >= unicode_buf_len(&h->buf))
00578        {
00579               errno=EINVAL;
00580               return -1; /* Shouldn't happen */
00581        }
00582 
00583        return (*h->cb_func)(value, unicode_buf_ptr(&h->buf)[h->buf_ptr++],
00584                           h->cb_arg);
00585 }
00586 
00587 unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, void *),
00588                                 void *cb_arg)
00589 {
00590        unicode_lbc_info_t h=
00591               (unicode_lbc_info_t)calloc(1, sizeof(struct unicode_lbc_info));
00592 
00593        if (!h)
00594               return NULL;
00595 
00596        h->cb_func=cb_func;
00597        h->cb_arg=cb_arg;
00598 
00599        if ((h->handle=unicode_lb_init(unicode_lbc_callback, h)) == NULL)
00600        {
00601               free(h);
00602               return NULL;
00603        }
00604        unicode_buf_init(&h->buf, (size_t)-1);
00605        return h;
00606 }
00607 
00608 void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts)
00609 {
00610        unicode_lb_set_opts(i->handle, opts);
00611 }
00612        
00613 int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch)
00614 {
00615        if (i->buf_ptr >= unicode_buf_len(&i->buf))
00616        {
00617               i->buf_ptr=0;
00618               unicode_buf_clear(&i->buf);
00619        }
00620 
00621        unicode_buf_append(&i->buf, &ch, 1);
00622        return unicode_lb_next(i->handle, ch);
00623 }
00624 
00625 int unicode_lbc_end(unicode_lbc_info_t i)
00626 {
00627        int rc=unicode_lb_end(i->handle);
00628 
00629        unicode_buf_deinit(&i->buf);
00630        free(i);
00631        return rc;
00632 }