Back to index

lightning-sunbird  0.9+nobinonly
rulebrk.c
Go to the documentation of this file.
00001 /* ***** BEGIN LICENSE BLOCK *****
00002  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00003  *
00004  * The contents of this file are subject to the Mozilla Public License Version
00005  * 1.1 (the "License"); you may not use this file except in compliance with
00006  * the License. You may obtain a copy of the License at
00007  * http://www.mozilla.org/MPL/
00008  *
00009  * Software distributed under the License is distributed on an "AS IS" basis,
00010  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00011  * for the specific language governing rights and limitations under the
00012  * License.
00013  *
00014  * The Original Code is LibInThai.
00015  *
00016  * The Initial Developer of the Original Code is
00017  * Samphan Raruenrom.
00018  * Portions created by the Initial Developer are Copyright (C) 1998
00019  * the Initial Developer. All Rights Reserved.
00020  *
00021  * Contributor(s):
00022  *
00023  * Alternatively, the contents of this file may be used under the terms of
00024  * either of the GNU General Public License Version 2 or later (the "GPL"),
00025  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00026  * in which case the provisions of the GPL or the LGPL are applicable instead
00027  * of those above. If you wish to allow use of your version of this file only
00028  * under the terms of either the GPL or the LGPL, and not to allow others to
00029  * use your version of this file under the terms of the MPL, indicate your
00030  * decision by deleting the provisions above and replace them with the notice
00031  * and other provisions required by the GPL or the LGPL. If you do not delete
00032  * the provisions above, a recipient may use your version of this file under
00033  * the terms of any one of the MPL, the GPL or the LGPL.
00034  *
00035  * ***** END LICENSE BLOCK ***** */
00036 #define TH_UNICODE
00037 
00038 #include <stdlib.h>
00039 #include <assert.h>
00040 #include "th_char.h"
/* ASCII-only character tests: Latin letters a-z / A-Z, and blank or tab. */
#define th_isalpha(c) \
    (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
#define th_isspace(c) \
    ((c) == ' ' || (c) == '\t')
00043 
00044 
/*
 * Thai character type array.
 *
 * Each entry is a mask of the class bits below.  The table is defined at
 * the end of this file and is read through twbtype().
 */
typedef unsigned short twb_t;
extern const twb_t _TwbType[0x100-0xa0];

/*
 * Class bits (one bit each).
 */

/* the three bits that make up VR */
#define VRS   0x0001
#define VRE   0x0002
#define VRX   0x0004

#define VRA   0x0008

/* the three bits that make up VL */
#define VLA   0x0010
#define VLO   0x0020
#define VLI   0x0040

#define VC    0x0080

/* the two bits that make up CX */
#define CC    0x0100
#define CS    0x0200

#define C2    0x0400
/* With two adjacent C characters, CHE on the left one or CHB on the right
   one makes TrbWordBreakPos() refuse the break. */
#define CHB   0x0800
#define CHE   0x1000

#define MT    0x2000
#define M     0x4000

/* not part of A: characters carrying only T are not handled by the rules */
#define T     0x8000

/*
 * Combined masks.
 */
#define VL    (VLA|VLO|VLI)
#define VR    (VRS|VRE|VRX)
#define NE    (VL|VRS)   /* no break allowed right after such a character  */
#define NB    (VR|M)     /* no break allowed right before such a character */
#define V     (VL|VR)
#define CX    (CC|CS)
#define C     (CX|VC)
#define A (C|V|M)        /* every class the break rules operate on */

/* class mask of character c, looked up via th_zcode() from th_char.h */
#define twbtype(c)   (_TwbType[th_zcode(c)])

#ifndef TRUE
#define TRUE 1
#define FALSE 0
#endif
#define RETURN(b) return (b)
00100 
00101 
00102 /*
00104 */
00105 
00106 int TrbWordBreakPos(const th_char *pstr, int left, 
00107                     const th_char *rstr, int right)
00108 /*                 const ThBreakIterator *it, const th_char **p)*/
00109 {
00110        /*
00111        //int left, right;
00112        //const th_char *s = *p;
00113        */
00114     const th_char *lstr = pstr + left;
00115        th_char _c[6];
00116        twb_t _t[6];
00117        #define c(i) (_c[(i)+3])
00118        #define t(i) (_t[(i)+3])
00119        int i, j;
00120 
00121        /*
00122        //left = s - it->begin; 
00123        */
00124        if(left < 0) return -1;
00125        /*
00126         //right = (it->end == NULL) ? 4 : it->begin - s;
00127        */
00128        if(right < 1) return -1;
00129 
00130         /*
00131        // get c(0), t(0)
00132         */
00133        c(0) = rstr[0]; /* may be '\0' */
00134     if(!th_isthai(c(0))) return -1;
00135        t(0) = twbtype(c(0));
00136        if(!(t(0) & A)) return -1;
00137 
00138         /*
00139        // get c(-1), t(-1)
00140         */
00141        if(left >= 1) { 
00142               c(-1) = lstr[-1]; 
00143               if(!th_isthai(c(-1))) return 0;
00144               t(-1) = twbtype(c(-1)); 
00145               if(!(t(-1) & A)) return 0;  /* handle punctuation marks here */
00146        } else { c(-1) = 0; t(-1) = 0; }
00147 
00148        /*
00149        // get c(1..2), t(1..2)
00150        */
00151        for(i = 1; i <= 2; i++) {
00152               if(i >= right) { c(i) = 0; t(i) = 0; }
00153               else {
00154                      c(i) = rstr[i]; /* may be '\0'; */
00155                      if(!th_isthai(c(i))) right = i--;
00156                      else {
00157                             t(i) = twbtype(c(i));
00158                             if(!(t(i) & A)) right = i--;
00159                      }
00160               }
00161        }
00162        /*
00163        // get c(-2..-3), t(-2..-3)
00164        */
00165        for(i = -2, j = -2; i >= -3 ; j--) {
00166               if(j < -left) { c(i) = 0; t(i) = 0; i--; }
00167               else {
00168                      c(i) = lstr[j]; 
00169                      if(!th_isthai(c(i))) left = 0;
00170                      else {
00171                             t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
00172                             if(!(t(i) & A)) left = 0;
00173                             else {
00174                                    if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) {
00175                                           c(i+1) = c(i); t(i+1) = t(i);
00176                                    } else i--;
00177                             }
00178                      }
00179               }
00180        }
00181 
00182        /*
00183        // prohibit the unlikely
00184        */
00185        if((t(-1) & C) && (t(0) & C)) {
00186          if((t(-1) & CHE) || (t(0) & CHB)) return -1;
00187        }
00188        /*
00189        // special case : vlao, C/ sara_a|aa, !sara_a
00190        */
00191        if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
00192               (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0;
00193 
00194        /*
00195        // prohibit break
00196        */
00197        if(t(0) & NB) return -1; 
00198        if(t(-1) & NE) return -1;
00199 
00200 
00201   /*
00202        // apply 100% rules
00203   */
00204        if(t(-1) & VRE) {
00205               if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
00206               return -1; /* usually too short syllable, part of word */
00207        }
00208 
00209        if(t(-2) & VRE) return -1;
00210 
00211        if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
00212               if((t(-1) & (VRS|VRX))  && c(1) == TH_SARA_I) return -1; /* exception */
00213               if(t(-1) & (V|M)) return 0; /* !C/ C, NB */
00214               if(t(-2) & VRS) return 0;   /* VRS, C / C, NB */
00215               if(!(t(0) & C2) && c(1) == TH_SARA_I) {   /*     / !C2 or /c, sara_i */
00216                      if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
00217                      if(t(-2) & VC) return 0;    /* VC, C / C, NB ? 100% */
00218               }
00219        }
00220        if((t(-1) & VRX) && (t(0) & CC)) return 0;                            /* VRX/ CC */
00221        if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */
00222 
00223        
00224        if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
00225               if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
00226               if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
00227        }
00228        /*
00229        // apply 90% rules
00230        */
00231        if(t(0) & VL) return 0;
00232        if(t(1) & VL) return -1;
00233        if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0;
00234 
00235        /*
00236        //return -1;
00237        // apply 80% rules
00238        */
00239        if(t(0) & CHE) {
00240               if((t(-2) & VRS) && (t(-1) & C)) return 0;       /* VRS, C/ CHE */
00241               /*if(t(-1) & VRX) return 0;                             // VRX/ CHE */
00242               if(t(-1) & VC) return 0;                                /* VC/ CHE */
00243        }
00244        if(t(-1) & CHB) {
00245               if((t(0) & C) && (t(1) & VR)) return 0;   /* CHB/ CC, VR */
00246               if(t(0) & VC) return 0;                                 /* CHB/ VC */
00247        }
00248        
00249        if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
00250               if(t(-2) & VLI) return 0;  /* VLI,C/C,VR .*/
00251               else { /* vlao, C ? C , VR */
00252                      if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
00253                      if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
00254                      if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */
00255               }
00256        }
00257        /* C,MT,C */ 
00258        if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
00259 
00260        return -1;
00261 }
00262 
00263 
00264 int TrbFollowing(const th_char *begin, int length, int offset)
00265 /*
00266 //(ThBreakIterator *this, int offset)
00267 */
00268 {
00269        const th_char *w = begin + offset;
00270     const th_char *end = begin + length;
00271        while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
00272 
00273        if(w < end && *w && !th_isthai(*w)) {
00274               int english = FALSE;
00275               while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
00276                      if(th_isalpha(*w)) english = TRUE;
00277                      w++; 
00278               }
00279               if(english || w == end || 
00280             (!th_isthai(*w) && th_isspace(*w))) return w - begin;
00281        } 
00282        if(w == end || *w == 0 || !th_isthai(*w)) return w - begin;
00283        w++;
00284        if(w < end && *w && th_isthai(*w)) {
00285               int brk = TrbWordBreakPos(begin, w-begin, w, end-w);
00286               while (brk < 0) {
00287                      w++;
00288                      if(w == end || *w == 0 || !th_isthai(*w)) break;
00289                      brk = TrbWordBreakPos(begin, w-begin, w, end-w);
00290               }
00291         if (brk > 0) w += brk;
00292        }
00293        if(w < end && *w && !th_isthai(*w)) {
00294               while(w < end && *w && !th_isthai(*w) && 
00295             !th_isalpha(*w) && !th_isspace(*w)) w++;
00296        }
00297        return w - begin;
00298 }
00299 
00300 
00301 /*
00303 */
00304 const twb_t  _TwbType[0x100-0xa0] = {
00305 #if 0
00306 /* 80  */    T,
00307 /* 81-8f */   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00308 /* 90  */    T,
00309 /* 91-9f */   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00310 #endif
00311 /* a0  */    0,
00312 /* a1  */    CS,
00313 /* a2  */    CS | CHE,
00314 /* a3  */    CC | CHE,
00315 /* a4  */    CS | CHE,
00316 /* a5  */    CC | CHE,
00317 /* a6  */    CS,
00318 /* a7  */    CS | CHB,
00319 /* a8  */    CS,
00320 /* a9  */    CC | CHE,
00321 /* aa  */    CS,
00322 /* ab  */    CC | CHE,
00323 /* ac  */    CC | CHB | CHE,
00324 /* ad  */    CS | CHB,
00325 /* ae  */    CS | CHB,
00326 /* af  */    CS | CHB,
00327 /* b0  */    CS,
00328 /* b1  */    CS | CHB | CHE,
00329 /* b2  */    CS | CHB | CHE,
00330 /* b3  */    CS | CHB,
00331 /* b4  */    CS,
00332 /* b5  */    CS,
00333 /* b6  */    CS,
00334 /* b7  */    CS,
00335 /* b8  */    CS,
00336 /* b9  */    CS,
00337 /* ba  */    CS,
00338 /* bb  */    CS,
00339 /* bc  */    CC | CHE,
00340 /* bd  */    CC | CHE,
00341 /* be  */    CS,
00342 /* bf  */    CS,
00343 /* c0  */    CS | CHE,
00344 /* c1  */    CS,
00345 /* c2  */    CS,
00346 /* c3  */    CS | C2 | CHE, /* ? add CHE  */
00347 /* c4  */    VC | CHE,
00348 /* c5  */    CS | C2,
00349 /* c6  */    VC | CHE,
00350 /* c7  */    VC | C2,
00351 /* c8  */    CS,
00352 /* c9  */    CS | CHB,
00353 /* ca  */    CS | CHE,
00354 /* cb  */    CC | CHE,
00355 /* CC  */    CS | CHB | CHE,
00356 /* cd  */    VC,
00357 /* ce  */    CC | CHE,
00358 /* cf  */    T,
00359 /* d0  */    VRE | VRA,
00360 /* d1   */   VRS,
00361 /* d2  */    VRX | VRA,
00362 /* d3   */   VRE,
00363 /* d4   */   VRX | VRA,
00364 /* d5   */   VRX | VRA,
00365 /* d6   */   VRS,
00366 /* d7   */   VRS | VRA,
00367 /* d8   */   VRX,
00368 /* d9   */   VRX,
00369 /* da   */   T,
00370 /* db  */ 0,
00371 /* dc  */ 0,
00372 /* dd  */ 0,
00373 /* de  */ 0,
00374 /* df  */    T,
00375 /* e0  */    VLA,
00376 /* e1  */    VLO,
00377 /* e2  */    VLO,
00378 /* e3  */    VLI,
00379 /* e4  */    VLI,
00380 /* e5  */    VRE,
00381 /* e6  */    M,
00382 /* e7   */   M,
00383 /* e8   */   M | MT,
00384 /* e9   */   M | MT,
00385 /* ea   */   M | MT,
00386 /* eb   */   M | MT,
00387 /* ec   */   M,
00388 /* ed   */   T,
00389 /* ee   */   T,
00390 /* ef  */    T,
00391 /* f0  */    T,
00392 /* f1  */    T,
00393 /* f2  */    T,
00394 /* f3  */    T,
00395 /* f4  */    T,
00396 /* f5  */    T,
00397 /* f6  */    T,
00398 /* f7  */    T,
00399 /* f8  */    T,
00400 /* f9  */    T,
00401 /* fa  */    T,
00402 /* fb  */    T,
00403 /* fc  */ 0,
00404 /* fd  */ 0,
00405 /* fe  */ 0,
00406 /* ff  */ 0
00407 };