Back to index

lightning-sunbird  0.9+nobinonly
nsJISx4501LineBreaker.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either of the GNU General Public License Version 2 or later (the "GPL"),
00026  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037 
00038 
00039 
00040 #include "nsJISx4501LineBreaker.h"
00041 
00042 
00043 
00044 #include "pratom.h"
00045 #include "nsLWBRKDll.h"
00046 #include "jisx4501class.h"
00047 #define TH_UNICODE
00048 #include "th_char.h"
00049 #include "rulebrk.h"
00050 #include "nsUnicharUtils.h"
00051 
00052 
00053 /* 
00054 
00055    Simplification of Pair Table in JIS X 4051
00056 
00057    1. The Original Table - in 4.1.3
00058 
00059    In JIS x 4051. The pair table is defined as below
00060 
00061    Class of
00062    Leading    Class of Trailing Char Class
00063    Char        
00064 
00065               1  2  3  4  5  6  7  8  9 10 11 12 13 13 14 14 15 16 17 18 19 20
00066                                                  *  #  *  #
00067         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  E
00068         2        X  X  X  X  X                                               X
00069         3        X  X  X  X  X                                               X
00070         4        X  X  X  X  X                                               X
00071         5        X  X  X  X  X                                               X
00072         6        X  X  X  X  X                                               X
00073         7        X  X  X  X  X  X                                            X 
00074         8        X  X  X  X  X                                X              E 
00075         9        X  X  X  X  X                                               X
00076        10        X  X  X  X  X                                               X
00077        11        X  X  X  X  X                                               X
00078        12        X  X  X  X  X                                               X  
00079        13        X  X  X  X  X                    X                          X
00080        14        X  X  X  X  X                          X                    X
00081        15        X  X  X  X  X        X                       X        X     X 
00082        16        X  X  X  X  X                                   X     X     X
00083        17        X  X  X  X  X                                               E 
00084        18        X  X  X  X  X                                X  X     X     X 
00085        19     X  E  E  E  E  E  X  X  X  X  X  X  X  X  X  X  X  X  E  X  E  E
00086        20        X  X  X  X  X                                               E
00087 
00088    * Same Char
00089    # Other Char
00090     
00091    X Cannot Break
00092 
00093    2. Simplified by removing the classes which we do not care about
00094 
00095    However, since we do not care about class 13(Subscript), 14(Ruby), 
00096    19(split line note begin quote), and 20(split line note end quote) 
00097    we can simplify this pair table into the following 
00098 
00099    Class of
00100    Leading    Class of Trailing Char Class
00101    Char        
00102 
00103               1  2  3  4  5  6  7  8  9 10 11 12 15 16 17 18 
00104                                                  
00105         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X
00106         2        X  X  X  X  X                             
00107         3        X  X  X  X  X                            
00108         4        X  X  X  X  X                           
00109         5        X  X  X  X  X                          
00110         6        X  X  X  X  X                         
00111         7        X  X  X  X  X  X                      
00112         8        X  X  X  X  X                    X    
00113         9        X  X  X  X  X                                   
00114        10        X  X  X  X  X                                  
00115        11        X  X  X  X  X                                 
00116        12        X  X  X  X  X                                
00117        15        X  X  X  X  X        X           X        X    
00118        16        X  X  X  X  X                       X     X    
00119        17        X  X  X  X  X                                  
00120        18        X  X  X  X  X                    X  X     X    
00121 
00122    3. Simplified by merged classes
00123 
00124    After the second simplification, the pair table has some duplication 
00125    a. class 2, 3, 4, 5, 6,  are the same- we can merged them
00126    b. class 10, 11, 12, 17  are the same- we can merged them
00127 
00128 
00129    Class of
00130    Leading    Class of Trailing Char Class
00131    Char        
00132 
00133               1 [a] 7  8  9 [b]15 16 18 
00134                                      
00135         1     X  X  X  X  X  X  X  X  X
00136       [a]        X                             
00137         7        X  X                      
00138         8        X              X    
00139         9        X                                   
00140       [b]        X                                  
00141        15        X        X     X     X    
00142        16        X                 X  X    
00143        18        X              X  X  X    
00144 
00145 
00146 
00147    4. We add THAI characters and make them breakable with all other classes
00148 
00149    Class of
00150    Leading    Class of Trailing Char Class
00151    Char        
00152 
00153               1 [a] 7  8  9 [b]15 16 18 THAI
00154                                      
00155         1     X  X  X  X  X  X  X  X  X
00156       [a]        X                             
00157         7        X  X                      
00158         8        X              X    
00159         9        X                                   
00160       [b]        X                                  
00161        15        X        X     X     X    
00162        16        X                 X  X    
00163        18        X              X  X  X    
00164      THAI                                T
00165       
00166      T : need special handling
00167 
00168    5. Now we use one bit to encode whether it is breakable, and use 2 bytes
00169       for one row, then the bit table will look like:
00170 
00171                  18    <-   1
00172             
00173        1  0000 0001 1111 1111  = 0x01FF
00174       [a] 0000 0000 0000 0010  = 0x0002
00175        7  0000 0000 0000 0110  = 0x0006
00176        8  0000 0000 0100 0010  = 0x0042
00177        9  0000 0000 0000 0010  = 0x0002
00178       [b] 0000 0000 0000 0010  = 0x0002
00179       15  0000 0001 0101 0010  = 0x0152
00180       16  0000 0001 1000 0010  = 0x0182
00181       18  0000 0001 1100 0010  = 0x01C2
00182     THAI  0000 0000 0000 0000  = 0x0000
00183 
00184    6. Now we map the class to number
00185       
00186       0: 1 
00187       1: [a]- 2, 3, 4, 5, 6
00188       2: 7
00189       3: 8
00190       4: 9
00191       5: [b]- 10, 11, 12, 17
00192       6: 15
00193       7: 16
00194       8: 18
00195       9: THAI
00196 
00197 */
00198 
00199 #define MAX_CLASSES 10
00200 
00201 static const PRUint16 gPair[MAX_CLASSES] = {
00202   0x01FF, 
00203   0x0002, 
00204   0x0006, 
00205   0x0042, 
00206   0x0002, 
00207   0x0002, 
00208   0x0152, 
00209   0x0182, 
00210   0x01C2,
00211   0x0000
00212 };
00213 
00214 
00215 static inline int
00216 GETCLASSFROMTABLE(const PRUint32* t, PRUint16 l)
00217 {
00218   return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f);
00219 }
00220 
00221 #define CLASS_THAI 9
00222 
00223 
00224 
00225 static inline int
00226 IS_HALFWIDTH_IN_JISx4051_CLASS3(PRUnichar u)
00227 {
00228   return ((0xff66 <= (u)) && ((u) <= 0xff70));
00229 }
00230 
00231 static inline int
00232 IS_CJK_CHAR(PRUnichar u)
00233 {
00234   return ((0x1100 <= (u) && (u) <= 0x11ff) ||
00235           (0x2e80 <= (u) && (u) <= 0xd7ff) ||
00236           (0xf900 <= (u) && (u) <= 0xfaff) ||
00237           (0xff00 <= (u) && (u) <= 0xffef) );
00238 }
00239 
00240 static inline int
00241 IS_SPACE(PRUnichar u)
00242 {
00243   return ((u) == 0x0020 || (u) == 0x0009 || (u) == 0x000a || (u) == 0x000d || (u)==0x200b);
00244 }
00245 
00246 PRInt8 nsJISx4051LineBreaker::GetClass(PRUnichar u)
00247 {
00248    PRUint16 h = u & 0xFF00;
00249    PRUint16 l = u & 0x00ff;
00250    PRInt8 c;
00251    
00252    // Handle 3 range table first
00253    if( 0x0000 == h)
00254    {
00255      c = GETCLASSFROMTABLE(gLBClass00, l);
00256    } 
00257    else if(th_isthai(u))
00258    {
00259      c = CLASS_THAI;
00260    }
00261    else if( 0x2000 == h)
00262    {
00263      c = GETCLASSFROMTABLE(gLBClass20, l);
00264    } 
00265    else if( 0x2100 == h)
00266    {
00267      c = GETCLASSFROMTABLE(gLBClass21, l);
00268    } 
00269    else if( 0x3000 == h)
00270    {
00271      c = GETCLASSFROMTABLE(gLBClass30, l);
00272    } 
00273    else if (  ( ( 0x3200 <= u) && ( u <= 0xA4CF) ) || // CJK and Yi 
00274               ( ( 0xAC00 <= h) && ( h <= 0xD7FF) ) || // Hangul
00275               ( ( 0xf900 <= h) && ( h <= 0xfaff) )
00276              )
00277    { 
00278      c = 5; // CJK charcter, Han, and Han Compatability
00279    } 
00280    else if( 0xff00 == h)
00281    {
00282      if( l < 0x0060) // Fullwidth ASCII variant 
00283      {
00284        c = GETCLASSFROMTABLE(gLBClass00, (l+0x20));
00285      } else if (l < 0x00a0) {
00286        switch (l)
00287        {
00288          case 0x61: c = GetClass(0x3002); break;
00289          case 0x62: c = GetClass(0x300c); break;
00290          case 0x63: c = GetClass(0x300d); break;
00291          case 0x64: c = GetClass(0x3001); break;
00292          case 0x65: c = GetClass(0x30fb); break;
00293          case 0x9e: c = GetClass(0x309b); break;
00294          case 0x9f: c = GetClass(0x309c); break;
00295          default:
00296            if(IS_HALFWIDTH_IN_JISx4051_CLASS3(u))
00297               c = 1; // jis x4051 class 3
00298            else
00299               c = 5; // jis x4051 class 11
00300            break;
00301        }
00302        // Halfwidth Katakana variants
00303      } else if( l < 0x00e0) {
00304        c = 8; // Halfwidth Hangul variants 
00305      } else if( l < 0x00f0) {
00306        static PRUnichar NarrowFFEx[16] = 
00307        { 
00308          0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
00309          0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
00310        };
00311        c = GetClass(NarrowFFEx[l - 0x00e0]);
00312      } else {
00313        c = 8;
00314      }
00315    }
00316    else if( 0x3100 == h) { 
00317      if ( l <= 0xbf) {  // Hangul Compatibility Jamo, Bopomofo, Kanbun
00318                         // XXX: This is per UAX #14, but UAX #14 may change
00319                         // the line breaking rules about Kanbun and Bopomofo.
00320        c = 5;
00321      }
00322      else if ( l >= 0xf0)
00323      {            // Katakana small letters for Ainu 
00324        c = 1;
00325      }
00326      else   // unassigned
00327      {
00328        c = 8;
00329      }
00330    } 
00331    else {
00332      c = 8; // others 
00333    }
00334    return c;
00335 }
00336 
00337 PRBool nsJISx4051LineBreaker::GetPair(PRInt8 c1, PRInt8 c2)
00338 {
00339   NS_ASSERTION( c1 < MAX_CLASSES ,"illegal classes 1");
00340   NS_ASSERTION( c2 < MAX_CLASSES ,"illegal classes 2");
00341 
00342   return (0 == ((gPair[c1] >> c2 ) & 0x0001));
00343 }
00344 
00345 
00346 nsJISx4051LineBreaker::nsJISx4051LineBreaker(
00347    const PRUnichar* aNoBegin, PRInt32 aNoBeginLen,
00348    const PRUnichar* aNoEnd, PRInt32 aNoEndLen
00349 )
00350 {
00351 }
00352 
00353 nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
00354 {
00355 }
00356 
00357 NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker)
00358 
// Characters whose class depends on their neighbours.
#define U_PERIOD ((PRUnichar) '.')
#define U_COMMA ((PRUnichar) ',')
#define U_SPACE ((PRUnichar) ' ')
#define U_RIGHT_SINGLE_QUOTATION_MARK ((PRUnichar) 0x2019)

// True for the characters that must be classified with ContextualAnalysis()
// rather than with GetClass() alone.
#define NEED_CONTEXTUAL_ANALYSIS(c) ((c) == U_PERIOD || \
                                     (c) == U_COMMA || \
                                     (c) == U_RIGHT_SINGLE_QUOTATION_MARK)

// Simplified classes handed out by ContextualAnalysis().
#define NUMERIC_CLASS  6 // JIS x4051 class 15 is now map to simplified class 6
#define CHARACTER_CLASS  8 // JIS x4051 class 18 is now map to simplified class 8

// True for the ASCII digits '0' (0x0030) through '9' (0x0039).
#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)
00369 
00370 PRInt8  nsJISx4051LineBreaker::ContextualAnalysis(
00371   PRUnichar prev, PRUnichar cur, PRUnichar next
00372 )
00373 {
00374    if(U_COMMA == cur)
00375    {
00376      if(IS_ASCII_DIGIT (prev) && IS_ASCII_DIGIT (next))
00377        return NUMERIC_CLASS;
00378    }
00379    else if(U_PERIOD == cur)
00380    {
00381      if((IS_ASCII_DIGIT (prev) || (0x0020 == prev)) && 
00382          IS_ASCII_DIGIT (next))
00383        return NUMERIC_CLASS;
00384  
00385      // By assigning a full stop  character class only when it's followed by
00386      // class 6 (numeric), 7, and 8 (character). Note that class 9 (Thai) 
00387      // doesn't matter, either way, we prevent lines from breaking around 
00388      // full stop in those cases while  still allowing it to end a line when 
00389      // followed by CJK  characters. With an additional condition of it being 
00390      // preceded by  class 0 or class > 5, we make sure that it does not 
00391      // start a line  (see bug 164759). 
00392      PRUint8 pc = GetClass(prev);
00393      if((pc > 5 || pc == 0)  && GetClass(next) > 5)
00394        return CHARACTER_CLASS;
00395    }
00396    else if(U_RIGHT_SINGLE_QUOTATION_MARK == cur)
00397    {
00398      // somehow people use this as ' in "it's" sometimes...
00399      if(U_SPACE != next)
00400        return CHARACTER_CLASS;
00401    }
00402    return this->GetClass(cur);
00403 }
00404 
00405 
00406 NS_IMETHODIMP nsJISx4051LineBreaker::BreakInBetween(
00407   const PRUnichar* aText1 , PRUint32 aTextLen1,
00408   const PRUnichar* aText2 , PRUint32 aTextLen2,
00409   PRBool *oCanBreak)
00410 {
00411   NS_ENSURE_TRUE(aText1, NS_ERROR_NULL_POINTER);
00412   NS_ENSURE_TRUE(aText2, NS_ERROR_NULL_POINTER);
00413 
00414   if((0 == aTextLen1) || (0==aTextLen2) ||
00415      IS_HIGH_SURROGATE(aText1[aTextLen1-1]) && 
00416      IS_LOW_SURROGATE(aText2[0]) )  //Do not separate a surrogate pair
00417   {
00418      *oCanBreak = PR_FALSE;
00419      return NS_OK;
00420   }
00421 
00422   //search for CJK characters until a space is found. 
00423   //if CJK char is found before space, use 4051, otherwise western
00424   PRInt32 cur;
00425 
00426   for (cur= aTextLen1-1; cur>=0; cur--)
00427   {
00428     if (IS_SPACE(aText1[cur]))
00429       break;
00430     if (IS_CJK_CHAR(aText1[cur]))
00431       goto ROUTE_CJK_BETWEEN;
00432   }
00433 
00434   for (cur= 0; cur < (PRInt32)aTextLen2; cur++)
00435   {
00436     if (IS_SPACE(aText2[cur]))
00437       break;
00438     if (IS_CJK_CHAR(aText2[cur]))
00439       goto ROUTE_CJK_BETWEEN;
00440   }
00441 
00442   //now apply western rule.
00443   *oCanBreak = IS_SPACE(aText1[aTextLen1-1]) || IS_SPACE(aText2[0]);
00444   return NS_OK;
00445 
00446 ROUTE_CJK_BETWEEN:
00447 
00448   PRInt8 c1, c2;
00449   if(NEED_CONTEXTUAL_ANALYSIS(aText1[aTextLen1-1]))
00450     c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:0,
00451                                   aText1[aTextLen1-1],
00452                                   aText2[0]);
00453   else 
00454     c1 = this->GetClass(aText1[aTextLen1-1]);
00455 
00456   if(NEED_CONTEXTUAL_ANALYSIS(aText2[0]))
00457     c2 = this->ContextualAnalysis(aText1[aTextLen1-1],
00458                                   aText2[0],
00459                                   (aTextLen2>1)?aText2[1]:0);
00460   else 
00461     c2 = this->GetClass(aText2[0]);
00462 
00463   /* Handle cases for THAI */
00464   if((CLASS_THAI == c1) && (CLASS_THAI == c2))
00465   {
00466      *oCanBreak = (0 == TrbWordBreakPos(aText1, aTextLen1, aText2, aTextLen2));
00467   }
00468   else 
00469   {
00470      *oCanBreak = GetPair(c1,c2);
00471   }
00472   return NS_OK;
00473 }
00474 
00475 
00476 NS_IMETHODIMP nsJISx4051LineBreaker::Next( 
00477   const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos,
00478   PRUint32* oNext, PRBool *oNeedMoreText) 
00479 {
00480   NS_ENSURE_TRUE(aText, NS_ERROR_NULL_POINTER);
00481   NS_ENSURE_TRUE(oNext, NS_ERROR_NULL_POINTER);
00482   NS_ENSURE_TRUE(oNeedMoreText, NS_ERROR_NULL_POINTER);
00483   NS_ENSURE_TRUE(aPos <= aLen, NS_ERROR_ILLEGAL_VALUE);
00484 
00485   //forward check for CJK characters until a space is found. 
00486   //if CJK char is found before space, use 4051, otherwise western
00487   PRUint32 cur;
00488   for (cur = aPos; cur < aLen; ++cur)
00489   {
00490     if (IS_SPACE(aText[cur]))
00491     {
00492       *oNext = cur;
00493       *oNeedMoreText = PR_FALSE;
00494       return NS_OK;
00495     }
00496     if (IS_CJK_CHAR(aText[cur]))
00497       goto ROUTE_CJK_NEXT;
00498   }
00499   *oNext = aLen;
00500   *oNeedMoreText = PR_TRUE;
00501   return NS_OK;
00502 
00503 ROUTE_CJK_NEXT:
00504   PRInt8 c1, c2;
00505   cur = aPos;
00506   if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
00507   {
00508     c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
00509                                   aText[cur],
00510                                   (cur<(aLen-1)) ?aText[cur+1]:0);
00511   } else  {
00512     c1 = this->GetClass(aText[cur]);
00513   }
00514   
00515   if(CLASS_THAI == c1) 
00516   {
00517      *oNext = PRUint32(TrbFollowing(aText, aLen, aPos));
00518      *oNeedMoreText = PR_FALSE;
00519      return NS_OK;
00520   }
00521 
00522   for(cur++; cur <aLen; cur++)
00523   {
00524      if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
00525      {
00526        c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
00527                                   aText[cur],
00528                                   (cur<(aLen-1)) ?aText[cur+1]:0);
00529      } else {
00530        c2 = this->GetClass(aText[cur]);
00531      }
00532 
00533      if(GetPair(c1, c2)) {
00534        *oNext = cur ;
00535        *oNeedMoreText = PR_FALSE;
00536        return NS_OK;
00537      }
00538      c1 = c2;
00539   }
00540   *oNext = aLen;
00541   *oNeedMoreText = PR_TRUE;
00542   return NS_OK;
00543 }
00544 
00545 NS_IMETHODIMP nsJISx4051LineBreaker::Prev( 
00546   const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos,
00547   PRUint32* oPrev, PRBool *oNeedMoreText) 
00548 {
00549   NS_ENSURE_TRUE(aText, NS_ERROR_NULL_POINTER);
00550   NS_ENSURE_TRUE(oPrev, NS_ERROR_NULL_POINTER);
00551   NS_ENSURE_TRUE(oNeedMoreText, NS_ERROR_NULL_POINTER);
00552 
00553   //backward check for CJK characters until a space is found. 
00554   //if CJK char is found before space, use 4051, otherwise western
00555   PRUint32 cur;
00556   for (cur = aPos - 1; cur > 0; --cur)
00557   {
00558     if (IS_SPACE(aText[cur]))
00559     {
00560       if (cur != aPos - 1) // XXXldb Why?
00561         ++cur;
00562       *oPrev = cur;
00563       *oNeedMoreText = PR_FALSE;
00564       return NS_OK;
00565     }
00566     if (IS_CJK_CHAR(aText[cur]))
00567       goto ROUTE_CJK_PREV;
00568   }
00569 
00570   *oPrev = 0;
00571   *oNeedMoreText = PR_TRUE;
00572   return NS_OK;
00573 
00574 ROUTE_CJK_PREV:
00575   cur = aPos;
00576   PRInt8 c1, c2;
00577   if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
00578   {
00579     c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
00580                                   aText[cur-1],
00581                                   (cur<aLen) ?aText[cur]:0);
00582   } else  {
00583     c2 = this->GetClass(aText[cur-1]);
00584   }
00585   // To Do: 
00586   //
00587   // Should handle CLASS_THAI here
00588   //
00589   for(cur--; cur > 0; cur--)
00590   {
00591      if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
00592      {
00593        c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
00594                                   aText[cur-1],
00595                                   (cur<aLen) ?aText[cur]:0);
00596      } else {
00597        c1 = this->GetClass(aText[cur-1]);
00598      }
00599 
00600      if(GetPair(c1, c2)) {
00601        *oPrev = cur;
00602        *oNeedMoreText = PR_FALSE;
00603        return NS_OK;
00604      }
00605      c2 = c1;
00606   }
00607   *oPrev = 0;
00608   *oNeedMoreText = PR_TRUE;
00609   return NS_OK;
00610 }
00611