Back to index

lightning-sunbird  0.9+nobinonly
nsUnicodeRange.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either of the GNU General Public License Version 2 or later (the "GPL"),
00026  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037 
00038 #include "nsUnicodeRange.h"
00039 
00040 // This table depends on the unicode range definitions: it is indexed
00041 // directly by unicode range value, so each item's index must match that value,
00042 // e.g. "x-cyrillic" == gUnicodeRangeToLangGroupTable[kRangeCyrillic]
00043 const char *gUnicodeRangeToLangGroupTable[] = 
00044 {
00045   "x-cyrillic",    // cyrillic (index kRangeCyrillic)
00046   "el",            // greek
00047   "tr",            // grouped under latin in the subrange notes in this file
00048   "he",            // hebrew
00049   "ar",            // arabic
00050   "x-baltic",      // grouped under latin in the subrange notes in this file
00051   "th",            // thai
00052   "ko",            // korean
00053   "ja",
00054   "zh-CN",
00055   "zh-TW",
00056   "x-devanagari",
00057   "x-tamil",
00058   "x-armn",
00059   "x-beng",
00060   "x-cans",
00061   "x-ethi",
00062   "x-geor",
00063   "x-gujr",
00064   "x-guru",
00065   "x-khmr",
00066   "x-mlym"
00067 };  // 22 entries; the array carries no bounds check, so callers must not index past the last one
00068 
00069 /**********************************************************************
00070  * Unicode subranges as defined in unicode 3.0
00071  * x-western, x-central-euro, tr, x-baltic  -> latin 
00072  *  0000 - 036f 
00073  *  1e00 - 1eff
00074  *  2000 - 206f  (general punctuation)
00075  *  20a0 - 20cf  (currency symbols)
00076  *  2100 - 214f  (letterlike symbols)
00077  *  2150 - 218f  (Number Forms)
00078  * el         -> greek
00079  *  0370 - 03ff
00080  *  1f00 - 1fff
00081  * x-cyrillic -> cyrillic
00082  *  0400 - 04ff
00083  * he         -> hebrew
00084  *  0590 - 05ff
00085  * ar         -> arabic
00086  *  0600 - 06ff
00087  *  fb50 - fdff (arabic presentation forms)
00088  *  fe70 - feff (arabic presentation forms b)
00089  * th - thai
00090  *  0e00 - 0e7f
00091  * ko        -> korean
00092  *  ac00 - d7af  (hangul Syllables)
00093  *  1100 - 11ff    (jamo)
00094  *  3130 - 318f (hangul compatibility jamo)
00095  * ja
00096  *  3040 - 309f (hiragana)
00097  *  30a0 - 30ff (katakana)
00098  * zh-CN
00099  * zh-TW
00100  *
00101  * CJK
00102  *  3100 - 312f (bopomofo)
00103  *  31a0 - 31bf (bopomofo extended)
00104  *  3000 - 303f (CJK Symbols and Punctuation) 
00105  *  2e80 - 2eff (CJK radicals supplement)
00106  *  2f00 - 2fdf (Kangxi Radicals)
00107  *  2ff0 - 2fff (Ideographic Description Characters)
00108  *  3190 - 319f (kanbun)
00109  *  3200 - 32ff (Enclosed CJK letters and Months)
00110  *  3300 - 33ff (CJK compatibility)
00111  *  3400 - 4dbf (CJK Unified Ideographs Extension A)
00112  *  4e00 - 9faf (CJK Unified Ideographs)
00113  *  f900 - fa5f (CJK Compatibility Ideographs)
00114  *  fe30 - fe4f (CJK compatibility Forms)
00115  *  ff00 - ffef (halfwidth and fullwidth forms)
00116  *
00117  * Armenian
00118  *  0530 - 058f 
00119  * Syriac 
00120  *  0700 - 074f
00121  * Thaana
00122  *  0780 - 07bf
00123  * Devanagari
00124  *  0900 - 097f
00125  * Bengali
00126  *  0980 - 09ff
00127  * Gurmukhi
00128  *  0a00 - 0a7f
00129  * Gujarati
00130  *  0a80 - 0aff
00131  * Oriya
00132  *  0b00 - 0b7f
00133  * Tamil
00134  *  0b80 - 0bff
00135  * Telugu
00136  *  0c00 - 0c7f
00137  * Kannada
00138  *  0c80 - 0cff
00139  * Malayalam
00140  *  0d00 - 0d7f
00141  * Sinhala
00142  *  0d80 - 0def
00143  * Lao
00144  *  0e80 - 0eff
00145  * Tibetan
00146  *  0f00 - 0fbf
00147  * Myanmar
00148  *  1000 - 109f
00149  * Georgian
00150  *  10a0 - 10ff
00151  * Ethiopic
00152  *  1200 - 137f
00153  * Cherokee
00154  *  13a0 - 13ff
00155  * Canadian Aboriginal Syllabics
00156  *  1400 - 167f
00157  * Ogham
00158  *  1680 - 169f
00159  * Runic 
00160  *  16a0 - 16ff
00161  * Khmer
00162  *  1780 - 17ff
00163  * Mongolian
00164  *  1800 - 18af
00165  * Misc - superscripts and subscripts
00166  *  2070 - 209f
00167  * Misc - Combining Diacritical Marks for Symbols
00168  *  20d0 - 20ff
00169  * Misc - Arrows
00170  *  2190 - 21ff
00171  * Misc - Mathematical Operators
00172  *  2200 - 22ff
00173  * Misc - Miscellaneous Technical
00174  *  2300 - 23ff
00175  * Misc - Control picture
00176  *  2400 - 243f
00177  * Misc - Optical character recognition
00178  *  2440 - 245f
00179  * Misc - Enclosed Alphanumerics
00180  *  2460 - 24ff
00181  * Misc - Box Drawing 
00182  *  2500 - 257f
00183  * Misc - Block Elements
00184  *  2580 - 259f
00185  * Misc - Geometric Shapes
00186  *  25a0 - 25ff
00187  * Misc - Miscellaneous Symbols
00188  *  2600 - 267f
00189  * Misc - Dingbats
00190  *  2700 - 27bf
00191  * Misc - Braille Patterns
00192  *  2800 - 28ff
00193  * Yi Syllables
00194  *  a000 - a48f
00195  * Yi radicals
00196  *  a490 - a4cf
00197  * Alphabetic Presentation Forms
00198  *  fb00 - fb4f
00199  * Misc - Combining half Marks
00200  *  fe20 - fe2f
00201  * Misc - small form variants
00202  *  fe50 - fe6f
00203  * Misc - Specials
00204  *  fff0 - ffff
00205  *********************************************************************/
00206 // Lookup subtables used by FindCharUnicodeRange. Each subtable is indexed by
00207 // one hex digit of the character. An entry is either a final range value,
00208 // kRangeTableBase+n (continue in subtable n) or kRangeTertiaryTable (use gUnicodeTertiaryRangeTable).
00209 #define NUM_OF_SUBTABLES      9   // subtable 0 (first digit) plus subtables 1-8 reached via kRangeTableBase+n
00210 #define SUBTABLE_SIZE         16  // one entry per hex digit value
00211 
00212 static PRUint8 gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] = 
00213 { 
00214   { // table for X--- (subtable 0, indexed by the first hex digit)
00215     kRangeTableBase+1,  //u0xxx
00216     kRangeTableBase+2,  //u1xxx
00217     kRangeTableBase+3,  //u2xxx
00218     kRangeSetCJK,       //u3xxx
00219     kRangeSetCJK,       //u4xxx
00220     kRangeSetCJK,       //u5xxx
00221     kRangeSetCJK,       //u6xxx
00222     kRangeSetCJK,       //u7xxx
00223     kRangeSetCJK,       //u8xxx
00224     kRangeSetCJK,       //u9xxx
00225     kRangeTableBase+4,  //uaxxx
00226     kRangeKorean,       //ubxxx
00227     kRangeKorean,       //ucxxx
00228     kRangeTableBase+5,  //udxxx
00229     kRangePrivate,      //uexxx
00230     kRangeTableBase+6   //ufxxx
00231   },
00232   { //table for 0x-- (subtable 1 = kRangeTableBase+1, indexed by the second hex digit)
00233     kRangeSetLatin,          //u00xx
00234     kRangeSetLatin,          //u01xx
00235     kRangeSetLatin,          //u02xx
00236     kRangeGreek,             //u03xx     XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks
00237     kRangeCyrillic,          //u04xx
00238     kRangeTableBase+7,       //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
00239     kRangeArabic,            //u06xx
00240     kRangeTertiaryTable,     //u07xx
00241     kRangeUnassigned,        //u08xx
00242     kRangeTertiaryTable,     //u09xx
00243     kRangeTertiaryTable,     //u0axx
00244     kRangeTertiaryTable,     //u0bxx
00245     kRangeTertiaryTable,     //u0cxx
00246     kRangeTertiaryTable,     //u0dxx
00247     kRangeTertiaryTable,     //u0exx
00248     kRangeTibetan,           //u0fxx
00249   },
00250   { //table for 1x-- (subtable 2 = kRangeTableBase+2)
00251     kRangeTertiaryTable,     //u10xx
00252     kRangeKorean,            //u11xx
00253     kRangeEthiopic,          //u12xx
00254     kRangeTertiaryTable,     //u13xx
00255     kRangeCanadian,          //u14xx
00256     kRangeCanadian,          //u15xx
00257     kRangeTertiaryTable,     //u16xx
00258     kRangeKhmer,             //u17xx
00259     kRangeMongolian,         //u18xx
00260     kRangeUnassigned,        //u19xx
00261     kRangeUnassigned,        //u1axx
00262     kRangeUnassigned,        //u1bxx
00263     kRangeUnassigned,        //u1cxx
00264     kRangeUnassigned,        //u1dxx
00265     kRangeSetLatin,          //u1exx
00266     kRangeGreek,             //u1fxx
00267   },
00268   { //table for 2x-- (subtable 3 = kRangeTableBase+3)
00269     kRangeSetLatin,          //u20xx
00270     kRangeSetLatin,          //u21xx
00271     kRangeMathOperators,     //u22xx
00272     kRangeMiscTechnical,     //u23xx
00273     kRangeControlOpticalEnclose, //u24xx
00274     kRangeBoxBlockGeometrics, //u25xx
00275     kRangeMiscSymbols,       //u26xx
00276     kRangeDingbats,          //u27xx
00277     kRangeBraillePattern,    //u28xx
00278     kRangeUnassigned,        //u29xx
00279     kRangeUnassigned,        //u2axx
00280     kRangeUnassigned,        //u2bxx
00281     kRangeUnassigned,        //u2cxx
00282     kRangeUnassigned,        //u2dxx
00283     kRangeSetCJK,            //u2exx
00284     kRangeSetCJK,            //u2fxx
00285   },
00286   {  //table for ax-- (subtable 4 = kRangeTableBase+4)
00287     kRangeYi,                //ua0xx
00288     kRangeYi,                //ua1xx
00289     kRangeYi,                //ua2xx
00290     kRangeYi,                //ua3xx
00291     kRangeYi,                //ua4xx
00292     kRangeUnassigned,        //ua5xx
00293     kRangeUnassigned,        //ua6xx
00294     kRangeUnassigned,        //ua7xx
00295     kRangeUnassigned,        //ua8xx
00296     kRangeUnassigned,        //ua9xx
00297     kRangeUnassigned,        //uaaxx
00298     kRangeUnassigned,        //uabxx
00299     kRangeKorean,            //uacxx
00300     kRangeKorean,            //uadxx
00301     kRangeKorean,            //uaexx
00302     kRangeKorean,            //uafxx
00303   },
00304   {  //table for dx-- (subtable 5 = kRangeTableBase+5)
00305     kRangeKorean,            //ud0xx
00306     kRangeKorean,            //ud1xx
00307     kRangeKorean,            //ud2xx
00308     kRangeKorean,            //ud3xx
00309     kRangeKorean,            //ud4xx
00310     kRangeKorean,            //ud5xx
00311     kRangeKorean,            //ud6xx
00312     kRangeKorean,            //ud7xx
00313     kRangeSurrogate,         //ud8xx
00314     kRangeSurrogate,         //ud9xx
00315     kRangeSurrogate,         //udaxx
00316     kRangeSurrogate,         //udbxx
00317     kRangeSurrogate,         //udcxx
00318     kRangeSurrogate,         //uddxx
00319     kRangeSurrogate,         //udexx
00320     kRangeSurrogate,         //udfxx
00321   },
00322   { // table for fx-- (subtable 6 = kRangeTableBase+6)
00323     kRangePrivate,           //uf0xx 
00324     kRangePrivate,           //uf1xx 
00325     kRangePrivate,           //uf2xx 
00326     kRangePrivate,           //uf3xx 
00327     kRangePrivate,           //uf4xx 
00328     kRangePrivate,           //uf5xx 
00329     kRangePrivate,           //uf6xx 
00330     kRangePrivate,           //uf7xx 
00331     kRangePrivate,           //uf8xx 
00332     kRangeSetCJK,            //uf9xx 
00333     kRangeSetCJK,            //ufaxx 
00334     kRangeArabic,            //ufbxx, includes alphabetic presentation forms
00335     kRangeArabic,            //ufcxx
00336     kRangeArabic,            //ufdxx
00337     kRangeArabic,            //ufexx, includes Combining half marks, 
00338                              //                CJK compatibility forms, 
00339                              //                small form variants,
00340                              //                arabic presentation forms b
00341     kRangeTableBase+8,       //uffxx, halfwidth and fullwidth forms, includes Specials
00342   },
00343   { //table for 0x0500 - 0x05ff (subtable 7 = kRangeTableBase+7, indexed by the third hex digit)
00344     kRangeCyrillic,          //u050x
00345     kRangeCyrillic,          //u051x
00346     kRangeCyrillic,          //u052x
00347     kRangeArmenian,          //u053x
00348     kRangeArmenian,          //u054x
00349     kRangeArmenian,          //u055x
00350     kRangeArmenian,          //u056x
00351     kRangeArmenian,          //u057x
00352     kRangeArmenian,          //u058x
00353     kRangeHebrew,            //u059x
00354     kRangeHebrew,            //u05ax
00355     kRangeHebrew,            //u05bx
00356     kRangeHebrew,            //u05cx
00357     kRangeHebrew,            //u05dx
00358     kRangeHebrew,            //u05ex
00359     kRangeHebrew,            //u05fx
00360   },
00361   { //table for 0xff00 - 0xffff (subtable 8 = kRangeTableBase+8, indexed by the third hex digit)
00362     kRangeSetCJK,            //uff0x, fullwidth latin
00363     kRangeSetCJK,            //uff1x, fullwidth latin
00364     kRangeSetCJK,            //uff2x, fullwidth latin
00365     kRangeSetCJK,            //uff3x, fullwidth latin
00366     kRangeSetCJK,            //uff4x, fullwidth latin
00367     kRangeSetCJK,            //uff5x, fullwidth latin
00368     kRangeSetCJK,            //uff6x, halfwidth katakana
00369     kRangeSetCJK,            //uff7x, halfwidth katakana
00370     kRangeSetCJK,            //uff8x, halfwidth katakana
00371     kRangeSetCJK,            //uff9x, halfwidth katakana
00372     kRangeSetCJK,            //uffax, halfwidth hangul jamo
00373     kRangeSetCJK,            //uffbx, halfwidth hangul jamo
00374     kRangeSetCJK,            //uffcx, halfwidth hangul jamo
00375     kRangeSetCJK,            //uffdx, halfwidth hangul jamo
00376     kRangeSetCJK,            //uffex, fullwidth symbols
00377     kRangeSpecials,          //ufffx, Specials
00378   },
00379 };
00380 
00381 // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80) 
00382 // code points, so the number of entries in the tertiary range table for
00383 // that range is obtained by dividing (0x1700 - 0x0700) by 128.
00384 // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian Aboriginal
00385 // Syllabics take multiple chunks, and Ogham and Runic share a single chunk.
00386 #define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80)  // 32 entries
00387 
00388 static PRUint8 gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] =
00389 { //table for 0x0700 - 0x16ff, one entry per chunk of 0x80 code points (comments give the chunk start)
00390     kRangeSyriac,            //u070x
00391     kRangeThaana,            //u078x
00392     kRangeUnassigned,        //u080x  placeholder (resolved in the secondary table)
00393     kRangeUnassigned,        //u088x  placeholder (resolved in the secondary table)
00394     kRangeDevanagari,        //u090x
00395     kRangeBengali,           //u098x
00396     kRangeGurmukhi,          //u0a0x
00397     kRangeGujarati,          //u0a8x
00398     kRangeOriya,             //u0b0x
00399     kRangeTamil,             //u0b8x
00400     kRangeTelugu,            //u0c0x
00401     kRangeKannada,           //u0c8x
00402     kRangeMalayalam,         //u0d0x
00403     kRangeSinhala,           //u0d8x
00404     kRangeThai,              //u0e0x  
00405     kRangeLao,               //u0e8x
00406     kRangeTibetan,           //u0f0x  placeholder (resolved in the secondary table)
00407     kRangeTibetan,           //u0f8x  placeholder (resolved in the secondary table)
00408     kRangeMyanmar,           //u100x
00409     kRangeGeorgian,          //u108x
00410     kRangeKorean,            //u110x  placeholder (resolved in the secondary table)
00411     kRangeKorean,            //u118x  placeholder (resolved in the secondary table)
00412     kRangeEthiopic,          //u120x  placeholder (resolved in the secondary table)
00413     kRangeEthiopic,          //u128x  placeholder (resolved in the secondary table)
00414     kRangeEthiopic,          //u130x  
00415     kRangeCherokee,          //u138x
00416     kRangeCanadian,          //u140x  placeholder (resolved in the secondary table)
00417     kRangeCanadian,          //u148x  placeholder (resolved in the secondary table)
00418     kRangeCanadian,          //u150x  placeholder (resolved in the secondary table)
00419     kRangeCanadian,          //u158x  placeholder (resolved in the secondary table)
00420     kRangeCanadian,          //u160x  
00421     kRangeOghamRunic,        //u168x  this contains two scripts, Ogham & Runic
00422 };
00423 // FindCharUnicodeRange: returns the range value stored in the tables for ch.
00424 // A two level index (one hex digit of ch per level) is almost enough for
00425 // locating a range. The exception is u03xx: since we don't really care about
00426 // the range for combining diacritical marks in our font application, they
00427 // are not discriminated further and share the u03xx entry. Future adoption of
00428 // this module for other use should be aware of this limitation; the
00429 // implementation can be extended if there is such a need. u05xx and uffxx are
00430 // resolved by a third-level subtable. For Indic, Southeast Asian scripts and some
00431 // other scripts between U+0700 and U+16FF, the third level is the tertiary table.
00432 PRUint32 FindCharUnicodeRange(PRUnichar ch)
00433 {
00434   PRUint32 range;
00435   // NOTE(review): ch >> 12 indexes a 16-entry row, so this assumes PRUnichar is 16 bits wide - confirm
00436   // first level: search subtable 0 with the most significant hex digit
00437   range = gUnicodeSubrangeTable[0][ch >> 12];
00438   
00439   if (range < kRangeTableBase)
00440     // the entry is already a specific range, no further lookup needed
00441     return range;
00442 
00443   // otherwise the entry is kRangeTableBase+n: look up the second hex digit in subtable n
00444   range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8];
00445   if (range < kRangeTableBase)
00446     return range;
00447   if (range < kRangeTertiaryTable)  // kRangeTableBase+n again: third hex digit in subtable n
00448     return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4];
00449 
00450   // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
00451   return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
00452 }