Back to index

lightning-sunbird  0.9+nobinonly
nsUnicodeToTSCII.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* vim:expandtab:shiftwidth=2:tabstop=2:
00003  */
00004 /* ***** BEGIN LICENSE BLOCK *****
00005  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00006  *
00007  * The contents of this file are subject to the Mozilla Public License Version
00008  * 1.1 (the "License"); you may not use this file except in compliance with
00009  * the License. You may obtain a copy of the License at
00010  * http://www.mozilla.org/MPL/
00011  *
00012  * Software distributed under the License is distributed on an "AS IS" basis,
00013  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00014  * for the specific language governing rights and limitations under the
00015  * License.
00016  *
00017  * The Original Code is GNU C Library code (http://www.gnu.org)
00018  *
00019  * The Initial Developer of the Original Code is
00020  * Bruno Haible <bruno@clisp.org>.
00021  * Portions created by the Initial Developer are Copyright (C) 2002
00022  * the Free Software Foundation. All Rights Reserved.
00023  *
00024  * Contributor(s): 
00025  *   Jungshik Shin <jshin@mailaps.org> 
00026  *
00027  * Alternatively, the contents of this file may be used under the terms of
00028  * either the GNU General Public License Version 2 or later (the "GPL"), or
00029  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00030  * in which case the provisions of the GPL or the LGPL are applicable instead
00031  * of those above. If you wish to allow use of your version of this file only
00032  * under the terms of either the GPL or the LGPL, and not to allow others to
00033  * use your version of this file under the terms of the MPL, indicate your
00034  * decision by deleting the provisions above and replace them with the notice
00035  * and other provisions required by the GPL or the LGPL. If you do not delete
00036  * the provisions above, a recipient may use your version of this file under
00037  * the terms of any one of the MPL, the GPL or the LGPL.
00038  *
00039  * ***** END LICENSE BLOCK ***** */
00040 
00041 #include "nsUnicodeToTSCII.h"
00042 #include "nsMemory.h"
00043 #include "tamil.h"
00044 
00045 /* 
00046  *  TSCII is an 8-bit encoding consisting of:
00047  *  0x00..0x7F:       ASCII
00048  *  0x80..0x90, 0x95..0x9F, 0xAB..0xFE:
00049  *                    Tamil letters and glyphs
00050  *  0xA1..0xA5, 0xAA: Tamil combining letters (after the base character)
00051  *  0xA6..0xA8:       Tamil combining letters (before the base character)
00052  *  0x91..0x94:       Punctuation
00053  *  0xA9:             Symbols
00054  */
00055 
00056 //----------------------------------------------------------------------
00057 // Class nsUnicodeToTSCII [implementation]
00058   
00059 NS_IMPL_ISUPPORTS2(nsUnicodeToTSCII, nsIUnicodeEncoder, nsICharRepresentable)
00060 
00061 /* 
00062  * During Unicode (UTF-16) to TSCII conversion, mBuffer contains 
00063  * the last byte (or sometimes the last two bytes) still to be output.
00064  * This can be:
00065  *   0x00                     Nothing pending.
00066  *   0xB8..0xC9, 0x83..0x86   A consonant.
00067  *   0xEC, 0x8A               A consonant with VIRAMA sign (final or joining).
00068  *   0x87, 0xC38A             Two consonants combined through a VIRAMA sign. 
00069  */
00070 
00071 static const PRUint8 UnicharToTSCII[] =
00072 {
00073      0,    0,    0, 0xb7,    0, 0xab, 0xac, 0xfe, // 0x0B80..0x0B87
00074   0xae, 0xaf, 0xb0,    0,    0,    0, 0xb1, 0xb2, // 0x0B88..0x0B8F
00075   0xb3,    0, 0xb4, 0xb5, 0xb6, 0xb8,    0,    0, // 0x0B90..0x0B97
00076      0, 0xb9, 0xba,    0, 0x83,    0, 0xbb, 0xbc, // 0x0B98..0x0B9F
00077      0,    0,    0, 0xbd, 0xbe,    0,    0,    0, // 0x0BA0..0x0BA7
00078   0xbf, 0xc9, 0xc0,    0,    0,    0, 0xc1, 0xc2, // 0x0BA8..0x0BAF
00079   0xc3, 0xc8, 0xc4, 0xc7, 0xc6, 0xc5,    0, 0x84, // 0x0BB0..0x0BB7
00080   0x85, 0x86,    0,    0,    0,    0, 0xa1, 0xa2, // 0x0BB8..0x0BBF
00081   0xa3, 0xa4, 0xa5,    0,    0,    0, 0xa6, 0xa7, // 0x0BC0..0x0BC7
00082   0xa8,    0,    0,    0,    0,    0,    0,    0, // 0x0BC8..0x0BCF
00083      0,    0,    0,    0,    0,    0,    0, 0xaa, // 0x0BD0..0x0BD7
00084      0,    0,    0,    0,    0,    0,    0,    0, // 0x0BD8..0x0BDF
00085      0,    0,    0,    0,    0,    0, 0x80, 0x81, // 0x0BE0..0x0BE7
00086   0x8d, 0x8e, 0x8f, 0x90, 0x95, 0x96, 0x97, 0x98, // 0x0BE8..0x0BEF
00087   0x9d, 0x9e, 0x9f,    0,    0,    0,    0,    0, // 0x0BF0..0x0BF7
00088      0,    0,    0,    0,    0,    0,    0,    0  // 0x0BF8..0x0BFF
00089 };
00090 
00091 static const PRUint8 consonant_with_u[] =
00092 {
00093   0xcc, 0x99, 0xcd, 0x9a, 0xce, 0xcf, 0xd0, 0xd1, 0xd2,
00094   0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb
00095 };
00096 
00097 static const PRUint8 consonant_with_uu[] =
00098 {
00099   0xdc, 0x9b, 0xdd, 0x9c, 0xde, 0xdf, 0xe0, 0xe1, 0xe2,
00100   0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb
00101 };
00102 
00103 static const PRUint8 consonant_with_virama[18] =
00104 {
00105   0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4,
00106   0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd
00107 };
00108 
00109 
00110 // Modified implementation of Unicode to TSCII converter in glibc by 
00111 // Bruno Haible.  My modifications are based on Unicode 3.0 chap. 9 and 
00112 // the code chart for Tamil. 
00113 NS_IMETHODIMP 
00114 nsUnicodeToTSCII::Convert(const PRUnichar * aSrc, PRInt32 * aSrcLength, 
00115                           char * aDest, PRInt32 * aDestLength)
00116 {
00117   const PRUnichar * src = aSrc;
00118   const PRUnichar * srcEnd = aSrc + *aSrcLength;
00119   char * dest = aDest;
00120   char * destEnd = dest + *aDestLength;
00121 
00122   nsresult rv = NS_OK;
00123                       
00124   while (src < srcEnd && dest < destEnd) {
00125     PRUnichar ch = *src;
00126     if (mBuffer) {                        
00127       // Attempt to combine the last character with this one.
00128       PRUint32 last = mBuffer;
00129                             
00130       // last : consonant 
00131       if (IS_TSC_CONSONANT(last)) {                      
00132         if (ch == UNI_VOWELSIGN_U && IS_TSC_CONSONANT1(last)) {                      
00133           *dest++ = consonant_with_u[last - TSC_KA];
00134           mBuffer = 0;                  
00135           ++src;
00136           continue;
00137         }                      
00138   
00139         if (ch == UNI_VOWELSIGN_UU && IS_TSC_CONSONANT1(last)) {                      
00140           *dest++ = consonant_with_uu[last - TSC_KA];          
00141           mBuffer = 0;                  
00142           ++src;                  
00143           continue;                  
00144         }                      
00145   
00146         // reorder. vowel sign goes to the left of consonant
00147         if (IS_UNI_LEFT_VOWELSIGN(ch)) {                      
00148           if (dest + 2 > destEnd)
00149             goto error_more_output;
00150           *dest++ = TSC_LEFT_VOWELSIGN(ch);
00151           *dest++ = last;                
00152           mBuffer = 0;                
00153           ++src;                  
00154           continue;                  
00155         }                      
00156   
00157         // split and reorder. consonant goes bet. two parts
00158         if (IS_UNI_2PARTS_VOWELSIGN(ch)) {                      
00159           if (dest + 3 > destEnd)
00160             goto error_more_output;
00161           *dest++ = TSC_LEFT_VOWEL_PART(ch);
00162           *dest++ = last;                
00163           *dest++ = TSC_RIGHT_VOWEL_PART(ch);
00164           mBuffer = 0;                
00165           ++src;                  
00166           continue;                  
00167         }                      
00168   
00169         // Virama
00170         if (ch == UNI_VIRAMA) {                      
00171           // consonant KA can form a conjunct with consonant SSA(SHA).
00172           // buffer dead consonant 'K' for the now.
00173           if (last == TSC_KA) {                 
00174             mBuffer = TSC_KA_DEAD;
00175           }
00176           // SA can form a conjunct when followed by 'RA'. 
00177           // buffer dead consonant 'S' for the now.
00178           else if (last == TSC_SA) {
00179             mBuffer = TSC_SA_DEAD;                
00180           }
00181           else {                    
00182             *dest++ = IS_TSC_CONSONANT1(last) ?
00183               consonant_with_virama[last - TSC_KA] : last + 5;
00184             mBuffer = 0;                
00185           }                    
00186           ++src;                  
00187           continue;                  
00188         }                      
00189 
00190         // consonant TA forms a ligature with vowel 'I' or 'II'.
00191         if (last == TSC_TA && (ch == UNI_VOWELSIGN_I || ch == UNI_VOWELSIGN_II)) {                      
00192           *dest++ = ch - (UNI_VOWELSIGN_I - TSC_TI_LIGA);
00193           mBuffer = 0;                  
00194           ++src;                  
00195           continue;                  
00196         }                      
00197       }                      
00198       else if (last == TSC_KA_DEAD) {                      
00199         // Kd + SSA =  K.SSA
00200         if (ch == UNI_SSA) {                      
00201           mBuffer = TSC_KSSA; 
00202           ++src;                  
00203           continue;                  
00204         }                      
00205       }                      
00206       else if (last == TSC_SA_DEAD) {                      
00207         // Sd + RA = S.RA. Buffer RA + Sd. 
00208         if (ch == UNI_RA) {                      
00209           mBuffer = 0xc38a;                
00210           ++src;                  
00211           continue;                  
00212         }                      
00213       }                      
00214       else if (last == TSC_KSSA) {                      
00215         if (ch == UNI_VIRAMA) {
00216           *dest++ = (char) TSC_KSSA_DEAD;
00217           mBuffer = 0;                  
00218           ++src;                  
00219           continue;                  
00220         }                      
00221 
00222         // vowel splitting/reordering should be done around conjuncts as well.
00223         // reorder. vowel sign goes to the left of consonant
00224         if (IS_UNI_LEFT_VOWELSIGN(ch)) {                      
00225           if (dest + 2 > destEnd)
00226             goto error_more_output;
00227           *dest++ = TSC_LEFT_VOWELSIGN(ch);
00228           *dest++ = last;                
00229           mBuffer = 0;                
00230           ++src;                  
00231           continue;                  
00232         }                      
00233   
00234         // split and reorder. consonant goes bet. two parts
00235         if (IS_UNI_2PARTS_VOWELSIGN(ch)) {                      
00236           if (dest + 3 > destEnd)
00237             goto error_more_output;
00238           *dest++ = TSC_LEFT_VOWEL_PART(ch);
00239           *dest++ = last;                
00240           *dest++ = TSC_RIGHT_VOWEL_PART(ch);
00241           mBuffer = 0;                
00242           ++src;                  
00243           continue;                  
00244         }                      
00245       }                      
00246       else {
00247         NS_ASSERTION(last == 0xc38a, "No other value can be buffered");
00248         if (ch == UNI_VOWELSIGN_II) {                      
00249           *dest++ = (char) TSC_SRII_LIGA;
00250           mBuffer = 0;                  
00251           ++src;                  
00252           continue;                  
00253         }                      
00254         else {
00255           // put back TSC_SA_DEAD and TSC_RA
00256           *dest++ = (char) TSC_SA_DEAD;
00257           mBuffer = TSC_RA;
00258           ++src;                  
00259           continue;                  
00260         }  
00261       }                      
00262                           
00263       /* Output the buffered character.  */              
00264       if (last >> 8) {                      
00265         if (dest + 2 >  destEnd)
00266           goto error_more_output;
00267         *dest++ = last & 0xff;              
00268         *dest++ = (last >> 8) & 0xff;              
00269       }                      
00270       else                      
00271         *dest++ = last & 0xff;                
00272       mBuffer = 0;                    
00273       continue;                    
00274     }                        
00275                         
00276     if (ch < 0x80)   // Plain ASCII character.
00277       *dest++ = (char)ch;                    
00278     else if (IS_UNI_TAMIL(ch)) {                        
00279       PRUint8 t = UnicharToTSCII[ch - UNI_TAMIL_START];
00280                             
00281       if (t != 0) {                      
00282           if (IS_TSC_CONSONANT(t))
00283             mBuffer = (PRUint32) t;              
00284           else                    
00285             *dest++ = t;                  
00286       }                      
00287       else if (IS_UNI_2PARTS_VOWELSIGN(ch)) {   
00288           // actually this is an illegal sequence.
00289           if (dest + 2 > destEnd)
00290             goto error_more_output;
00291 
00292           *dest++ = TSC_LEFT_VOWEL_PART(ch);
00293           *dest++ = TSC_RIGHT_VOWEL_PART(ch);
00294       }                      
00295       else {
00296         *aDestLength = dest - aDest;
00297         return NS_ERROR_UENC_NOMAPPING;
00298       }                      
00299     }                        
00300     else if (ch == 0x00A9)                  
00301       *dest++ = (char)ch;                    
00302     else if (IS_UNI_SINGLE_QUOTE(ch))
00303       *dest++ = ch - UNI_LEFT_SINGLE_QUOTE + TSC_LEFT_SINGLE_QUOTE;
00304     else if (IS_UNI_DOUBLE_QUOTE(ch))
00305       *dest++ = ch - UNI_LEFT_DOUBLE_QUOTE + TSC_LEFT_DOUBLE_QUOTE;
00306     else {
00307       *aDestLength = dest - aDest;
00308       return NS_ERROR_UENC_NOMAPPING;
00309     }                        
00310                         
00311     /* Now that we wrote the output increment the input pointer.  */        
00312     ++src;                      
00313   }
00314 
00315   // flush the buffer
00316   if (mBuffer >> 8) {                      
00317     // Write out the last character, two bytes. 
00318     if (dest + 2 > destEnd)
00319       goto error_more_output;
00320     *dest++ = (mBuffer >> 8) & 0xff;            
00321     *dest++ = mBuffer & 0xff;              
00322     mBuffer = 0;
00323   }                      
00324   else if (mBuffer) {
00325     // Write out the last character, a single byte.
00326     if (dest >= destEnd)
00327       goto error_more_output;
00328     *dest++ = mBuffer & 0xff;              
00329     mBuffer = 0;
00330   }                      
00331 
00332   *aSrcLength = src - aSrc;
00333   *aDestLength = dest - aDest;
00334   return rv;
00335 
00336 error_more_output:
00337   *aSrcLength = src - aSrc;
00338   *aDestLength = dest - aDest;
00339   return NS_OK_UENC_MOREOUTPUT;
00340 }
00341 
00342 NS_IMETHODIMP 
00343 nsUnicodeToTSCII::Finish(char* aDest, PRInt32* aDestLength)
00344 {
00345   if (!mBuffer) {
00346     *aDestLength = 0;
00347     return NS_OK;
00348   }
00349 
00350   if (mBuffer >> 8) {                      
00351     // Write out the last character, two bytes. 
00352     if (*aDestLength < 2) {
00353       *aDestLength = 0;
00354       return NS_OK_UENC_MOREOUTPUT;
00355     }
00356     *aDest++ = (mBuffer >> 8) & 0xff;            
00357     *aDest++ = mBuffer & 0xff;              
00358     mBuffer = 0;
00359     *aDestLength = 2;
00360   }                      
00361   else {                      
00362     // Write out the last character, a single byte.
00363     if (*aDestLength < 1) {                    
00364       *aDestLength = 0;
00365       return NS_OK_UENC_MOREOUTPUT;
00366     }
00367     *aDest++ = mBuffer & 0xff;              
00368     mBuffer = 0;
00369     *aDestLength = 1;
00370   }                      
00371   return NS_OK;
00372 }
00373 
00374 //================================================================
00375 NS_IMETHODIMP 
00376 nsUnicodeToTSCII::Reset()
00377 {
00378   mBuffer = 0;
00379   return NS_OK;
00380 }
00381 
00382 NS_IMETHODIMP 
00383 nsUnicodeToTSCII::GetMaxLength(const PRUnichar * aSrc, PRInt32 aSrcLength,
00384                                  PRInt32 * aDestLength)
00385 {
00386   // Some Tamil letters  can be decomposed into 2 glyphs in TSCII.
00387   *aDestLength = aSrcLength *  2;
00388   return NS_OK;
00389 }
00390 
00391 
00392 NS_IMETHODIMP 
00393 nsUnicodeToTSCII::FillInfo(PRUint32* aInfo)
00394 {
00395   // Tamil block is so sparse.
00396   static const PRUint8 coverage[] = {
00397     0xe8, // 11101000  U+0B87 - U+0B80
00398     0xc7, // 11000111  U+0B8F - U+0B88
00399     0x3d, // 00111101  U+0B97 - U+0B90
00400     0xd6, // 11010110  U+0B9F - U+0B98
00401     0x18, // 00011000  U+0BA7 - U+0BA0
00402     0xc7, // 11000111  U+0BAF - U+0BA8
00403     0xbf, // 10111111  U+0BB7 - U+0BB0
00404     0xc7, // 11000111  U+0BBF - U+0BB8
00405     0xc7, // 11000111  U+0BC7 - U+0BC0
00406     0x3d, // 00111101  U+0BCF - U+0BC8
00407     0x80, // 10000000  U+0BD7 - U+0BD0
00408     0x00, // 00000000  U+0BDF - U+0BD8
00409     0x80, // 10000000  U+0BE7 - U+0BE0
00410     0xff, // 11111111  U+0BEF - U+0BE8
00411     0x07, // 00000111  U+0BF7 - U+0BF0
00412   };
00413 
00414   PRUnichar i;
00415   for(i = 0; i <  0x78; i++)
00416     if (coverage[i / 8] & (1 << (i % 8)))
00417       SET_REPRESENTABLE(aInfo, i + UNI_TAMIL_START);
00418 
00419   // TSCII is a superset of US-ASCII.
00420   for(i = 0x20; i < 0x7f; i++)
00421      SET_REPRESENTABLE(aInfo, i);
00422 
00423   // additional characters in TSCII
00424   SET_REPRESENTABLE(aInfo, 0xA9);   // copyright sign
00425   SET_REPRESENTABLE(aInfo, UNI_LEFT_SINGLE_QUOTE);
00426   SET_REPRESENTABLE(aInfo, UNI_RIGHT_SINGLE_QUOTE);
00427   SET_REPRESENTABLE(aInfo, UNI_LEFT_DOUBLE_QUOTE);
00428   SET_REPRESENTABLE(aInfo, UNI_RIGHT_DOUBLE_QUOTE);
00429 
00430   return NS_OK;
00431 }
00432 
00433 NS_IMETHODIMP 
00434 nsUnicodeToTSCII::SetOutputErrorBehavior(PRInt32 aBehavior, 
00435                                            nsIUnicharEncoder *aEncoder, 
00436                                            PRUnichar aChar)
00437 {
00438   return NS_OK;
00439 }
00440 
00441 
00442 // same as the mapping of the C1(0x80-0x9f) part of  Windows-1252 to Unicode
00443 const static PRUnichar gTSCIIToTTF[] = {
00444   0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
00445   0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
00446   0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
00447   0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
00448 };
00449 
00450 //----------------------------------------------------------------------
00451 // Class nsUnicodeToTamilTTF [implementation]
00452 //
00453 NS_IMPL_ISUPPORTS_INHERITED0(nsUnicodeToTamilTTF, nsUnicodeToTSCII)
00454 
00455 NS_IMETHODIMP 
00456 nsUnicodeToTamilTTF::Convert(const PRUnichar * aSrc, 
00457                              PRInt32 * aSrcLength, char * aDest, 
00458                              PRInt32 * aDestLength)
00459 {
00460 
00461   PRInt32 medLen, destLen;
00462   char *med;
00463 
00464   GetMaxLength(aSrc, *aSrcLength, &destLen);
00465   NS_ASSERTION(destLen  <= *aDestLength, "insufficient dest. buffer size");
00466 
00467   // TSCII converter is a single byte encoder and takes half the space 
00468   // taken by TamilTTF encoder.
00469   medLen = destLen / 2; 
00470 
00471   if (medLen > CHAR_BUFFER_SIZE) {
00472     med = (char *) nsMemory::Alloc(medLen);
00473     if (!med)
00474       return NS_ERROR_OUT_OF_MEMORY;
00475   }
00476   else 
00477     med = mStaticBuffer;
00478 
00479   nsresult rv = nsUnicodeToTSCII::Convert(aSrc, aSrcLength, med, &medLen);
00480 
00481   if (NS_FAILED(rv)) {
00482     if (med != mStaticBuffer)
00483       nsMemory::Free(med);
00484     return rv;
00485   }
00486 
00487   PRInt32 i, j;
00488 
00489   // widen 8bit TSCII to pseudo-Unicode font encoding of TSCII-Tamil font
00490   for (i = 0, j = 0; i < medLen; i++) {
00491     // Only C1 part(0x80-0x9f) needs to be mapped as if they're CP1251.
00492     PRUnichar ucs2 = (med[i] & 0xe0) == 0x80 ? 
00493                      gTSCIIToTTF[med[i] & 0x7f] : PRUint8(med[i]);
00494     // A lot of TSCII fonts are still based on TSCII 1.6 so that 
00495     // they have Tamil vowel 'I' at 0xad instead of 0xfe.
00496     if (ucs2 == 0xfe) ucs2 = 0xad;
00497     aDest[j++] = PRUint8((ucs2 & 0xff00) >> 8);
00498     aDest[j++] = PRUint8(ucs2 & 0x00ff);
00499   }
00500 
00501   *aDestLength = j;
00502 
00503   if (med != mStaticBuffer)
00504     nsMemory::Free(med);
00505 
00506   return NS_OK;
00507 }
00508 
00509 NS_IMETHODIMP
00510 nsUnicodeToTamilTTF::GetMaxLength(const PRUnichar * aSrc, PRInt32 aSrcLength, PRInt32 * aDestLength)
00511 {
00512   // Each Tamil character can generate at most two presentation forms,
00513   // but we're 'extending' them to 16bit shorts, which accounts for 
00514   // additional factor of 2.
00515   *aDestLength = (aSrcLength + 1) *  4; 
00516   
00517   return NS_OK;
00518 }
00519 
00520 NS_IMETHODIMP 
00521 nsUnicodeToTamilTTF::SetOutputErrorBehavior(PRInt32 aBehavior, 
00522                                             nsIUnicharEncoder *aEncoder, 
00523                                             PRUnichar aChar)
00524 {
00525   if (aBehavior == kOnError_CallBack && aEncoder == nsnull)
00526     return NS_ERROR_NULL_POINTER;
00527   mErrEncoder = aEncoder;
00528   mErrBehavior = aBehavior;
00529   mErrChar = aChar;
00530   return NS_OK;
00531 }
00532