Back to index

nux  3.0.0
NUTF.cpp
Go to the documentation of this file.
00001 /*
00002  * Copyright 2010 Inalogic® Inc.
00003  *
00004  * This program is free software: you can redistribute it and/or modify it
00005  * under the terms of the GNU Lesser General Public License, as
00006  * published by the  Free Software Foundation; either version 2.1 or 3.0
00007  * of the License.
00008  *
00009  * This program is distributed in the hope that it will be useful, but
00010  * WITHOUT ANY WARRANTY; without even the implied warranties of
00011  * MERCHANTABILITY, SATISFACTORY QUALITY or FITNESS FOR A PARTICULAR
00012  * PURPOSE.  See the applicable version of the GNU Lesser General Public
00013  * License for more details.
00014  *
00015  * You should have received a copy of both the GNU Lesser General Public
00016  * License along with this program. If not, see <http://www.gnu.org/licenses/>
00017  *
00018  * Authored by: Jay Taoko <jaytaoko@inalogic.com>
00019  *
00020  */
00021 
00022 
00023 #include "NuxCore.h"
00024 #include "NUTF.h"
00025 
00026 namespace nux
00027 {
00028 
00029   NUTF8::NUTF8 (const UNICHAR *Source)
00030   {
00031     Convert (Source);
00032   }
00033 
00034   NUTF8::NUTF8 (const std::wstring &Source)
00035   {
00036     Convert (NUX_REINTERPRET_CAST (UNICHAR *, NUX_CONST_CAST (wchar_t *, Source.c_str() ) ) );
00037   }
00038 
00039   void NUTF8::Convert (const UNICHAR *Source)
00040   {
00041     int NumBytes = 0;
00042     // *6 each UTF16 char can translate to up to 6 bytes in UTF8
00043     // +1 for NULL char
00044     size_t Size = wcslen ( (wchar_t *) Source) * 6 + 1;
00045     utf8 = new char[Size];
00046     memset (utf8, 0, Size);
00047 
00048     unsigned char TwoBytes[2];
00049     TwoBytes[0] = '\0';
00050     TwoBytes[1] = '\0';
00051 
00052     utf8[0] = '\0';
00053 
00054     //     U-00000000 – U-0000007F:       0xxxxxxx
00055     //     U-00000080 – U-000007FF:       110xxxxx 10xxxxxx
00056     //     U-00000800 – U-0000FFFF:       1110xxxx 10xxxxxx 10xxxxxx
00057     //     U-00010000 – U-001FFFFF:       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
00058     //     U-00200000 – U-03FFFFFF:       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
00059     //     U-04000000 – U-7FFFFFFF:       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
00060     // The original specification of UTF-8 allowed for sequences of up to six bytes covering numbers up to 31 bits
00061     // (the original limit of the universal character set). However, UTF-8 was restricted by RFC 3629 to use only
00062     // the area covered by the formal Unicode definition, U+0000 to U+10FFFF, in November 2003. So UTF-8 code point is at most 4 bytes.
00063 
00064     for (size_t n = 0; Source[n] != 0; n++)
00065     {
00066       if (Source[n] <= 0x7F)
00067       {
00068         TwoBytes[0] = (char) Source[n];
00069         STRCAT_S (utf8, Size, (const char *) &TwoBytes[0]);
00070       }
00071       else
00072       {
00073         // 11 valid bits 2 bytes
00074         if (Source[n] <= 0x7FF)
00075         {
00076           // Extract the 5 highest bits
00077           TwoBytes[0] = (char) (0xC0 + (Source[n] >> 6) );
00078           NumBytes = 2;
00079         }
00080         // 16 valid bits 3 bytes
00081         else if (Source[n] <= 0xFFFF)
00082         {
00083           // Extract the highest 4 bits
00084           TwoBytes[0] = (char) (0xE0 + (Source[n] >> 12) );
00085           NumBytes = 3;
00086         }
00087         // Unichar is only 16 bits. Do no continue because (Source[n] >> 18) does not make sense.
00088         // 21 valid bits 4 bytes
00089         else if (Source[n] <= 0x1FFFFF)
00090         {
00091           // Extract the highest 3 bits
00092           TwoBytes[0] = (char) (0xF0 + (Source[n] >> 18) );
00093           NumBytes = 4;
00094         }
00095         // Split a 26 bit character into 5 bytes
00096         else if (Source[n] <= 0x3FFFFFF)
00097         {
00098           // Extract the highest 2 bits
00099           TwoBytes[0] = (char) (0xF8 + (Source[n] >> 24) );
00100           NumBytes = 5;
00101         }
00102         // Split a 31 bit character into 6 bytes
00103         else if (Source[n] <= 0x7FFFFFFF)
00104         {
00105           // Extract the highest bit
00106           TwoBytes[0] = (char) (0xFC + (Source[n] >> 30) );
00107           NumBytes = 6;
00108         }
00109 
00110         STRCAT_S (utf8, Size, (const char *) &TwoBytes[0]);
00111 
00112         // Extract the remaining bits - 6 bits at a time
00113         for (int i = 1, shift = (NumBytes - 2) * 6; shift >= 0; i++, shift -= 6)
00114         {
00115           TwoBytes[0] = (char) (0x80 + ( (Source[n] >> shift) & 0x3F) );
00116           STRCAT_S (utf8, Size, (const char *) &TwoBytes[0]);
00117         }
00118       }
00119     }
00120   }
00121 
00122 // void NUTF8::Convert(const t_UTF32* Source)
00123 // {
00124 //     int NumBytes = 0;
00125 //
00126 //     int Size = 0;
00127 //     while(Source[Size] != 0)
00128 //     {
00129 //         ++Size;
00130 //     }
00131 //     // *6: each UTF16 char can translate to up to 6 bytes in UTF8
00132 //     // +1: for NULL char
00133 //     Size = Size * 6 + 1;
00134 //     utf8 = new char[Size*sizeof(t_UTF32)];
00135 //     memset(utf8, 0, Size*sizeof(t_UTF32));
00136 //
00137 //     unsigned char TwoBytes[2];
00138 //     TwoBytes[0] = '\0'; TwoBytes[1] = '\0';
00139 //
00140 //     utf8[0] = '\0';
00141 //
00142 //     //     U-00000000 – U-0000007F:    0xxxxxxx
00143 //     //     U-00000080 – U-000007FF:    110xxxxx 10xxxxxx
00144 //     //     U-00000800 – U-0000FFFF:    1110xxxx 10xxxxxx 10xxxxxx
00145 //     //     U-00010000 – U-001FFFFF:    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
00146 //     //     U-00200000 – U-03FFFFFF:    111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
00147 //     //     U-04000000 – U-7FFFFFFF:    1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
00148 //     // The original specification of UTF-8 allowed for sequences of up to six bytes covering numbers up to 31 bits
00149 //     // (the original limit of the universal character set). However, UTF-8 was restricted by RFC 3629 to use only
00150 //     // the area covered by the formal Unicode definition, U+0000 to U+10FFFF, in November 2003. So UTF-8 code point is at most 4 bytes.
00151 //
00152 //     for(size_t n = 0; Source[n] != 0; n++)
00153 //     {
00154 //         if (Source[n] <= 0x7F)
00155 //         {
00156 //             TwoBytes[0] = (char)Source[n];
00157 //             STRCAT_S(utf8, Size, (const char*)&TwoBytes[0]);
00158 //         }
00159 //         else
00160 //         {
00161 //             // 11 valid bits 2 bytes
00162 //             if (Source[n] <= 0x7FF)
00163 //             {
00164 //                 // Extract the 5 highest bits
00165 //                 TwoBytes[0] = (char)(0xC0 + (Source[n] >> 6));
00166 //                 NumBytes = 2;
00167 //             }
00168 //             // 16 valid bits 3 bytes
00169 //             else if (Source[n] <= 0xFFFF)
00170 //             {
00171 //                 // Extract the highest 4 bits
00172 //                 TwoBytes[0] = (char)(0xE0 + (Source[n] >> 12));
00173 //                 NumBytes = 3;
00174 //             }
00175 //             // 21 valid bits 4 bytes
00176 //             else if (Source[n] <= 0x1FFFFF)
00177 //             {
00178 //                 // Extract the highest 3 bits
00179 //                 TwoBytes[0] = (char)(0xF0 + (Source[n] >> 18));
00180 //                 NumBytes = 4;
00181 //             }
00182 //             // Split a 26 bit character into 5 bytes
00183 //             else if (Source[n] <= 0x3FFFFFF)
00184 //             {
00185 //                 // Extract the highest 2 bits
00186 //                 TwoBytes[0] = (char)(0xF8 + (Source[n] >> 24));
00187 //                 NumBytes = 5;
00188 //             }
00189 //             // Split a 31 bit character into 6 bytes
00190 //             else if (Source[n] <= 0x7FFFFFFF)
00191 //             {
00192 //                 // Extract the highest bit
00193 //                 TwoBytes[0] = (char)(0xFC + (Source[n] >> 30));
00194 //                 NumBytes = 6;
00195 //             }
00196 //
00197 //             STRCAT_S(utf8, Size, (const char*)&TwoBytes[0]);
00198 //
00199 //             // Extract the remaining bits - 6 bits at a time
00200 //             for(int i = 1, shift = (NumBytes-2)*6; shift >= 0; i++, shift -= 6)
00201 //             {
00202 //                 TwoBytes[0] = (char)(0x80 + ((Source[n] >> shift) & 0x3F));
00203 //                 STRCAT_S(utf8, Size, (const char*)&TwoBytes[0]);
00204 //             }
00205 //         }
00206 //     }
00207 // }
00208 
00209   NUTF8::~NUTF8()
00210   {
00211     delete [] utf8;
00212   }
00213 
00214   NUTF8::operator const char* ()
00215   {
00216     return utf8;
00217   }
00218 
00220 // Convert each UTF-8 sequence in the source to a unicode (UNICHAR) character
00221 
00222   NUTF16::NUTF16 (const char *Source)
00223   {
00224     Convert (Source);
00225   }
00226 
00227   NUTF16::NUTF16 (const std::string &Source)
00228   {
00229     Convert (Source.c_str() );
00230   }
00231 
00232   void NUTF16::Convert (const char *Source)
00233   {
00234     //     U-00000000 – U-0000007F:       0xxxxxxx
00235     //     U-00000080 – U-000007FF:       110xxxxx 10xxxxxx
00236     //     U-00000800 – U-0000FFFF:       1110xxxx 10xxxxxx 10xxxxxx
00237     //     U-00010000 – U-001FFFFF:       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
00238     //     U-00200000 – U-03FFFFFF:       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
00239     //     U-04000000 – U-7FFFFFFF:       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
00240 
00241     unsigned char MSB;
00242     int temp = 0;
00243     int numbytes = 0; // Number of bytes used to represent the unicode char
00244     int pos = 0;
00245 
00246     size_t len = strlen (Source) + 1; // +1 for NULL char
00247     unicode = new UNICHAR[len*6];
00248 
00249     // Loop through the characters in the string and decode them
00250     for (size_t n = 0; n < len; ++n)
00251     {
00252       // Find the hexadecimal number following the equals sign
00253       MSB = Source[n];
00254 
00255       if (MSB <= 0x7F)
00256       {
00257         unicode[pos++] = (UNICHAR) MSB;
00258       }
00259       else
00260       {
00261         // 2 bytes
00262         if (MSB >= 0xC0 && MSB <= 0xDF)
00263         {
00264           temp = (MSB - 0xC0) << 6;
00265           numbytes = 2;
00266         }
00267         // 3 bytes
00268         else if (MSB >= 0xE0 && MSB <= 0xEF)
00269         {
00270           temp = (MSB - 0xE0) << 12;
00271           numbytes = 3;
00272         }
00273         // 4 bytes
00274         else if (MSB >= 0xF0 && MSB <= 0xF7)
00275         {
00276           temp = (MSB - 0xF0) << 18;
00277           numbytes = 4;
00278         }
00279         // 5 bytes
00280         else if (MSB >= 0xF8 && MSB <= 0xFB)
00281         {
00282           temp = (MSB - 0xF8) << 24;
00283           numbytes = 5;
00284         }
00285         // 6 bytes
00286         else if (MSB >= 0xFC && MSB <= 0xFD)
00287         {
00288           temp = (MSB - 0xFC) << 30;
00289           numbytes = 6;
00290         }
00291 
00292         // Loop through the remaining hexadecimal numbers representing the next unicode character
00293         for (int i = 0, shift = (numbytes - 2) * 6; shift >= 0; i++, shift -= 6)
00294         {
00295           int nVal = ( ( (unsigned char) Source[n+1+i]) - 0x80 ) << shift;
00296           temp += nVal;
00297         }
00298 
00299         // Add the unicode character to the final string
00300         unicode[pos++] = (UNICHAR) temp;
00301 
00302         // Move the character index in the source to the next unicode character
00303         n += (numbytes - 1);
00304       }
00305     }
00306   }
00307 
00308   NUTF16::~NUTF16()
00309   {
00310     delete [] unicode;
00311   }
00312 
00313   NUTF16::operator const UNICHAR* ()
00314   {
00315     return unicode;
00316   }
00317 
00318 }