Back to index

nux  3.0.0
NUTF.h
Go to the documentation of this file.
00001 /*
00002  * Copyright 2010 Inalogic® Inc.
00003  *
00004  * This program is free software: you can redistribute it and/or modify it
00005  * under the terms of the GNU Lesser General Public License, as
00006  * published by the  Free Software Foundation; either version 2.1 or 3.0
00007  * of the License.
00008  *
00009  * This program is distributed in the hope that it will be useful, but
00010  * WITHOUT ANY WARRANTY; without even the implied warranties of
00011  * MERCHANTABILITY, SATISFACTORY QUALITY or FITNESS FOR A PARTICULAR
00012  * PURPOSE.  See the applicable version of the GNU Lesser General Public
00013  * License for more details.
00014  *
00015  * You should have received a copy of both the GNU Lesser General Public
00016  * License along with this program. If not, see <http://www.gnu.org/licenses/>
00017  *
00018  * Authored by: Jay Taoko <jaytaoko@inalogic.com>
00019  *
00020  */
00021 
00022 
00023 #ifndef NUTF_H
00024 #define NUTF_H
00025 
00026 // http://en.wikipedia.org/wiki/UTF-16
00027 
00028 // In computing, UTF-16 (16-bit Unicode Transformation Format) is a variable-length character encoding
00029 // for Unicode, capable of encoding the entire Unicode repertoire. The encoding form maps code points
00030 // (characters) into a sequence of 16-bit words, called code units. For characters in the Basic
00031 // Multilingual Plane (BMP) the resulting encoding is a single 16-bit word. For characters in the other
00032 // planes, the encoding will result in a pair of 16-bit words, together called a surrogate pair. All possible
00033 // code points from U+0000 through U+10FFFF, except for the surrogate code points U+D800–U+DFFF
00034 // (which are not characters), are uniquely mapped by UTF-16 regardless of the code point's current or
00035 // future character assignment or use.
00036 //
00037 // As many uses in computing require units of bytes (octets) there are three related encoding schemes
00038 // which map to octet sequences instead of words: namely UTF-16, UTF-16BE, and UTF-16LE. They
00039 // differ only in the byte order chosen to represent each 16-bit unit and whether they make use of a
00040 // Byte Order Mark. All of the schemes will result in either a 2 or 4-byte sequence for any given character.
00041 //
00042 // UTF-16 is officially defined in Annex Q of the international standard ISO/IEC 10646-1. It is also
00043 // described in The Unicode Standard version 3.0 and higher, as well as in the IETF's RFC 2781.
00044 //
00045 // UCS-2 (2-byte Universal Character Set) is an obsolete character encoding which is a predecessor
00046 // to UTF-16. The UCS-2 encoding form is nearly identical to that of UTF-16, except that it does not
00047 // support surrogate pairs and therefore can only encode characters in the BMP range U+0000 through
00048 // U+FFFF. As a consequence it is a fixed-length encoding that always encodes characters into a
00049 // single 16-bit value. As with UTF-16, there are three related encoding schemes (UCS-2, UCS-2BE, UCS-2LE)
00050 // that map characters to a specific byte sequence.
00051 //
00052 // Because of the technical similarities and upwards compatibility from UCS-2 to UTF-16, the two
00053 // encodings are often erroneously conflated and used as if interchangeable, so that strings encoded
00054 // in UTF-16 are sometimes misidentified as being encoded in UCS-2.
00055 
00056 namespace nux
00057 {
00058 
00060   class NUTF8  // Wraps a char buffer built from a UNICHAR / std::wstring source; presumably its UTF-8 encoding (inferred from the names -- confirm against the implementation).
00061   {
00062     // UTF-8 encoded characters may theoretically be up to six bytes long; however, 16-bit BMP characters are at most three bytes long.
00063   public:
00064     explicit NUTF8 (const UNICHAR *Source);       // Construct from a UNICHAR string (assumes NUL-terminated -- TODO confirm); explicit, so no implicit conversion from a pointer.
00065     explicit NUTF8 (const std::wstring &Source);  // Construct from a std::wstring; explicit as well.
00066     ~NUTF8();                                     // Defined out of line; presumably releases utf8 -- verify in the .cpp.
00067     // NOTE(review): a raw char* member plus a user-declared destructor, but no copy constructor/assignment declared here; if the destructor frees utf8, copying an instance would double-free -- confirm.
00068     operator const char* ();                      // Implicit conversion to const char*; not const-qualified, so it cannot be used on a const NUTF8. Presumably returns utf8 -- TODO confirm.
00069 
00070   private:
00071     void Convert (const UNICHAR *);               // Conversion helper taking the UNICHAR input; presumably fills utf8 (definition not visible here).
00072     //void Convert(const t_UTF32*);  // (disabled overload taking t_UTF32 input, kept for reference)
00073     char *utf8;                                   // Raw pointer to the char data handed out by the conversion operator (ownership assumed to be this object's -- confirm).
00074 
00075   };
00076 
00078   class NUTF16  // Wraps a UNICHAR buffer built from a char / std::string source; presumably its UTF-16 encoding (inferred from the names -- confirm against the implementation).
00079   {
00080   public:
00081     explicit NUTF16 (const char *Source);         // Construct from a char string (assumes NUL-terminated -- TODO confirm); explicit, so no implicit conversion from a pointer.
00082     explicit NUTF16 (const std::string &Source);  // Construct from a std::string; explicit as well.
00083     ~NUTF16();                                    // Defined out of line; presumably releases unicode -- verify in the .cpp.
00084     // NOTE(review): a raw UNICHAR* member plus a user-declared destructor, but no copy constructor/assignment declared here; if the destructor frees unicode, copying an instance would double-free -- confirm.
00085     operator const UNICHAR* ();                   // Implicit conversion to const UNICHAR*; not const-qualified, so it cannot be used on a const NUTF16. Presumably returns unicode -- TODO confirm.
00086 
00087   private:
00088     void Convert (const char *);                  // Conversion helper taking the char input; presumably fills unicode (definition not visible here).
00089     UNICHAR *unicode;                             // Raw pointer to the UNICHAR data handed out by the conversion operator (ownership assumed to be this object's -- confirm).
00090 
00091   };
00092 
00093 }
00094 
00095 #endif // NUTF_H