Back to index

nux  3.0.0
NUni.h
Go to the documentation of this file.
00001 /*
00002  * Copyright 2010 Inalogic® Inc.
00003  *
00004  * This program is free software: you can redistribute it and/or modify it
00005  * under the terms of the GNU Lesser General Public License, as
00006  * published by the  Free Software Foundation; either version 2.1 or 3.0
00007  * of the License.
00008  *
00009  * This program is distributed in the hope that it will be useful, but
00010  * WITHOUT ANY WARRANTY; without even the implied warranties of
00011  * MERCHANTABILITY, SATISFACTORY QUALITY or FITNESS FOR A PARTICULAR
00012  * PURPOSE.  See the applicable version of the GNU Lesser General Public
00013  * License for more details.
00014  *
00015  * You should have received a copy of both the GNU Lesser General Public
00016  * License along with this program. If not, see <http://www.gnu.org/licenses/>
00017  *
00018  * Authored by: Jay Taoko <jaytaoko@inalogic.com>
00019  *
00020  */
00021 
00022 
00023 #ifndef NUNI_H
00024 #define NUNI_H
00025 
00026 /*
00027  * Copyright 2001-2004 Unicode, Inc.
00028  *
00029  * Disclaimer
00030  *
00031  * This source code is provided as is by Unicode, Inc. No claims are
00032  * made as to fitness for any particular purpose. No warranties of any
00033  * kind are expressed or implied. The recipient agrees to determine
00034  * applicability of information provided. If this file has been
00035  * purchased on magnetic or optical media from Unicode, Inc., the
00036  * sole remedy for any claim will be exchange of defective media
00037  * within 90 days of receipt.
00038  *
00039  * Limitations on Rights to Redistribute This Code
00040  *
00041  * Unicode, Inc. hereby grants the right to freely use the information
00042  * supplied in this file in the creation of products supporting the
00043  * Unicode Standard, and to make copies of this file in any form
00044  * for internal or external distribution as long as this notice
00045  * remains attached.
00046  */
00047 
00048 /* ---------------------------------------------------------------------
00049 
00050     Conversions between UTF-32, UTF-16, and UTF-8.  Header file.
00051 
00052     Several functions are included here, forming a complete set of
00053     conversions between the three formats.  UTF-7 is not included
00054     here, but is handled in a separate source file.
00055 
00056     Each of these routines takes pointers to input buffers and output
00057     buffers.  The input buffers are const.
00058 
00059     Each routine converts the text between *sourceStart and sourceEnd,
00060     putting the result into the buffer between *targetStart and
00061     targetEnd. Note: the end pointers are *after* the last item: e.g.
00062     *(sourceEnd - 1) is the last item.
00063 
00064     The return result indicates whether the conversion was successful,
00065     and if not, whether the problem was in the source or target buffers.
00066     (Only the first encountered problem is indicated.)
00067 
00068     After the conversion, *sourceStart and *targetStart are both
00069     updated to point to the end of the last text successfully converted in
00070     the respective buffers.
00071 
00072     Input parameters:
00073         sourceStart - pointer to a pointer to the source buffer.
00074                 The contents of this are modified on return so that
00075                 it points at the next thing to be converted.
00076         targetStart - similarly, pointer to pointer to the target buffer.
00077         sourceEnd, targetEnd - respectively pointers to the ends of the
00078                 two buffers, for overflow checking only.
00079 
00080     These conversion functions take a ConversionFlags argument. When this
00081     flag is set to strict, both irregular sequences and isolated surrogates
00082     will cause an error.  When the flag is set to lenient, both irregular
00083     sequences and isolated surrogates are converted.
00084 
00085     Whether the flag is strict or lenient, all illegal sequences will cause
00086     an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
00087     or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
00088     must check for illegal sequences.
00089 
00090     When the flag is set to lenient, characters over 0x10FFFF are converted
00091     to the replacement character; otherwise (when the flag is set to strict)
00092     they constitute an error.
00093 
00094     Output parameters:
00095         The value "sourceIllegal" is returned from some routines if the input
00096         sequence is malformed.  When "sourceIllegal" is returned, the source
00097         value will point to the illegal value that caused the problem. E.g.,
00098         in UTF-8 when a sequence is malformed, it points to the start of the
00099         malformed sequence.
00100 
00101     Author: Mark E. Davis, 1994.
00102     Rev History: Rick McGowan, fixes & updates May 2001.
00103                  Fixes & updates, Sept 2001.
00104 
00105 ------------------------------------------------------------------------ */
00106 
00107 /* ---------------------------------------------------------------------
00108     The following 4 definitions are compiler-specific (all 4 are commented out below).
00109     The C standard does not guarantee that wchar_t has at least
00110     16 bits, so wchar_t is no less portable than unsigned short!
00111     All should be unsigned values to avoid sign extension during
00112     bit mask & shift operations.
00113 ------------------------------------------------------------------------ */
00114 
00115 //typedef unsigned long   unsigned int;  /* at least 32 bits */
00116 //typedef unsigned short  wchar_t;  /* at least 16 bits */
00117 //typedef unsigned char   unsigned char;   /* typically 8 bits */
00118 //typedef unsigned char   Boolean; /* 0 or 1 */
00119 
00120 
00121 namespace nux
00122 {
00123   /* Declarations for converting text between UTF-8, UTF-16 and UTF-32. */
00124   /* Fundamental constants. These are macros, so namespace nux does NOT scope them. */
00125 #define UNI_REPLACEMENT_CHAR ((unsigned int)0x0000FFFD) /* substituted for characters over 0x10FFFF when lenient */
00126 #define UNI_MAX_BMP ((unsigned int)0x0000FFFF)
00127 #define UNI_MAX_UTF16 ((unsigned int)0x0010FFFF)
00128 #define UNI_MAX_UTF32 ((unsigned int)0x7FFFFFFF)
00129 #define UNI_MAX_LEGAL_UTF32 ((unsigned int)0x0010FFFF) /* UTF-32 values above this are illegal */
00130   /* Status returned by every Convert* routine; only the first problem encountered is reported. */
00131   typedef enum
00132   {
00133     conversionOK = 0,       /* conversion successful */
00134     sourceExhausted,        /* source ended on a partial character */
00135     targetExhausted,        /* insufficient room in target for conversion */
00136     sourceIllegal           /* source sequence is illegal/malformed; *sourceStart is left at the offending value */
00137   } ConversionResult;
00138   /* Strictness selector passed as the `flags` argument of the Convert* routines. */
00139   typedef enum
00140   {
00141     strictConversion = 0,   /* irregular sequences and isolated surrogates cause an error */
00142     lenientConversion       /* irregular sequences and isolated surrogates are converted */
00143   } ConversionFlags;
00144   /* Each Convert* routine converts [*sourceStart, sourceEnd) into [*targetStart, targetEnd) and advances both start pointers past the text converted. UTF-8 -> UTF-16: */
00145   ConversionResult ConvertUTF8toUTF16 (
00146     const unsigned char **sourceStart, const unsigned char *sourceEnd,
00147     wchar_t **targetStart, wchar_t *targetEnd, ConversionFlags flags);
00148   /* UTF-16 -> UTF-8; same pointer and return contract as ConvertUTF8toUTF16. */
00149   ConversionResult ConvertUTF16toUTF8 (
00150     const wchar_t **sourceStart, const wchar_t *sourceEnd,
00151     unsigned char **targetStart, unsigned char *targetEnd, ConversionFlags flags);
00152   /* UTF-8 -> UTF-32; same pointer and return contract as ConvertUTF8toUTF16. */
00153   ConversionResult ConvertUTF8toUTF32 (
00154     const unsigned char **sourceStart, const unsigned char *sourceEnd,
00155     unsigned int **targetStart, unsigned int *targetEnd, ConversionFlags flags);
00156   /* UTF-32 -> UTF-8; same pointer and return contract as ConvertUTF8toUTF16. */
00157   ConversionResult ConvertUTF32toUTF8 (
00158     const unsigned int **sourceStart, const unsigned int *sourceEnd,
00159     unsigned char **targetStart, unsigned char *targetEnd, ConversionFlags flags);
00160   /* UTF-16 -> UTF-32; same pointer and return contract as ConvertUTF8toUTF16. */
00161   ConversionResult ConvertUTF16toUTF32 (
00162     const wchar_t **sourceStart, const wchar_t *sourceEnd,
00163     unsigned int **targetStart, unsigned int *targetEnd, ConversionFlags flags);
00164   /* UTF-32 -> UTF-16; same pointer and return contract as ConvertUTF8toUTF16. */
00165   ConversionResult ConvertUTF32toUTF16 (
00166     const unsigned int **sourceStart, const unsigned int *sourceEnd,
00167     wchar_t **targetStart, wchar_t *targetEnd, ConversionFlags flags);
00168   /* NOTE(review): presumably reports whether [source, sourceEnd) holds a legal UTF-8 sequence -- semantics are not shown in this header; confirm in the implementation. */
00169   bool isLegalUTF8Sequence (const unsigned char *source, const unsigned char *sourceEnd);
00170 
00171   /* NOTE(review): str/max_len/end presumably mirror g_utf8_validate's arguments -- confirm against the implementation. */
00172   /* intended to work the same as g_utf8_validate */
00173   bool tr_utf8_validate ( const char *str, int max_len, const char **end );
00174 
00175 } /* namespace nux */
00176 
00177 #endif /* NUNI_H */