Back to index

openldap  2.4.31
t61.c
Go to the documentation of this file.
00001 /* $OpenLDAP$ */
00002 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
00003  *
00004  * Copyright 2002-2012 The OpenLDAP Foundation.
00005  * All rights reserved.
00006  *
00007  * Redistribution and use in source and binary forms, with or without
00008  * modification, are permitted only as authorized by the OpenLDAP
00009  * Public License.
00010  *
00011  * A copy of this license is available in the file LICENSE in the
00012  * top-level directory of the distribution or, alternatively, at
00013  * <http://www.OpenLDAP.org/license.html>.
00014  */
00015 /* ACKNOWLEDGEMENTS:
00016  * This work was initially developed by Howard Chu for inclusion in
00017  * OpenLDAP Software.
00018  */
00019 
00020 /*
00021  * Basic T.61 <-> UTF-8 conversion
00022  *
00023  * These routines will perform a lossless translation from T.61 to UTF-8
00024  * and a lossy translation from UTF-8 to T.61.
00025  */
00026 
00027 #include "portable.h"
00028 
00029 #include <stdio.h>
00030 
00031 #include <ac/stdlib.h>
00032 
00033 #include <ac/socket.h>
00034 #include <ac/string.h>
00035 #include <ac/time.h>
00036 
00037 #include "ldap-int.h"
00038 #include "ldap_utf8.h"
00039 
00040 #include "ldap_defaults.h"
00041 
00042 /*
00043  * T.61 is somewhat braindead; even in the 7-bit space it is not
00044  * completely equivalent to 7-bit US-ASCII. Our definition of the
00045  * character set comes from RFC 1345 with a slightly more readable
00046  * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
00047  *
00048  * Even though '#' and '$' are present in the 7-bit US-ASCII space,
00049  * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
00050  * xA6 and xA4. 
00051  *
00052  * Also T.61 lacks
00053  *     backslash     \      (x5C)
00054  *     caret         ^      (x5E)
00055  *     backquote     `      (x60)
00056  *     left brace    {      (x7B)
00057  *     right brace   }      (x7D)
00058  *     tilde         ~      (x7E)
00059  *
00060  * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
00061  * accents of some form or another. There are predefined combinations
00062  * for certain characters, but they can also be used arbitrarily. The
00063  * table at dkuug.dk maps these accents to the E000 "private use" range
00064  * of the Unicode space, but I believe they more properly belong in the
00065  * 0300 range (non-spacing accents). The transformation is complicated
00066  * slightly because Unicode wants the non-spacing character to follow
00067  * the base character, while T.61 has the non-spacing character leading.
00068  * Also, T.61 specifically recognizes certain combined pairs as "characters"
00069  * but doesn't specify how to treat unrecognized pairs. This code will
00070  * always attempt to combine pairs when a known Unicode composite exists.
00071  */
00072 
/* Forward map, indexed by T.61 byte value (0x00-0xff): t61_tab[b] is the
 * Unicode code point for byte b.  A zero entry marks a byte with no
 * mapping; ldap_t61s_valid() and ldap_t61s_to_utf8s() treat such bytes
 * (including 0x00 itself) as invalid.  The entries for the non-spacing
 * accents 0xc1-0xcf hold Unicode combining marks in the 0x3xx range.
 */
00073 static const wchar_t t61_tab[] = {
00074        0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
00075        0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
00076        0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
00077        0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
00078        0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
00079        0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
00080        0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
00081        0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
00082        0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
00083        0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
00084        0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
00085        0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
00086        0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
00087        0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
00088        0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
00089        0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
00090        0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
00091        0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
00092        0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
00093        0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
00094        0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
00095        0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
00096        0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
00097        0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
00098        0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
00099        0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
00100        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
00101        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
00102        0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
00103        0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
00104        0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
00105        0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
00106 };
00107 
/* Fixed-length code point vectors used as the row types of the lookup
 * tables in this file.
 */
00108 typedef wchar_t wvec16[16];
00109 typedef wchar_t wvec32[32];
00110 typedef wchar_t wvec64[64];
00111 
00112 /* Substitutions when 0xc1-0xcf appears by itself or with space 0x20.
 * Indexed by the low nibble of the accent byte; the value is the spacing
 * Unicode form of the accent.  A zero entry means no spacing substitute
 * is defined, and the converter then falls through to its other cases.
 */
00113 static const wvec16 accents = {
00114        0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
00115        0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
00116 
00117 /* In the following tables, base characters commented in (parentheses)
00118  * are not defined by T.61 but are mapped anyway since their Unicode
00119  * composite exists.
00120  */
00121 
00122 /* Grave accented chars AEIOU (NWY).
 * c1_grave is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f).  A NULL row or zero entry means no precomposed
 * Unicode character is listed for that base letter with this accent.
 */
00123 static const wvec32 c1_vec1 = {
00124        /* Upper case */
00125        0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
00126        0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
00127 static const wvec32 c1_vec2 = {
00128        /* Lower case */
00129        0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
00130        0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};
00131        
00132 static const wvec32 *c1_grave[] = {
00133        NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
00134 };
00135 
00136 /* Acute accented chars AEIOUYCLNRSZ (GKMPW).
 * c2_acute is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 * Row 7 (bytes 0xe0-0xff) handles the T.61 AE (0xe1) and ae (0xf1) bases.
 */
00137 static const wvec32 c2_vec1 = {
00138        /* Upper case */
00139        0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
00140        0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
00141        0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
00142        0, 0xdd, 0x179, 0, 0, 0, 0, 0};
00143 static const wvec32 c2_vec2 = {
00144        /* Lower case */
00145        0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
00146        0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
00147        0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
00148        0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
00149 static const wvec32 c2_vec3 = {
00150        /* (AE and ae) */
00151        0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00152        0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
00153 
00154 static const wvec32 *c2_acute[] = {
00155        NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
00156 };
00157 
00158 /* Circumflex AEIOUYCGHJSW (Z).
 * c3_circumflex is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 */
00159 static const wvec32 c3_vec1 = {
00160        /* Upper case */
00161        0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
00162        0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
00163        0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
00164        0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
00165 static const wvec32 c3_vec2 = {
00166        /* Lower case */
00167        0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
00168        0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
00169        0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
00170        0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
00171 static const wvec32 *c3_circumflex[] = {
00172        NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
00173 };
00174 
00175 /* Tilde AIOUN (EVY).
 * c4_tilde is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 */
00176 static const wvec32 c4_vec1 = {
00177        /* Upper case */
00178        0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
00179        0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
00180 static const wvec32 c4_vec2 = {
00181        /* Lower case */
00182        0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
00183        0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
00184 static const wvec32 *c4_tilde[] = {
00185        NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
00186 };
00187 
00188 /* Macron AEIOU (YG).
 * c5_macron is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 * Row 7 (bytes 0xe0-0xff) handles the T.61 AE (0xe1) and ae (0xf1) bases.
 */
00189 static const wvec32 c5_vec1 = {
00190        /* Upper case */
00191        0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
00192        0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
00193 static const wvec32 c5_vec2 = {
00194        /* Lower case */
00195        0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
00196        0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
00197 static const wvec32 c5_vec3 = {
00198        /* (AE and ae) */
00199        0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00200        0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
00201 static const wvec32 *c5_macron[] = {
00202        NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
00203 };
00204 
00205 /* Breve AUG (EIO).
 * c6_breve is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 */
00206 static const wvec32 c6_vec1 = {
00207        /* Upper case */
00208        0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
00209        0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
00210 static const wvec32 c6_vec2 = {
00211        /* Lower case */
00212        0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
00213        0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
00214 static const wvec32 *c6_breve[] = {
00215        NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
00216 };
00217 
00218 /* Dot Above CEGIZ (AOBDFHMNPRSTWXY).
 * c7_dotabove is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 * Note the lower-case vector has no entry for 'i'.
 */
00219 static const wvec32 c7_vec1 = {
00220        /* Upper case */
00221        0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
00222        0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
00223        0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
00224        0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
00225 static const wvec32 c7_vec2 = {
00226        /* Lower case */
00227        0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
00228        0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
00229        0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
00230        0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
00231 static const wvec32 *c7_dotabove[] = {
00232        NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
00233 };
00234 
00235 /* Diaeresis AEIOUY (HWXt).
 * c8_diaeresis is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 * 't' appears only in the lower-case vector.
 */
00236 static const wvec32 c8_vec1 = {
00237        /* Upper case */
00238        0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
00239        0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
00240 static const wvec32 c8_vec2 = {
00241        /* Lower case */
00242        0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
00243        0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
00244 static const wvec32 *c8_diaeresis[] = {
00245        NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
00246 };
00247 
00248 /* Ring Above AU (wy).
 * ca_ringabove is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 * 'w' and 'y' appear only in the lower-case vector.
 */
00249 static const wvec32 ca_vec1 = {
00250        /* Upper case */
00251        0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00252        0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
00253 static const wvec32 ca_vec2 = {
00254        /* Lower case */
00255        0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00256        0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
00257 static const wvec32 *ca_ringabove[] = {
00258        NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
00259 };
00260 
00261 /* Cedilla CGKLNRST (EDH).
 * cb_cedilla is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 */
00262 static const wvec32 cb_vec1 = {
00263        /* Upper case */
00264        0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
00265        0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
00266        0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
00267 static const wvec32 cb_vec2 = {
00268        /* Lower case */
00269        0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
00270        0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
00271        0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
00272 static const wvec32 *cb_cedilla[] = {
00273        NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
00274 };
00275 
00276 /* Double Acute Accent OU.
 * cd_doubleacute is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 */
00277 static const wvec32 cd_vec1 = {
00278        /* Upper case */
00279        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
00280        0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
00281 static const wvec32 cd_vec2 = {
00282        /* Lower case */
00283        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
00284        0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
00285 static const wvec32 *cd_doubleacute[] = {
00286        NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
00287 };
00288 
00289 /* Ogonek AEIU (O).
 * ce_ogonek is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 */
00290 static const wvec32 ce_vec1 = {
00291        /* Upper case */
00292        0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
00293        0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
00294 static const wvec32 ce_vec2 = {
00295        /* Lower case */
00296        0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
00297        0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
00298 static const wvec32 *ce_ogonek[] = {
00299        NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
00300 };
00301 
00302 /* Caron CDELNRSTZ (AIOUGKjH).
 * cf_caron is indexed by (base byte >> 5) and each vector by
 * (base byte & 0x1f); zero/NULL means no precomposed character.
 * 'j' appears only in the lower-case vector.
 */
00303 static const wvec32 cf_vec1 = {
00304        /* Upper case */
00305        0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
00306        0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
00307        0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
00308        0, 0, 0x17d, 0, 0, 0, 0, 0};
00309 static const wvec32 cf_vec2 = {
00310        /* Lower case */
00311        0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
00312        0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
00313        0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
00314        0, 0, 0x17e, 0, 0, 0, 0, 0};
00315 static const wvec32 *cf_caron[] = {
00316        NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
00317 };
00318 
/* Composite lookup, indexed by the low nibble of the T.61 accent byte
 * (0xc0-0xcf).  A NULL slot means no composite tables exist for that
 * accent byte.  Each non-NULL slot is an 8-entry array indexed by
 * (base byte >> 5), whose vectors are indexed by (base byte & 0x1f).
 */
00319 static const wvec32 **cx_tab[] = {
00320        NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
00321        c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
00322        cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
00323 
00324 int ldap_t61s_valid( struct berval *str )
00325 {
00326        unsigned char *c = (unsigned char *)str->bv_val;
00327        int i;
00328 
00329        for (i=0; i < str->bv_len; c++,i++)
00330               if (!t61_tab[*c])
00331                      return 0;
00332        return 1;
00333 }
00334 
00335 /* Transform a T.61 string to UTF-8.
00336  */
00337 int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
00338 {
00339        unsigned char *c;
00340        char *d;
00341        int i, wlen = 0;
00342 
00343        /* Just count the length of the UTF-8 result first */
00344        for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
00345               /* Invalid T.61 characters? */
00346               if (!t61_tab[*c]) 
00347                      return LDAP_INVALID_SYNTAX;
00348               if ((*c & 0xf0) == 0xc0) {
00349                      int j = *c & 0x0f;
00350                      /* If this is the end of the string, or if the base
00351                       * character is just a space, treat this as a regular
00352                       * spacing character.
00353                       */
00354                      if ((!c[1] || c[1] == 0x20) && accents[j]) {
00355                             wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
00356                      } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
00357                      /* We have a composite mapping for this pair */
00358                             (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
00359                             wlen += ldap_x_wc_to_utf8( NULL,
00360                                    (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
00361                      } else {
00362                      /* No mapping, just swap it around so the base
00363                       * character comes first.
00364                       */
00365                             wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
00366                             wlen += ldap_x_wc_to_utf8(NULL,
00367                                    t61_tab[*c], 0);
00368                      }
00369                      c++; i++;
00370                      continue;
00371               } else {
00372                      wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
00373               }
00374        }
00375 
00376        /* Now transform the string */
00377        dst->bv_len = wlen;
00378        dst->bv_val = LDAP_MALLOC( wlen+1 );
00379        d = dst->bv_val;
00380        if (!d)
00381               return LDAP_NO_MEMORY;
00382 
00383        for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
00384               if ((*c & 0xf0) == 0xc0) {
00385                      int j = *c & 0x0f;
00386                      /* If this is the end of the string, or if the base
00387                       * character is just a space, treat this as a regular
00388                       * spacing character.
00389                       */
00390                      if ((!c[1] || c[1] == 0x20) && accents[j]) {
00391                             d += ldap_x_wc_to_utf8(d, accents[j], 6);
00392                      } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
00393                      /* We have a composite mapping for this pair */
00394                             (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
00395                             d += ldap_x_wc_to_utf8(d, 
00396                             (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
00397                      } else {
00398                      /* No mapping, just swap it around so the base
00399                       * character comes first.
00400                       */
00401                             d += ldap_x_wc_to_utf8(d, c[1], 6);
00402                             d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
00403                      }
00404                      c++; i++;
00405                      continue;
00406               } else {
00407                      d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
00408               }
00409        }
00410        *d = '\0';
00411        return LDAP_SUCCESS;
00412 }
00413 
00414 /* For the reverse mapping, we just pay attention to the Latin-oriented
00415  * code blocks. These are
00416  *     0000 - 007f Basic Latin
00417  *     0080 - 00ff Latin-1 Supplement
00418  *     0100 - 017f Latin Extended-A
00419  *     0180 - 024f Latin Extended-B
00420  *     1e00 - 1eff Latin Extended Additional
00421  *
00422  * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
00423  * unrecognized characters are replaced with '?' 0x3f.
00424  */
00425 
/* Reverse map for U+0000-U+003F, indexed by (code point & 0x3f); each
 * entry is the T.61 byte to emit.  Identity except '#' (U+0023), which
 * becomes T.61 0xa6, and '$' (U+0024), which becomes T.61 0xa4.
 */
00426 static const wvec64 u000 = {
00427        0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
00428        0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
00429        0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
00430        0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
00431        0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
00432        0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
00433        0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
00434        0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};
00435 
00436 /* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
00437  * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
00438  * on their own, even though it provides them as combiners for other
00439  * letters. T.61 doesn't define these pairings either, so this may just
00440  * have to be replaced with '?' 0x3f if other software can't cope with it.
00441  */
/* Reverse map for U+0040-U+007F, indexed by (code point & 0x3f).
 * An entry with a non-zero high byte is written as two T.61 bytes,
 * high byte first (e.g. 0xc320 is accent 0xc3 followed by space 0x20);
 * 0x003f is the '?' replacement for characters T.61 lacks.
 */
00442 static const wvec64 u001 = {
00443        0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
00444        0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
00445        0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
00446        0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
00447        0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
00448        0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
00449        0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
00450        0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};
00451 
/* Reverse map for U+0080-U+00BF, indexed by (code point & 0x3f).
 * Two-byte entries (non-zero high byte) are an accent byte followed by
 * space 0x20, used for the spacing accents in this range; 0x003f is
 * the '?' replacement.
 */
00452 static const wvec64 u002 = {
00453        0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
00454        0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
00455        0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
00456        0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
00457        0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
00458        0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
00459        0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
00460        0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};
00461 
/* Reverse map for U+00C0-U+00FF, indexed by (code point & 0x3f).
 * Two-byte entries are T.61 accent byte (high) + base letter (low);
 * single-byte entries are direct T.61 codes.
 */
00462 static const wvec64 u003 = {
00463        0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
00464        0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
00465        0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
00466        0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
00467        0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
00468        0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
00469        0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
00470        0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};
00471 
00472 /* These codes are used here but not defined by T.61:
00473  * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
00474  */
/* Reverse map for U+0100-U+013F, indexed by (code point & 0x3f).
 * Two-byte entries are T.61 accent byte (high) + base letter (low);
 * 0x003f is the '?' replacement.
 */
00475 static const wvec64 u010 = {
00476        0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
00477        0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
00478        0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
00479        0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
00480        0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
00481        0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
00482        0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
00483        0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};
00484 
00485 /* These codes are used here but not defined by T.61:
00486  * x14e = xc6/x4f, x14f = xc6/x6f
00487  */
/* Reverse map for U+0140-U+017F, indexed by (code point & 0x3f).
 * Two-byte entries are T.61 accent byte (high) + base letter (low);
 * 0x003f is the '?' replacement.
 */
00488 static const wvec64 u011 = {
00489        0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
00490        0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
00491        0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
00492        0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
00493        0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
00494        0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
00495        0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
00496        0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};
00497 
00498 /* All of the codes in this block are undefined in T.61.
00499  */
/* Reverse map for U+01C0-U+01FF, indexed by (code point & 0x3f).
 * Two-byte entries are T.61 accent byte (high) + base byte (low); the
 * low bytes 0xe1/0xf1 are the T.61 AE/ae letters.  0x003f is '?'.
 */
00500 static const wvec64 u013 = {
00501        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00502        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
00503        0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f, 
00504        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00505        0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
00506        0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
00507        0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
00508        0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};
00509 
00510 /* All of the codes in this block are undefined in T.61.
00511  */
/* Reverse map for U+0200-U+023F, indexed by (code point & 0x3f).
 * Two-byte entries are T.61 accent byte (high) + base letter (low);
 * 0x003f is the '?' replacement.
 */
00512 static const wvec64 u020 = {
00513        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00514        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00515        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00516        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
00517        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
00518        0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
00519        0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
00520        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
00521 
/* Reverse map for U+02C0-U+02FF, indexed by (code point & 0x3f).
 * The spacing accents in this range become a T.61 accent byte followed
 * by space 0x20; everything else is replaced by '?' (0x003f).
 */
00522 static const wvec64 u023 = {
00523        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
00524        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00525        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00526        0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
00527        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00528        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00529        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00530        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
00531 
00532 /* These are the non-spacing characters by themselves. They should
00533  * never appear by themselves in actual text.
00534  */
/* Reverse map for U+0300-U+033F (combining marks), indexed by
 * (code point & 0x3f).  Each mapped entry is the single T.61 accent
 * byte; ldap_utf8s_to_t61s() places it before the previously written
 * byte.  0x003f marks a combining mark with no T.61 equivalent.
 */
00535 static const wvec64 u030 = {
00536        0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
00537        0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
00538        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00539        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00540        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
00541        0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00542        0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00543        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
00544 
00545 /* None of the following blocks are defined in T.61.
00546  */
/* Reverse map for U+1E00-U+1E3F, indexed by (code point & 0x3f).
 * Two-byte entries are T.61 accent byte (high) + base letter (low);
 * 0x003f is the '?' replacement.
 */
00547 static const wvec64 u1e0 = {
00548        0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f, 
00549        0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
00550        0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00551        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
00552        0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
00553        0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00554        0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00555        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
00556 };
00557 
/* Reverse map for U+1E40-U+1E7F, indexed by (code point & 0x3f).
 * Two-byte entries are T.61 accent byte (high) + base letter (low);
 * 0x003f is the '?' replacement.
 */
00558 static const wvec64 u1e1 = {
00559        0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
00560        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00561        0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
00562        0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00563        0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00564        0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
00565        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00566        0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
00567 };
00568 
/* Reverse map for U+1E80-U+1EBF, indexed by (code point & 0x3f).
 * Two-byte entries are T.61 accent byte (high) + base letter (low);
 * 0x003f is the '?' replacement.
 */
00569 static const wvec64 u1e2 = {
00570        0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
00571        0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
00572        0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
00573        0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00574        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00575        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00576        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00577        0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
00578 };
00579 
/* Reverse map for U+1EC0-U+1EFF, indexed by (code point & 0x3f).
 * Only the accented Y/y entries are mapped (accent byte + base letter);
 * everything else is the '?' replacement 0x003f.
 */
00580 static const wvec64 u1e3 = {
00581        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00582        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00583        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00584        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00585        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00586        0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00587        0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
00588        0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00589 };
00590 
/* Reverse-map directory for U+0000-U+03FF, indexed by (code point >> 6).
 * Each slot points at the 64-entry table for that block; a NULL slot
 * means the whole block is unmapped and is replaced by '?' (0x3f).
 */
00591 static const wvec64 *wc00[] = {
00592        &u000, &u001, &u002, &u003,
00593        &u010, &u011, NULL, &u013,
00594        &u020, NULL, NULL, &u023,
00595        &u030, NULL, NULL, NULL};
00596 
/* Reverse-map directory for U+1E00-U+1EFF, indexed by
 * ((code point >> 6) & 3); every slot is populated.
 */
00597 static const wvec64 *wc1e[] = {
00598        &u1e0, &u1e1, &u1e2, &u1e3};
00599 
00600 
00601 int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
00602 {
00603        char *c, *d;
00604        wchar_t tmp;
00605        int i, j, tlen = 0;
00606 
00607        /* Just count the length of the T.61 result first */
00608        for (i=0,c=src->bv_val; i < src->bv_len;) {
00609               j = ldap_x_utf8_to_wc( &tmp, c );
00610               if (j == -1)
00611                      return LDAP_INVALID_SYNTAX;
00612               switch (tmp >> 8) {
00613               case 0x00:
00614               case 0x01:
00615               case 0x02:
00616               case 0x03:
00617                      if (wc00[tmp >> 6] &&
00618                             ((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
00619                             tlen++;
00620                      }
00621                      tlen++;
00622                      break;
00623               case 0x1e:
00624                      if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
00625                             tlen++;
00626                      }
00627               case 0x21:
00628               default:
00629                      tlen ++;
00630                      break;
00631               }
00632               i += j;
00633               c += j;
00634        }
00635        dst->bv_len = tlen;
00636        dst->bv_val = LDAP_MALLOC( tlen+1 );
00637        if (!dst->bv_val)
00638               return LDAP_NO_MEMORY;
00639        
00640        d = dst->bv_val;
00641        for (i=0,c=src->bv_val; i < src->bv_len;) {
00642               j = ldap_x_utf8_to_wc( &tmp, c );
00643               switch (tmp >> 8) {
00644               case 0x00:
00645               case 0x01:
00646               case 0x02:
00647                      if (wc00[tmp >> 6]) {
00648                             tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
00649                             if (tmp & 0xff00)
00650                                    *d++ = (tmp >> 8);
00651                             *d++ = tmp & 0xff;
00652                      } else {
00653                             *d++ = 0x3f;
00654                      }
00655                      break;
00656               case 0x03:
00657                      /* swap order of non-spacing characters */
00658                      if (wc00[tmp >> 6]) {
00659                             wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
00660                             if (t2 != 0x3f) {
00661                                    d[0] = d[-1];
00662                                    d[-1] = t2;
00663                                    d++;
00664                             } else {
00665                                    *d++ = 0x3f;
00666                             }
00667                      } else {
00668                             *d++ = 0x3f;
00669                      }
00670                      break;
00671               case 0x1e:
00672                      tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];
00673                      if (tmp & 0xff00)
00674                             *d++ = (tmp >> 8);
00675                      *d++ = tmp & 0xff;
00676                      break;
00677               case 0x21:
00678                      if (tmp == 0x2126) {
00679                             *d++ = 0xe0;
00680                             break;
00681                      }
00682                      /* FALLTHRU */
00683               default:
00684                      *d++ = 0x3f;
00685                      break;
00686               }
00687               i += j;
00688               c += j;
00689        }
00690        *d = '\0';
00691        return LDAP_SUCCESS;
00692 }