Back to index

openldap  2.4.31
ucgendat.c
Go to the documentation of this file.
00001 /* $OpenLDAP$ */
00002 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
00003  *
00004  * Copyright 1998-2012 The OpenLDAP Foundation.
00005  * All rights reserved.
00006  *
00007  * Redistribution and use in source and binary forms, with or without
00008  * modification, are permitted only as authorized by the OpenLDAP
00009  * Public License.
00010  *
00011  * A copy of this license is available in file LICENSE in the
00012  * top-level directory of the distribution or, alternatively, at
00013  * <http://www.OpenLDAP.org/license.html>.
00014  */
00015 /* Copyright 2001 Computing Research Labs, New Mexico State University
00016  *
00017  * Permission is hereby granted, free of charge, to any person obtaining a
00018  * copy of this software and associated documentation files (the "Software"),
00019  * to deal in the Software without restriction, including without limitation
00020  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
00021  * and/or sell copies of the Software, and to permit persons to whom the
00022  * Software is furnished to do so, subject to the following conditions:
00023  *
00024  * The above copyright notice and this permission notice shall be included in
00025  * all copies or substantial portions of the Software.
00026  *
00027  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00028  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00029  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
00030  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
00031  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
00032  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
00033  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00034  */
00035 /* $Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */
00036 
00037 #include "portable.h"
00038 #include "ldap_config.h"
00039 
00040 #include <stdio.h>
00041 #include <ac/ctype.h>
00042 #include <ac/stdlib.h>
00043 #include <ac/string.h>
00044 #include <ac/unistd.h>
00045 
00046 #include <ac/bytes.h>
00047 
00048 #include <lutil.h>
00049 
00050 #ifndef HARDCODE_DATA
00051 #define       HARDCODE_DATA 1
00052 #endif
00053 
/*
 * Hexadecimal digit test ('0'-'9', 'A'-'F', 'a'-'f').  The comparison is
 * done in a helper function so that the macro argument is evaluated exactly
 * once, which makes expressions with side effects such as ishdigit(*s++)
 * safe.
 */
static int
uc_is_hex_digit(int cc)
{
    return (cc >= '0' && cc <= '9') ||
           (cc >= 'A' && cc <= 'F') ||
           (cc >= 'a' && cc <= 'f');
}

#undef ishdigit
#define ishdigit(cc) uc_is_hex_digit(cc)
00058 
00059 /*
00060  * A header written to the output file with the byte-order-mark and the number
00061  * of property nodes.
00062  */
00063 static ac_uint2 hdr[2] = {0xfeff, 0};
00064 
00065 #define NUMPROPS 50
00066 #define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))
00067 
00068 typedef struct {
00069     char *name;
00070     int len;
00071 } _prop_t;
00072 
00073 /*
00074  * List of properties expected to be found in the Unicode Character Database
00075  * including some implementation specific properties.
00076  *
00077  * The implementation specific properties are:
00078  * Cm = Composed (can be decomposed)
00079  * Nb = Non-breaking
00080  * Sy = Symmetric (has left and right forms)
00081  * Hd = Hex digit
00082  * Qm = Quote marks
00083  * Mr = Mirroring
00084  * Ss = Space, other
00085  * Cp = Defined character
00086  */
00087 static _prop_t props[NUMPROPS] = {
00088     {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
00089     {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
00090     {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
00091     {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
00092     {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
00093     {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B",  1},
00094     {"S",  1}, {"WS", 2}, {"ON", 2},
00095     {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
00096     {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2}
00097 };
00098 
00099 typedef struct {
00100     ac_uint4 *ranges;
00101     ac_uint2 used;
00102     ac_uint2 size;
00103 } _ranges_t;
00104 
00105 static _ranges_t proptbl[NUMPROPS];
00106 
00107 /*
00108  * Make sure this array is sized to be on a 4-byte boundary at compile time.
00109  */
00110 static ac_uint2 propcnt[NEEDPROPS];
00111 
00112 /*
00113  * Array used to collect a decomposition before adding it to the decomposition
00114  * table.
00115  */
00116 static ac_uint4 dectmp[64];
00117 static ac_uint4 dectmp_size;
00118 
00119 typedef struct {
00120     ac_uint4 code;
00121     ac_uint2 size;
00122     ac_uint2 used;
00123     ac_uint4 *decomp;
00124 } _decomp_t;
00125 
00126 /*
00127  * List of decomposition.  Created and expanded in order as the characters are
00128  * encountered. First list contains canonical mappings, second also includes
00129  * compatibility mappings.
00130  */
00131 static _decomp_t *decomps;
00132 static ac_uint4 decomps_used;
00133 static ac_uint4 decomps_size;
00134 
00135 static _decomp_t *kdecomps;
00136 static ac_uint4 kdecomps_used;
00137 static ac_uint4 kdecomps_size;
00138 
00139 /*
00140  * Composition exclusion table stuff.
00141  */
00142 #define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31)))
00143 #define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31)))
00144 static ac_uint4 compexs[8192];
00145 
00146 /*
00147  * Struct for holding a composition pair, and array of composition pairs
00148  */
00149 typedef struct {
00150     ac_uint4 comp;
00151     ac_uint4 count;
00152     ac_uint4 code1;
00153     ac_uint4 code2;
00154 } _comp_t;
00155 
00156 static _comp_t *comps;
00157 static ac_uint4 comps_used;
00158 
00159 /*
00160  * Types and lists for handling lists of case mappings.
00161  */
00162 typedef struct {
00163     ac_uint4 key;
00164     ac_uint4 other1;
00165     ac_uint4 other2;
00166 } _case_t;
00167 
00168 static _case_t *upper;
00169 static _case_t *lower;
00170 static _case_t *title;
00171 static ac_uint4 upper_used;
00172 static ac_uint4 upper_size;
00173 static ac_uint4 lower_used;
00174 static ac_uint4 lower_size;
00175 static ac_uint4 title_used;
00176 static ac_uint4 title_size;
00177 
00178 /*
00179  * Array used to collect case mappings before adding them to a list.
00180  */
00181 static ac_uint4 cases[3];
00182 
00183 /*
00184  * An array to hold ranges for combining classes.
00185  */
00186 static ac_uint4 *ccl;
00187 static ac_uint4 ccl_used;
00188 static ac_uint4 ccl_size;
00189 
00190 /*
00191  * Structures for handling numbers.
00192  */
00193 typedef struct {
00194     ac_uint4 code;
00195     ac_uint4 idx;
00196 } _codeidx_t;
00197 
00198 typedef struct {
00199     short numerator;
00200     short denominator;
00201 } _num_t;
00202 
00203 /*
00204  * Arrays to hold the mapping of codes to numbers.
00205  */
00206 static _codeidx_t *ncodes;
00207 static ac_uint4 ncodes_used;
00208 static ac_uint4 ncodes_size;
00209 
00210 static _num_t *nums;
00211 static ac_uint4 nums_used;
00212 static ac_uint4 nums_size;
00213 
00214 /*
00215  * Array for holding numbers.
00216  */
00217 static _num_t *nums;
00218 static ac_uint4 nums_used;
00219 static ac_uint4 nums_size;
00220 
00221 static void
00222 add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2)
00223 {
00224     int i, j, k, len;
00225     _ranges_t *rlp;
00226     char *name;
00227 
00228     for (k = 0; k < 2; k++) {
00229         if (k == 0) {
00230             name = p1;
00231             len = 2;
00232         } else {
00233             if (p2 == 0)
00234               break;
00235 
00236             name = p2;
00237             len = 1;
00238         }
00239 
00240         for (i = 0; i < NUMPROPS; i++) {
00241             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
00242               break;
00243         }
00244 
00245         if (i == NUMPROPS)
00246           continue;
00247 
00248         rlp = &proptbl[i];
00249 
00250         /*
00251          * Resize the range list if necessary.
00252          */
00253         if (rlp->used == rlp->size) {
00254             if (rlp->size == 0)
00255               rlp->ranges = (ac_uint4 *)
00256                   malloc(sizeof(ac_uint4) << 3);
00257             else
00258               rlp->ranges = (ac_uint4 *)
00259                   realloc((char *) rlp->ranges,
00260                           sizeof(ac_uint4) * (rlp->size + 8));
00261             rlp->size += 8;
00262         }
00263 
00264         /*
00265          * If this is the first code for this property list, just add it
00266          * and return.
00267          */
00268         if (rlp->used == 0) {
00269             rlp->ranges[0] = start;
00270             rlp->ranges[1] = end;
00271             rlp->used += 2;
00272             continue;
00273         }
00274 
00275         /*
00276          * Optimize the case of adding the range to the end.
00277          */
00278         j = rlp->used - 1;
00279         if (start > rlp->ranges[j]) {
00280             j = rlp->used;
00281             rlp->ranges[j++] = start;
00282             rlp->ranges[j++] = end;
00283             rlp->used = j;
00284             continue;
00285         }
00286 
00287         /*
00288          * Need to locate the insertion point.
00289          */
00290         for (i = 0;
00291              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
00292 
00293         /*
00294          * If the start value lies in the current range, then simply set the
00295          * new end point of the range to the end value passed as a parameter.
00296          */
00297         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
00298             rlp->ranges[i + 1] = end;
00299             return;
00300         }
00301 
00302         /*
00303          * Shift following values up by two.
00304          */
00305         for (j = rlp->used; j > i; j -= 2) {
00306             rlp->ranges[j] = rlp->ranges[j - 2];
00307             rlp->ranges[j + 1] = rlp->ranges[j - 1];
00308         }
00309 
00310         /*
00311          * Add the new range at the insertion point.
00312          */
00313         rlp->ranges[i] = start;
00314         rlp->ranges[i + 1] = end;
00315         rlp->used += 2;
00316     }
00317 }
00318 
00319 static void
00320 ordered_range_insert(ac_uint4 c, char *name, int len)
00321 {
00322     int i, j;
00323     ac_uint4 s, e;
00324     _ranges_t *rlp;
00325 
00326     if (len == 0)
00327       return;
00328 
00329     /*
00330      * Deal with directionality codes introduced in Unicode 3.0.
00331      */
00332     if ((len == 2 && memcmp(name, "BN", 2) == 0) ||
00333         (len == 3 &&
00334          (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
00335           memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
00336           memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) {
00337         /*
00338          * Mark all of these as Other Neutral to preserve compatibility with
00339          * older versions.
00340          */
00341         len = 2;
00342         name = "ON";
00343     }
00344 
00345     for (i = 0; i < NUMPROPS; i++) {
00346         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
00347           break;
00348     }
00349 
00350     if (i == NUMPROPS)
00351       return;
00352 
00353     /*
00354      * Have a match, so insert the code in order.
00355      */
00356     rlp = &proptbl[i];
00357 
00358     /*
00359      * Resize the range list if necessary.
00360      */
00361     if (rlp->used == rlp->size) {
00362         if (rlp->size == 0)
00363           rlp->ranges = (ac_uint4 *)
00364               malloc(sizeof(ac_uint4) << 3);
00365         else
00366           rlp->ranges = (ac_uint4 *)
00367               realloc((char *) rlp->ranges,
00368                       sizeof(ac_uint4) * (rlp->size + 8));
00369         rlp->size += 8;
00370     }
00371 
00372     /*
00373      * If this is the first code for this property list, just add it
00374      * and return.
00375      */
00376     if (rlp->used == 0) {
00377         rlp->ranges[0] = rlp->ranges[1] = c;
00378         rlp->used += 2;
00379         return;
00380     }
00381 
00382     /*
00383      * Optimize the cases of extending the last range and adding new ranges to
00384      * the end.
00385      */
00386     j = rlp->used - 1;
00387     e = rlp->ranges[j];
00388     s = rlp->ranges[j - 1];
00389 
00390     if (c == e + 1) {
00391         /*
00392          * Extend the last range.
00393          */
00394         rlp->ranges[j] = c;
00395         return;
00396     }
00397 
00398     if (c > e + 1) {
00399         /*
00400          * Start another range on the end.
00401          */
00402         j = rlp->used;
00403         rlp->ranges[j] = rlp->ranges[j + 1] = c;
00404         rlp->used += 2;
00405         return;
00406     }
00407 
00408     if (c >= s)
00409       /*
00410        * The code is a duplicate of a code in the last range, so just return.
00411        */
00412       return;
00413 
00414     /*
00415      * The code should be inserted somewhere before the last range in the
00416      * list.  Locate the insertion point.
00417      */
00418     for (i = 0;
00419          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
00420 
00421     s = rlp->ranges[i];
00422     e = rlp->ranges[i + 1];
00423 
00424     if (c == e + 1)
00425       /*
00426        * Simply extend the current range.
00427        */
00428       rlp->ranges[i + 1] = c;
00429     else if (c < s) {
00430         /*
00431          * Add a new entry before the current location.  Shift all entries
00432          * before the current one up by one to make room.
00433          */
00434         for (j = rlp->used; j > i; j -= 2) {
00435             rlp->ranges[j] = rlp->ranges[j - 2];
00436             rlp->ranges[j + 1] = rlp->ranges[j - 1];
00437         }
00438         rlp->ranges[i] = rlp->ranges[i + 1] = c;
00439 
00440         rlp->used += 2;
00441     }
00442 }
00443 
00444 static void
00445 add_decomp(ac_uint4 code, short compat)
00446 {
00447     ac_uint4 i, j, size;
00448     _decomp_t **pdecomps;
00449     ac_uint4 *pdecomps_used;
00450     ac_uint4 *pdecomps_size;
00451 
00452     if (compat) {
00453        pdecomps = &kdecomps;
00454        pdecomps_used = &kdecomps_used;
00455        pdecomps_size = &kdecomps_size;
00456     } else {
00457        pdecomps = &decomps;
00458        pdecomps_used = &decomps_used;
00459        pdecomps_size = &decomps_size;
00460     }
00461     
00462     /*
00463      * Add the code to the composite property.
00464      */
00465     if (!compat) {
00466        ordered_range_insert(code, "Cm", 2);
00467     }
00468 
00469     /*
00470      * Locate the insertion point for the code.
00471      */
00472     for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ;
00473 
00474     /*
00475      * Allocate space for a new decomposition.
00476      */
00477     if (*pdecomps_used == *pdecomps_size) {
00478         if (*pdecomps_size == 0)
00479           *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
00480         else
00481           *pdecomps = (_decomp_t *)
00482               realloc((char *) *pdecomps,
00483                       sizeof(_decomp_t) * (*pdecomps_size + 8));
00484         (void) memset((char *) (*pdecomps + *pdecomps_size), '\0',
00485                       sizeof(_decomp_t) << 3);
00486         *pdecomps_size += 8;
00487     }
00488 
00489     if (i < *pdecomps_used && code != (*pdecomps)[i].code) {
00490         /*
00491          * Shift the decomps up by one if the codes don't match.
00492          */
00493         for (j = *pdecomps_used; j > i; j--)
00494           (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1],
00495                         sizeof(_decomp_t));
00496     }
00497 
00498     /*
00499      * Insert or replace a decomposition.
00500      */
00501     size = dectmp_size + (4 - (dectmp_size & 3));
00502     if ((*pdecomps)[i].size < size) {
00503         if ((*pdecomps)[i].size == 0)
00504           (*pdecomps)[i].decomp = (ac_uint4 *)
00505               malloc(sizeof(ac_uint4) * size);
00506         else
00507           (*pdecomps)[i].decomp = (ac_uint4 *)
00508               realloc((char *) (*pdecomps)[i].decomp,
00509                       sizeof(ac_uint4) * size);
00510         (*pdecomps)[i].size = size;
00511     }
00512 
00513     if ((*pdecomps)[i].code != code)
00514       (*pdecomps_used)++;
00515 
00516     (*pdecomps)[i].code = code;
00517     (*pdecomps)[i].used = dectmp_size;
00518     (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp,
00519                   sizeof(ac_uint4) * dectmp_size);
00520 
00521     /*
00522      * NOTICE: This needs changing later so it is more general than simply
00523      * pairs.  This calculation is done here to simplify allocation elsewhere.
00524      */
00525     if (!compat && dectmp_size == 2)
00526       comps_used++;
00527 }
00528 
00529 static void
00530 add_title(ac_uint4 code)
00531 {
00532     ac_uint4 i, j;
00533 
00534     /*
00535      * Always map the code to itself.
00536      */
00537     cases[2] = code;
00538 
00539     if (title_used == title_size) {
00540         if (title_size == 0)
00541           title = (_case_t *) malloc(sizeof(_case_t) << 3);
00542         else
00543           title = (_case_t *) realloc((char *) title,
00544                                       sizeof(_case_t) * (title_size + 8));
00545         title_size += 8;
00546     }
00547 
00548     /*
00549      * Locate the insertion point.
00550      */
00551     for (i = 0; i < title_used && code > title[i].key; i++) ;
00552 
00553     if (i < title_used) {
00554         /*
00555          * Shift the array up by one.
00556          */
00557         for (j = title_used; j > i; j--)
00558           (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
00559                         sizeof(_case_t));
00560     }
00561 
00562     title[i].key = cases[2];    /* Title */
00563     title[i].other1 = cases[0]; /* Upper */
00564     title[i].other2 = cases[1]; /* Lower */
00565 
00566     title_used++;
00567 }
00568 
00569 static void
00570 add_upper(ac_uint4 code)
00571 {
00572     ac_uint4 i, j;
00573 
00574     /*
00575      * Always map the code to itself.
00576      */
00577     cases[0] = code;
00578 
00579     /*
00580      * If the title case character is not present, then make it the same as
00581      * the upper case.
00582      */
00583     if (cases[2] == 0)
00584       cases[2] = code;
00585 
00586     if (upper_used == upper_size) {
00587         if (upper_size == 0)
00588           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
00589         else
00590           upper = (_case_t *) realloc((char *) upper,
00591                                       sizeof(_case_t) * (upper_size + 8));
00592         upper_size += 8;
00593     }
00594 
00595     /*
00596      * Locate the insertion point.
00597      */
00598     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
00599 
00600     if (i < upper_used) {
00601         /*
00602          * Shift the array up by one.
00603          */
00604         for (j = upper_used; j > i; j--)
00605           (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
00606                         sizeof(_case_t));
00607     }
00608 
00609     upper[i].key = cases[0];    /* Upper */
00610     upper[i].other1 = cases[1]; /* Lower */
00611     upper[i].other2 = cases[2]; /* Title */
00612 
00613     upper_used++;
00614 }
00615 
00616 static void
00617 add_lower(ac_uint4 code)
00618 {
00619     ac_uint4 i, j;
00620 
00621     /*
00622      * Always map the code to itself.
00623      */
00624     cases[1] = code;
00625 
00626     /*
00627      * If the title case character is empty, then make it the same as the
00628      * upper case.
00629      */
00630     if (cases[2] == 0)
00631       cases[2] = cases[0];
00632 
00633     if (lower_used == lower_size) {
00634         if (lower_size == 0)
00635           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
00636         else
00637           lower = (_case_t *) realloc((char *) lower,
00638                                       sizeof(_case_t) * (lower_size + 8));
00639         lower_size += 8;
00640     }
00641 
00642     /*
00643      * Locate the insertion point.
00644      */
00645     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
00646 
00647     if (i < lower_used) {
00648         /*
00649          * Shift the array up by one.
00650          */
00651         for (j = lower_used; j > i; j--)
00652           (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
00653                         sizeof(_case_t));
00654     }
00655 
00656     lower[i].key = cases[1];    /* Lower */
00657     lower[i].other1 = cases[0]; /* Upper */
00658     lower[i].other2 = cases[2]; /* Title */
00659 
00660     lower_used++;
00661 }
00662 
00663 static void
00664 ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code)
00665 {
00666     ac_uint4 i, j;
00667 
00668     if (ccl_used == ccl_size) {
00669         if (ccl_size == 0)
00670           ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24);
00671         else
00672           ccl = (ac_uint4 *)
00673               realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24));
00674         ccl_size += 24;
00675     }
00676 
00677     /*
00678      * Optimize adding the first item.
00679      */
00680     if (ccl_used == 0) {
00681         ccl[0] = ccl[1] = c;
00682         ccl[2] = ccl_code;
00683         ccl_used += 3;
00684         return;
00685     }
00686 
00687     /*
00688      * Handle the special case of extending the range on the end.  This
00689      * requires that the combining class codes are the same.
00690      */
00691     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
00692         ccl[ccl_used - 2] = c;
00693         return;
00694     }
00695 
00696     /*
00697      * Handle the special case of adding another range on the end.
00698      */
00699     if (c > ccl[ccl_used - 2] + 1 ||
00700         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
00701         ccl[ccl_used++] = c;
00702         ccl[ccl_used++] = c;
00703         ccl[ccl_used++] = ccl_code;
00704         return;
00705     }
00706 
00707     /*
00708      * Locate either the insertion point or range for the code.
00709      */
00710     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
00711 
00712     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
00713         /*
00714          * Extend an existing range.
00715          */
00716         ccl[i + 1] = c;
00717         return;
00718     } else if (c < ccl[i]) {
00719         /*
00720          * Start a new range before the current location.
00721          */
00722         for (j = ccl_used; j > i; j -= 3) {
00723             ccl[j] = ccl[j - 3];
00724             ccl[j - 1] = ccl[j - 4];
00725             ccl[j - 2] = ccl[j - 5];
00726         }
00727         ccl[i] = ccl[i + 1] = c;
00728         ccl[i + 2] = ccl_code;
00729     }
00730 }
00731 
00732 /*
00733  * Adds a number if it does not already exist and returns an index value
00734  * multiplied by 2.
00735  */
00736 static ac_uint4
00737 make_number(short num, short denom)
00738 {
00739     ac_uint4 n;
00740 
00741     /*
00742      * Determine if the number already exists.
00743      */
00744     for (n = 0; n < nums_used; n++) {
00745         if (nums[n].numerator == num && nums[n].denominator == denom)
00746           return n << 1;
00747     }
00748 
00749     if (nums_used == nums_size) {
00750         if (nums_size == 0)
00751           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
00752         else
00753           nums = (_num_t *) realloc((char *) nums,
00754                                     sizeof(_num_t) * (nums_size + 8));
00755         nums_size += 8;
00756     }
00757 
00758     n = nums_used++;
00759     nums[n].numerator = num;
00760     nums[n].denominator = denom;
00761 
00762     return n << 1;
00763 }
00764 
00765 static void
00766 add_number(ac_uint4 code, short num, short denom)
00767 {
00768     ac_uint4 i, j;
00769 
00770     /*
00771      * Insert the code in order.
00772      */
00773     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
00774 
00775     /*
00776      * Handle the case of the codes matching and simply replace the number
00777      * that was there before.
00778      */
00779     if (i < ncodes_used && code == ncodes[i].code) {
00780         ncodes[i].idx = make_number(num, denom);
00781         return;
00782     }
00783 
00784     /*
00785      * Resize the array if necessary.
00786      */
00787     if (ncodes_used == ncodes_size) {
00788         if (ncodes_size == 0)
00789           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
00790         else
00791           ncodes = (_codeidx_t *)
00792               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
00793 
00794         ncodes_size += 8;
00795     }
00796 
00797     /*
00798      * Shift things around to insert the code if necessary.
00799      */
00800     if (i < ncodes_used) {
00801         for (j = ncodes_used; j > i; j--) {
00802             ncodes[j].code = ncodes[j - 1].code;
00803             ncodes[j].idx = ncodes[j - 1].idx;
00804         }
00805     }
00806     ncodes[i].code = code;
00807     ncodes[i].idx = make_number(num, denom);
00808 
00809     ncodes_used++;
00810 }
00811 
00812 /*
00813  * This routine assumes that the line is a valid Unicode Character Database
00814  * entry.
00815  */
00816 static void
00817 read_cdata(FILE *in)
00818 {
00819     ac_uint4 i, lineno, skip, code, ccl_code;
00820     short wnum, neg, number[2], compat;
00821     char line[512], *s, *e;
00822 
00823     lineno = skip = 0;
00824     while (fgets(line, sizeof(line), in)) {
00825        if( (s=strchr(line, '\n')) ) *s = '\0';
00826         lineno++;
00827 
00828         /*
00829          * Skip blank lines and lines that start with a '#'.
00830          */
00831         if (line[0] == 0 || line[0] == '#')
00832           continue;
00833 
00834         /*
00835          * If lines need to be skipped, do it here.
00836          */
00837         if (skip) {
00838             skip--;
00839             continue;
00840         }
00841 
00842         /*
00843          * Collect the code.  The code can be up to 6 hex digits in length to
00844          * allow surrogates to be specified.
00845          */
00846         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
00847             code <<= 4;
00848             if (*s >= '0' && *s <= '9')
00849               code += *s - '0';
00850             else if (*s >= 'A' && *s <= 'F')
00851               code += (*s - 'A') + 10;
00852             else if (*s >= 'a' && *s <= 'f')
00853               code += (*s - 'a') + 10;
00854         }
00855 
00856         /*
00857          * Handle the following special cases:
00858          * 1. 4E00-9FA5 CJK Ideographs.
00859          * 2. AC00-D7A3 Hangul Syllables.
00860          * 3. D800-DFFF Surrogates.
00861          * 4. E000-F8FF Private Use Area.
00862          * 5. F900-FA2D Han compatibility.
00863         * ...Plus additional ranges in newer Unicode versions...
00864          */
00865         switch (code) {
00866          case 0x3400:
00867            /* CJK Ideograph Extension A */
00868             add_range(0x3400, 0x4db5, "Lo", "L");
00869 
00870             add_range(0x3400, 0x4db5, "Cp", 0);
00871 
00872            skip = 1;
00873            break;
00874           case 0x4e00:
00875             /*
00876              * The Han ideographs.
00877              */
00878             add_range(0x4e00, 0x9fff, "Lo", "L");
00879 
00880             /*
00881              * Add the characters to the defined category.
00882              */
00883             add_range(0x4e00, 0x9fa5, "Cp", 0);
00884 
00885             skip = 1;
00886             break;
00887           case 0xac00:
00888             /*
00889              * The Hangul syllables.
00890              */
00891             add_range(0xac00, 0xd7a3, "Lo", "L");
00892 
00893             /*
00894              * Add the characters to the defined category.
00895              */
00896             add_range(0xac00, 0xd7a3, "Cp", 0);
00897 
00898             skip = 1;
00899             break;
00900           case 0xd800:
00901             /*
00902              * Make a range of all surrogates and assume some default
00903              * properties.
00904              */
00905             add_range(0x010000, 0x10ffff, "Cs", "L");
00906             skip = 5;
00907             break;
00908           case 0xe000:
00909             /*
00910              * The Private Use area.  Add with a default set of properties.
00911              */
00912             add_range(0xe000, 0xf8ff, "Co", "L");
00913             skip = 1;
00914             break;
00915           case 0xf900:
00916             /*
00917              * The CJK compatibility area.
00918              */
00919             add_range(0xf900, 0xfaff, "Lo", "L");
00920 
00921             /*
00922              * Add the characters to the defined category.
00923              */
00924             add_range(0xf900, 0xfaff, "Cp", 0);
00925 
00926             skip = 1;
00927            break;
00928          case 0x20000:
00929            /* CJK Ideograph Extension B */
00930             add_range(0x20000, 0x2a6d6, "Lo", "L");
00931 
00932             add_range(0x20000, 0x2a6d6, "Cp", 0);
00933 
00934            skip = 1;
00935            break;
00936          case 0xf0000:
00937            /* Plane 15 private use */
00938            add_range(0xf0000, 0xffffd, "Co", "L");
00939            skip = 1;
00940            break;
00941 
00942          case 0x100000:
00943            /* Plane 16 private use */
00944            add_range(0x100000, 0x10fffd, "Co", "L");
00945            skip = 1;
00946            break;
00947         }
00948 
00949         if (skip)
00950           continue;
00951 
00952         /*
00953          * Add the code to the defined category.
00954          */
00955         ordered_range_insert(code, "Cp", 2);
00956 
00957         /*
00958          * Locate the first character property field.
00959          */
00960         for (i = 0; *s != 0 && i < 2; s++) {
00961             if (*s == ';')
00962               i++;
00963         }
00964         for (e = s; *e && *e != ';'; e++) ;
00965     
00966         ordered_range_insert(code, s, e - s);
00967 
00968         /*
00969          * Locate the combining class code.
00970          */
00971         for (s = e; *s != 0 && i < 3; s++) {
00972             if (*s == ';')
00973               i++;
00974         }
00975 
00976         /*
00977          * Convert the combining class code from decimal.
00978          */
00979         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
00980           ccl_code = (ccl_code * 10) + (*e - '0');
00981 
00982         /*
00983          * Add the code if it not 0.
00984          */
00985         if (ccl_code != 0)
00986           ordered_ccl_insert(code, ccl_code);
00987 
00988         /*
00989          * Locate the second character property field.
00990          */
00991         for (s = e; *s != 0 && i < 4; s++) {
00992             if (*s == ';')
00993               i++;
00994         }
00995         for (e = s; *e && *e != ';'; e++) ;
00996 
00997         ordered_range_insert(code, s, e - s);
00998 
00999         /*
01000          * Check for a decomposition.
01001          */
01002         s = ++e;
01003         if (*s != ';') {
01004            compat = *s == '<';
01005            if (compat) {
01006               /*
01007                * Skip compatibility formatting tag.
01008                */
01009               while (*s++ != '>');
01010            }
01011             /*
01012              * Collect the codes of the decomposition.
01013              */
01014             for (dectmp_size = 0; *s != ';'; ) {
01015                 /*
01016                  * Skip all leading non-hex digits.
01017                  */
01018                 while (!ishdigit(*s))
01019                 s++;
01020 
01021                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
01022                     dectmp[dectmp_size] <<= 4;
01023                     if (*s >= '0' && *s <= '9')
01024                       dectmp[dectmp_size] += *s - '0';
01025                     else if (*s >= 'A' && *s <= 'F')
01026                       dectmp[dectmp_size] += (*s - 'A') + 10;
01027                     else if (*s >= 'a' && *s <= 'f')
01028                       dectmp[dectmp_size] += (*s - 'a') + 10;
01029                 }
01030                 dectmp_size++;
01031             }
01032 
01033             /*
01034              * If there are any codes in the temporary decomposition array,
01035              * then add the character with its decomposition.
01036              */
01037             if (dectmp_size > 0) {
01038               if (!compat) {
01039                   add_decomp(code, 0);
01040               }
01041               add_decomp(code, 1);
01042            }
01043         }
01044 
01045         /*
01046          * Skip to the number field.
01047          */
01048         for (i = 0; i < 3 && *s; s++) {
01049             if (*s == ';')
01050               i++;
01051         }
01052 
01053         /*
01054          * Scan the number in.
01055          */
01056         number[0] = number[1] = 0;
01057         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
01058             if (*e == '-') {
01059                 neg = 1;
01060                 continue;
01061             }
01062 
01063             if (*e == '/') {
01064                 /*
01065                  * Move to the denominator of the fraction.
01066                  */
01067                 if (neg)
01068                   number[wnum] *= -1;
01069                 neg = 0;
01070                 e++;
01071                 wnum++;
01072             }
01073             number[wnum] = (number[wnum] * 10) + (*e - '0');
01074         }
01075 
01076         if (e > s) {
01077             /*
01078              * Adjust the denominator in case of integers and add the number.
01079              */
01080             if (wnum == 0)
01081               number[1] = 1;
01082 
01083             add_number(code, number[0], number[1]);
01084         }
01085 
01086         /*
01087          * Skip to the start of the possible case mappings.
01088          */
01089         for (s = e, i = 0; i < 4 && *s; s++) {
01090             if (*s == ';')
01091               i++;
01092         }
01093 
01094         /*
01095          * Collect the case mappings.
01096          */
01097         cases[0] = cases[1] = cases[2] = 0;
01098         for (i = 0; i < 3; i++) {
01099             while (ishdigit(*s)) {
01100                 cases[i] <<= 4;
01101                 if (*s >= '0' && *s <= '9')
01102                   cases[i] += *s - '0';
01103                 else if (*s >= 'A' && *s <= 'F')
01104                   cases[i] += (*s - 'A') + 10;
01105                 else if (*s >= 'a' && *s <= 'f')
01106                   cases[i] += (*s - 'a') + 10;
01107                 s++;
01108             }
01109             if (*s == ';')
01110               s++;
01111         }
01112         if (cases[0] && cases[1])
01113           /*
01114            * Add the upper and lower mappings for a title case character.
01115            */
01116           add_title(code);
01117         else if (cases[1])
01118           /*
01119            * Add the lower and title case mappings for the upper case
01120            * character.
01121            */
01122           add_upper(code);
01123         else if (cases[0])
01124           /*
01125            * Add the upper and title case mappings for the lower case
01126            * character.
01127            */
01128           add_lower(code);
01129     }
01130 }
01131 
01132 static _decomp_t *
01133 find_decomp(ac_uint4 code, short compat)
01134 {
01135     long l, r, m;
01136     _decomp_t *decs;
01137     
01138     l = 0;
01139     r = (compat ? kdecomps_used : decomps_used) - 1;
01140     decs = compat ? kdecomps : decomps;
01141     while (l <= r) {
01142         m = (l + r) >> 1;
01143         if (code > decs[m].code)
01144           l = m + 1;
01145         else if (code < decs[m].code)
01146           r = m - 1;
01147         else
01148           return &decs[m];
01149     }
01150     return 0;
01151 }
01152 
01153 static void
01154 decomp_it(_decomp_t *d, short compat)
01155 {
01156     ac_uint4 i;
01157     _decomp_t *dp;
01158 
01159     for (i = 0; i < d->used; i++) {
01160         if ((dp = find_decomp(d->decomp[i], compat)) != 0)
01161           decomp_it(dp, compat);
01162         else
01163           dectmp[dectmp_size++] = d->decomp[i];
01164     }
01165 }
01166 
01167 /*
01168  * Expand all decompositions by recursively decomposing each character
01169  * in the decomposition.
01170  */
01171 static void
01172 expand_decomp(void)
01173 {
01174     ac_uint4 i;
01175 
01176     for (i = 0; i < decomps_used; i++) {
01177         dectmp_size = 0;
01178         decomp_it(&decomps[i], 0);
01179         if (dectmp_size > 0)
01180           add_decomp(decomps[i].code, 0);
01181     }
01182 
01183     for (i = 0; i < kdecomps_used; i++) {
01184         dectmp_size = 0;
01185         decomp_it(&kdecomps[i], 1);
01186         if (dectmp_size > 0)
01187           add_decomp(kdecomps[i].code, 1);
01188     }
01189 }
01190 
01191 static int
01192 cmpcomps(const void *v_comp1, const void *v_comp2)
01193 {
01194        const _comp_t *comp1 = v_comp1, *comp2 = v_comp2;
01195     long diff = comp1->code1 - comp2->code1;
01196 
01197     if (!diff)
01198        diff = comp1->code2 - comp2->code2;
01199     return (int) diff;
01200 }
01201 
01202 /*
01203  * Load composition exclusion data
01204  */
01205 static void
01206 read_compexdata(FILE *in)
01207 {
01208     ac_uint2 i;
01209     ac_uint4 code;
01210     char line[512], *s;
01211 
01212     (void) memset((char *) compexs, 0, sizeof(compexs));
01213 
01214     while (fgets(line, sizeof(line), in)) {
01215        if( (s=strchr(line, '\n')) ) *s = '\0';
01216         /*
01217          * Skip blank lines and lines that start with a '#'.
01218          */
01219         if (line[0] == 0 || line[0] == '#')
01220            continue;
01221 
01222        /*
01223          * Collect the code.  Assume max 6 digits
01224          */
01225 
01226        for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) {
01227            if (isspace((unsigned char)*s)) break;
01228             code <<= 4;
01229             if (*s >= '0' && *s <= '9')
01230               code += *s - '0';
01231             else if (*s >= 'A' && *s <= 'F')
01232               code += (*s - 'A') + 10;
01233             else if (*s >= 'a' && *s <= 'f')
01234               code += (*s - 'a') + 10;
01235         }
01236         COMPEX_SET(code);
01237     }
01238 }
01239 
01240 /*
01241  * Creates array of compositions from decomposition array
01242  */
01243 static void
01244 create_comps(void)
01245 {
01246     ac_uint4 i, cu;
01247 
01248     comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t));
01249 
01250     for (i = cu = 0; i < decomps_used; i++) {
01251        if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code))
01252            continue;
01253        comps[cu].comp = decomps[i].code;
01254        comps[cu].count = 2;
01255        comps[cu].code1 = decomps[i].decomp[0];
01256        comps[cu].code2 = decomps[i].decomp[1];
01257        cu++;
01258     }
01259     comps_used = cu;
01260     qsort(comps, comps_used, sizeof(_comp_t), cmpcomps);
01261 }
01262 
01263 #if HARDCODE_DATA
01264 static void
01265 write_case(FILE *out, _case_t *tab, int num, int first)
01266 {
01267     int i;
01268 
01269     for (i=0; i<num; i++) {
01270        if (first) first = 0;
01271        else fprintf(out, ",");
01272        fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx",
01273               (unsigned long) tab[i].key, (unsigned long) tab[i].other1,
01274               (unsigned long) tab[i].other2);
01275     }
01276 }
01277 
01278 #define PREF "static const "
01279 
01280 #endif
01281 
01282 static void
01283 write_cdata(char *opath)
01284 {
01285     FILE *out;
01286        ac_uint4 bytes;
01287     ac_uint4 i, idx, nprops;
01288 #if !(HARDCODE_DATA)
01289     ac_uint2 casecnt[2];
01290 #endif
01291     char path[BUFSIZ];
01292 #if HARDCODE_DATA
01293     int j, k;
01294 
01295     /*****************************************************************
01296      *
01297      * Generate the ctype data.
01298      *
01299      *****************************************************************/
01300 
01301     /*
01302      * Open the output file.
01303      */
01304     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath);
01305     if ((out = fopen(path, "w")) == 0)
01306       return;
01307 #else
01308     /*
01309      * Open the ctype.dat file.
01310      */
01311     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath);
01312     if ((out = fopen(path, "wb")) == 0)
01313       return;
01314 #endif
01315 
01316     /*
01317      * Collect the offsets for the properties.  The offsets array is
01318      * on a 4-byte boundary to keep things efficient for architectures
01319      * that need such a thing.
01320      */
01321     for (i = idx = 0; i < NUMPROPS; i++) {
01322         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
01323         idx += proptbl[i].used;
01324     }
01325 
01326     /*
01327      * Add the sentinel index which is used by the binary search as the upper
01328      * bound for a search.
01329      */
01330     propcnt[i] = idx;
01331 
01332     /*
01333      * Record the actual number of property lists.  This may be different than
01334      * the number of offsets actually written because of aligning on a 4-byte
01335      * boundary.
01336      */
01337     hdr[1] = NUMPROPS;
01338 
01339     /*
01340      * Calculate the byte count needed and pad the property counts array to a
01341      * 4-byte boundary.
01342      */
01343     if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3)
01344       bytes += 4 - (bytes & 3);
01345     nprops = bytes / sizeof(ac_uint2);
01346     bytes += sizeof(ac_uint4) * idx;
01347 
01348 #if HARDCODE_DATA
01349     fprintf(out, PREF "ac_uint4 _ucprop_size = %d;\n\n", NUMPROPS);
01350 
01351     fprintf(out, PREF "ac_uint2 _ucprop_offsets[] = {");
01352 
01353     for (i = 0; i<nprops; i++) {
01354        if (i) fprintf(out, ",");
01355        if (!(i&7)) fprintf(out, "\n\t");
01356        else fprintf(out, " ");
01357        fprintf(out, "0x%04x", propcnt[i]);
01358     }
01359     fprintf(out, "\n};\n\n");
01360 
01361     fprintf(out, PREF "ac_uint4 _ucprop_ranges[] = {");
01362 
01363     k = 0;
01364     for (i = 0; i < NUMPROPS; i++) {
01365        if (proptbl[i].used > 0) {
01366          for (j=0; j<proptbl[i].used; j++) {
01367            if (k) fprintf(out, ",");
01368            if (!(k&3)) fprintf(out,"\n\t");
01369            else fprintf(out, " ");
01370            k++;
01371            fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]);
01372          }
01373        }
01374     }
01375     fprintf(out, "\n};\n\n");
01376 #else
01377     /*
01378      * Write the header.
01379      */
01380     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
01381 
01382     /*
01383      * Write the byte count.
01384      */
01385     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
01386 
01387     /*
01388      * Write the property list counts.
01389      */
01390     fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out);
01391 
01392     /*
01393      * Write the property lists.
01394      */
01395     for (i = 0; i < NUMPROPS; i++) {
01396         if (proptbl[i].used > 0)
01397           fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4),
01398                  proptbl[i].used, out);
01399     }
01400 
01401     fclose(out);
01402 #endif
01403 
01404     /*****************************************************************
01405      *
01406      * Generate the case mapping data.
01407      *
01408      *****************************************************************/
01409 
01410 #if HARDCODE_DATA
01411     fprintf(out, PREF "ac_uint4 _uccase_size = %ld;\n\n",
01412         (long) (upper_used + lower_used + title_used));
01413 
01414     fprintf(out, PREF "ac_uint2 _uccase_len[2] = {%ld, %ld};\n\n",
01415         (long) upper_used, (long) lower_used);
01416     fprintf(out, PREF "ac_uint4 _uccase_map[] = {");
01417 
01418     if (upper_used > 0)
01419       /*
01420        * Write the upper case table.
01421        */
01422       write_case(out, upper, upper_used, 1);
01423 
01424     if (lower_used > 0)
01425       /*
01426        * Write the lower case table.
01427        */
01428       write_case(out, lower, lower_used, !upper_used);
01429 
01430     if (title_used > 0)
01431       /*
01432        * Write the title case table.
01433        */
01434       write_case(out, title, title_used, !(upper_used||lower_used));
01435 
01436     if (!(upper_used || lower_used || title_used))
01437        fprintf(out, "\t0");
01438 
01439     fprintf(out, "\n};\n\n");
01440 #else
01441     /*
01442      * Open the case.dat file.
01443      */
01444     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath);
01445     if ((out = fopen(path, "wb")) == 0)
01446       return;
01447 
01448     /*
01449      * Write the case mapping tables.
01450      */
01451     hdr[1] = upper_used + lower_used + title_used;
01452     casecnt[0] = upper_used;
01453     casecnt[1] = lower_used;
01454 
01455     /*
01456      * Write the header.
01457      */
01458     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
01459 
01460     /*
01461      * Write the upper and lower case table sizes.
01462      */
01463     fwrite((char *) casecnt, sizeof(ac_uint2), 2, out);
01464 
01465     if (upper_used > 0)
01466       /*
01467        * Write the upper case table.
01468        */
01469       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
01470 
01471     if (lower_used > 0)
01472       /*
01473        * Write the lower case table.
01474        */
01475       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
01476 
01477     if (title_used > 0)
01478       /*
01479        * Write the title case table.
01480        */
01481       fwrite((char *) title, sizeof(_case_t), title_used, out);
01482 
01483     fclose(out);
01484 #endif
01485 
01486     /*****************************************************************
01487      *
01488      * Generate the composition data.
01489      *
01490      *****************************************************************/
01491     
01492     /*
01493      * Create compositions from decomposition data
01494      */
01495     create_comps();
01496     
01497 #if HARDCODE_DATA
01498     fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n",
01499         comps_used * 4L);
01500 
01501     fprintf(out, PREF "ac_uint4 _uccomp_data[] = {");
01502 
01503      /*
01504       * Now, if comps exist, write them out.
01505       */
01506     if (comps_used > 0) {
01507        for (i=0; i<comps_used; i++) {
01508            if (i) fprintf(out, ",");
01509            fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx",
01510                (unsigned long) comps[i].comp, (unsigned long) comps[i].count,
01511                (unsigned long) comps[i].code1, (unsigned long) comps[i].code2);
01512        }
01513     } else {
01514        fprintf(out, "\t0");
01515     }
01516     fprintf(out, "\n};\n\n");
01517 #else
01518     /*
01519      * Open the comp.dat file.
01520      */
01521     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath);
01522     if ((out = fopen(path, "wb")) == 0)
01523        return;
01524     
01525     /*
01526      * Write the header.
01527      */
01528     hdr[1] = (ac_uint2) comps_used * 4;
01529     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
01530     
01531     /*
01532      * Write out the byte count to maintain header size.
01533      */
01534     bytes = comps_used * sizeof(_comp_t);
01535     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
01536     
01537     /*
01538      * Now, if comps exist, write them out.
01539      */
01540     if (comps_used > 0)
01541         fwrite((char *) comps, sizeof(_comp_t), comps_used, out);
01542     
01543     fclose(out);
01544 #endif
01545     
01546     /*****************************************************************
01547      *
01548      * Generate the decomposition data.
01549      *
01550      *****************************************************************/
01551 
01552     /*
01553      * Fully expand all decompositions before generating the output file.
01554      */
01555     expand_decomp();
01556 
01557 #if HARDCODE_DATA
01558     fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n",
01559         decomps_used * 2L);
01560 
01561     fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {");
01562 
01563     if (decomps_used) {
01564        /*
01565         * Write the list of decomp nodes.
01566         */
01567        for (i = idx = 0; i < decomps_used; i++) {
01568            fprintf(out, "\n\t0x%08lx, 0x%08lx,",
01569                (unsigned long) decomps[i].code, (unsigned long) idx);
01570            idx += decomps[i].used;
01571        }
01572 
01573        /*
01574         * Write the sentinel index as the last decomp node.
01575         */
01576        fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
01577 
01578        fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {");
01579        /*
01580         * Write the decompositions themselves.
01581         */
01582        k = 0;
01583        for (i = 0; i < decomps_used; i++)
01584          for (j=0; j<decomps[i].used; j++) {
01585            if (k) fprintf(out, ",");
01586            if (!(k&3)) fprintf(out,"\n\t");
01587            else fprintf(out, " ");
01588            k++;
01589            fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]);
01590          }
01591        fprintf(out, "\n};\n\n");
01592     }
01593 #else
01594     /*
01595      * Open the decomp.dat file.
01596      */
01597     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath);
01598     if ((out = fopen(path, "wb")) == 0)
01599       return;
01600 
01601     hdr[1] = decomps_used;
01602 
01603     /*
01604      * Write the header.
01605      */
01606     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
01607 
01608     /*
01609      * Write a temporary byte count which will be calculated as the
01610      * decompositions are written out.
01611      */
01612     bytes = 0;
01613     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
01614 
01615     if (decomps_used) {
01616         /*
01617          * Write the list of decomp nodes.
01618          */
01619         for (i = idx = 0; i < decomps_used; i++) {
01620             fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out);
01621             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
01622             idx += decomps[i].used;
01623         }
01624 
01625         /*
01626          * Write the sentinel index as the last decomp node.
01627          */
01628         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
01629 
01630         /*
01631          * Write the decompositions themselves.
01632          */
01633         for (i = 0; i < decomps_used; i++)
01634           fwrite((char *) decomps[i].decomp, sizeof(ac_uint4),
01635                  decomps[i].used, out);
01636 
01637         /*
01638          * Seek back to the beginning and write the byte count.
01639          */
01640         bytes = (sizeof(ac_uint4) * idx) +
01641             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
01642         fseek(out, sizeof(ac_uint2) << 1, 0L);
01643         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
01644 
01645         fclose(out);
01646     }
01647 #endif
01648 
01649 #ifdef HARDCODE_DATA
01650     fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n",
01651         kdecomps_used * 2L);
01652 
01653     fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {");
01654 
01655     if (kdecomps_used) {
01656        /*
01657         * Write the list of kdecomp nodes.
01658         */
01659        for (i = idx = 0; i < kdecomps_used; i++) {
01660            fprintf(out, "\n\t0x%08lx, 0x%08lx,",
01661                (unsigned long) kdecomps[i].code, (unsigned long) idx);
01662            idx += kdecomps[i].used;
01663        }
01664 
01665        /*
01666         * Write the sentinel index as the last decomp node.
01667         */
01668        fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
01669 
01670        fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {");
01671 
01672        /*
01673         * Write the decompositions themselves.
01674         */
01675        k = 0;
01676        for (i = 0; i < kdecomps_used; i++)
01677          for (j=0; j<kdecomps[i].used; j++) {
01678            if (k) fprintf(out, ",");
01679            if (!(k&3)) fprintf(out,"\n\t");
01680            else fprintf(out, " ");
01681            k++;
01682            fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]);
01683          }
01684        fprintf(out, "\n};\n\n");
01685     }
01686 #else
01687     /*
01688      * Open the kdecomp.dat file.
01689      */
01690     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath);
01691     if ((out = fopen(path, "wb")) == 0)
01692       return;
01693 
01694     hdr[1] = kdecomps_used;
01695 
01696     /*
01697      * Write the header.
01698      */
01699     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
01700 
01701     /*
01702      * Write a temporary byte count which will be calculated as the
01703      * decompositions are written out.
01704      */
01705     bytes = 0;
01706     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
01707 
01708     if (kdecomps_used) {
01709         /*
01710          * Write the list of kdecomp nodes.
01711          */
01712         for (i = idx = 0; i < kdecomps_used; i++) {
01713             fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out);
01714             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
01715             idx += kdecomps[i].used;
01716         }
01717 
01718         /*
01719          * Write the sentinel index as the last decomp node.
01720          */
01721         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
01722 
01723         /*
01724          * Write the decompositions themselves.
01725          */
01726         for (i = 0; i < kdecomps_used; i++)
01727           fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4),
01728                  kdecomps[i].used, out);
01729 
01730         /*
01731          * Seek back to the beginning and write the byte count.
01732          */
01733         bytes = (sizeof(ac_uint4) * idx) +
01734             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
01735         fseek(out, sizeof(ac_uint2) << 1, 0L);
01736         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
01737 
01738         fclose(out);
01739     }
01740 #endif
01741 
01742     /*****************************************************************
01743      *
01744      * Generate the combining class data.
01745      *
01746      *****************************************************************/
01747 #ifdef HARDCODE_DATA
01748     fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used);
01749 
01750     fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {");
01751 
01752     if (ccl_used > 0) {
01753        /*
01754         * Write the combining class ranges out.
01755         */
01756        for (i = 0; i<ccl_used; i++) {
01757            if (i) fprintf(out, ",");
01758            if (!(i&3)) fprintf(out, "\n\t");
01759            else fprintf(out, " ");
01760            fprintf(out, "0x%08lx", (unsigned long) ccl[i]);
01761        }
01762     } else {
01763        fprintf(out, "\t0");
01764     }
01765     fprintf(out, "\n};\n\n");
01766 #else
01767     /*
01768      * Open the cmbcl.dat file.
01769      */
01770     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath);
01771     if ((out = fopen(path, "wb")) == 0)
01772       return;
01773 
01774     /*
01775      * Set the number of ranges used.  Each range has a combining class which
01776      * means each entry is a 3-tuple.
01777      */
01778     hdr[1] = ccl_used / 3;
01779 
01780     /*
01781      * Write the header.
01782      */
01783     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
01784 
01785     /*
01786      * Write out the byte count to maintain header size.
01787      */
01788     bytes = ccl_used * sizeof(ac_uint4);
01789     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
01790 
01791     if (ccl_used > 0)
01792       /*
01793        * Write the combining class ranges out.
01794        */
01795       fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out);
01796 
01797     fclose(out);
01798 #endif
01799 
01800     /*****************************************************************
01801      *
01802      * Generate the number data.
01803      *
01804      *****************************************************************/
01805 
01806 #if HARDCODE_DATA
01807     fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n",
01808         (unsigned long)ncodes_used<<1);
01809 
01810     fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {");
01811 
01812     /*
01813      * Now, if number mappings exist, write them out.
01814      */
01815     if (ncodes_used > 0) {
01816        for (i = 0; i<ncodes_used; i++) {
01817            if (i) fprintf(out, ",");
01818            if (!(i&1)) fprintf(out, "\n\t");
01819            else fprintf(out, " ");
01820            fprintf(out, "0x%08lx, 0x%08lx",
01821                (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx);
01822        }
01823        fprintf(out, "\n};\n\n");
01824 
01825        fprintf(out, PREF "short _ucnum_vals[] = {");
01826        for (i = 0; i<nums_used; i++) {
01827            if (i) fprintf(out, ",");
01828            if (!(i&3)) fprintf(out, "\n\t");
01829            else fprintf(out, " ");
01830            if (nums[i].numerator < 0) {
01831               fprintf(out, "%6d, 0x%04x",
01832                 nums[i].numerator, nums[i].denominator);
01833            } else {
01834               fprintf(out, "0x%04x, 0x%04x",
01835                 nums[i].numerator, nums[i].denominator);
01836            }
01837        }
01838        fprintf(out, "\n};\n\n");
01839     }
01840 #else
01841     /*
01842      * Open the num.dat file.
01843      */
01844     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath);
01845     if ((out = fopen(path, "wb")) == 0)
01846       return;
01847 
01848     /*
01849      * The count part of the header will be the total number of codes that
01850      * have numbers.
01851      */
01852     hdr[1] = (ac_uint2) (ncodes_used << 1);
01853     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
01854 
01855     /*
01856      * Write the header.
01857      */
01858     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
01859 
01860     /*
01861      * Write out the byte count to maintain header size.
01862      */
01863     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
01864 
01865     /*
01866      * Now, if number mappings exist, write them out.
01867      */
01868     if (ncodes_used > 0) {
01869         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
01870         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
01871     }
01872 #endif
01873 
01874     fclose(out);
01875 }
01876 
/*
 * Print the command line synopsis and a description of the -o and -x
 * options to stderr, then terminate the program with exit status 1.
 * `prog' is the program name shown in the synopsis.  Does not return.
 */
static void
usage(char *prog)
{
    /* Synopsis line; only this part needs formatting. */
    fprintf(stderr,
            "Usage: %s [-o output-directory|-x composition-exclusions]", prog);
    fputs(" datafile1 datafile2 ...\n\n", stderr);

    /* Option descriptions are constant text. */
    fputs("-o output-directory\n\t\tWrite the output files to a different",
          stderr);
    fputs(" directory (default: .).\n", stderr);
    fputs("-x composition-exclusion\n\t\tFile of composition codes",
          stderr);
    fputs(" that should be excluded.\n", stderr);

    exit(1);
}
01891 
/*
 * Program entry point.
 *
 * Arguments are processed from left to right:
 *   -o DIR    remember DIR as the output directory (default ".")
 *   -x FILE   load composition exclusions from FILE with read_compexdata()
 *   other     treat the argument as a data file and feed it to read_cdata()
 * An unknown option, or -o / -x given as the last argument without a
 * value, prints the usage message and exits with status 1.
 *
 * A file that cannot be opened is reported on stderr and skipped; the
 * remaining arguments are still processed.  After all arguments have been
 * handled the output is produced with write_cdata().  Returns 0.
 */
int
main(int argc, char *argv[])
{
    FILE *in;
    char *prog, *opath;

    prog = lutil_progname( "ucgendat", argc, argv );

    opath = 0;

    argc--;
    argv++;

    while (argc > 0) {
        if (argv[0][0] == '-') {
            switch (argv[0][1]) {
              case 'o':
                /*
                 * The option needs a value; without this check a trailing
                 * -o would be silently ignored.
                 */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                opath = argv[0];
                break;
              case 'x':
                /*
                 * The option needs a value; without this check a trailing
                 * -x would pass a null file name to fopen().
                 */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                if ((in = fopen(argv[0], "r")) == 0)
                  fprintf(stderr,
                          "%s: unable to open composition exclusion file %s\n",
                          prog, argv[0]);
                else {
                    read_compexdata(in);
                    fclose(in);
                }
                break;
              default:
                usage(prog);
                break;
            }
        } else {
            /*
             * Every stream is closed right after it has been read, so
             * there is never a previously opened file to release here.
             */
            if ((in = fopen(argv[0], "r")) == 0)
              fprintf(stderr, "%s: unable to open ctype file %s\n",
                      prog, argv[0]);
            else {
                read_cdata(in);
                fclose(in);
            }
        }
        argc--;
        argv++;
    }

    if (opath == 0)
      opath = ".";
    write_cdata(opath);

    return 0;
}