Back to index

lightning-sunbird  0.9+nobinonly
ucgendat.c
Go to the documentation of this file.
00001 /*
00002  * Copyright 1996, 1997, 1998 Computing Research Labs,
00003  * New Mexico State University
00004  *
00005  * Permission is hereby granted, free of charge, to any person obtaining a
00006  * copy of this software and associated documentation files (the "Software"),
00007  * to deal in the Software without restriction, including without limitation
00008  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
00009  * and/or sell copies of the Software, and to permit persons to whom the
00010  * Software is furnished to do so, subject to the following conditions:
00011  *
00012  * The above copyright notice and this permission notice shall be included in
00013  * all copies or substantial portions of the Software.
00014  *
00015  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00016  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00017  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
00018  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
00019  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
00020  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
00021  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00022  */
00023 #ifndef lint
00024 #ifdef __GNUC__
00025 static char rcsid[] __attribute__ ((unused)) = "$Id: ucgendat.c,v 1.1 1999/01/08 00:19:21 ftang%netscape.com Exp $";
00026 #else
00027 static char rcsid[] = "$Id: ucgendat.c,v 1.1 1999/01/08 00:19:21 ftang%netscape.com Exp $";
00028 #endif
00029 #endif
00030 
00031 #include <stdio.h>
00032 #include <stdlib.h>
00033 #include <string.h>
00034 #ifndef WIN32
00035 #include <unistd.h>
00036 #endif
00037 
00038 #define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\
00039                       ((cc) >= 'A' && (cc) <= 'F') ||\
00040                       ((cc) >= 'a' && (cc) <= 'f')) /* nonzero for 0-9, A-F, a-f; cc is evaluated several times, so pass an expression without side effects */
00041 
00042 /*
00043  * A header written to the output file with the byte-order-mark and the number
00044  * of property nodes.
00045  */
00046 static unsigned short hdr[2] = {0xfeff, 0}; /* hdr[0] is the 0xfeff mark; hdr[1] starts at 0 */
00047 
00048 #define NUMPROPS 49 /* number of entries in props[] and proptbl[] */
00049 #define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3))) /* NUMPROPS plus 1 to 4 extra slots so the total is a multiple of 4 (52 here) */
00050 
00051 typedef struct {
00052     char *name; /* property abbreviation, e.g. "Mn" or "L" */
00053     int len;    /* number of characters in name (1 or 2 in the table below) */
00054 } _prop_t;
00055 
00056 /*
00057  * List of properties expected to be found in the Unicode Character Database
00058  * including some implementation specific properties.
00059  *
00060  * The implementation specific properties are:
00061  * Cm = Composed (can be decomposed)
00062  * Nb = Non-breaking
00063  * Sy = Symmetric (has left and right forms)
00064  * Hd = Hex digit
00065  * Qm = Quote marks
00066  * Mr = Mirroring
00067  * Ss = Space, other
00068  * Cp = Defined character
00069  */
00070 static _prop_t props[NUMPROPS] = { /* the index of an entry here is also its index in proptbl[] */
00071     {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
00072     {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
00073     {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
00074     {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
00075     {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
00076     {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B",  1},
00077     {"S",  1}, {"WS", 2}, {"ON", 2},
00078     {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
00079     {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2} /* 49 entries in total, matching NUMPROPS */
00080 };
00081 
00082 typedef struct {
00083     unsigned long *ranges; /* flat array of start,end pairs: ranges[2k] is a start, ranges[2k+1] its end */
00084     unsigned short used;   /* number of array elements in use (grows by 2 per range) */
00085     unsigned short size;   /* number of array elements allocated (grows by 8) */
00086 } _ranges_t;
00087 
00088 static _ranges_t proptbl[NUMPROPS]; /* one range list per property, indexed like props[] */
00089 
00090 /*
00091  * Make sure this array is sized to be on a 4-byte boundary at compile time.
00092  */
00093 static unsigned short propcnt[NEEDPROPS];
00094 
00095 /*
00096  * Array used to collect a decomposition before adding it to the decomposition
00097  * table.
00098  */
00099 static unsigned long dectmp[64];
00100 static unsigned long dectmp_size; /* number of codes currently held in dectmp[] */
00101 
00102 typedef struct {
00103     unsigned long code;    /* the character that decomposes */
00104     unsigned short size;   /* number of elements allocated for decomp */
00105     unsigned short used;   /* number of elements of decomp in use */
00106     unsigned long *decomp; /* the codes the character decomposes into */
00107 } _decomp_t;
00108 
00109 /*
00110  * List of decomposition.  Created and expanded in order as the characters are
00111  * encountered.
00112  */
00113 static _decomp_t *decomps;         /* kept sorted by ascending code */
00114 static unsigned long decomps_used; /* entries in use */
00115 static unsigned long decomps_size; /* entries allocated (grows by 8) */
00116 
00117 /*
00118  * Types and lists for handling lists of case mappings.
00119  */
00120 typedef struct {
00121     unsigned long key;    /* the character the entry is filed under */
00122     unsigned long other1; /* upper list: lower case; lower and title lists: upper case */
00123     unsigned long other2; /* upper and lower lists: title case; title list: lower case */
00124 } _case_t;
00125 
00126 static _case_t *upper; /* each list is kept sorted by ascending key */
00127 static _case_t *lower;
00128 static _case_t *title;
00129 static unsigned long upper_used; /* *_used = entries in use, *_size = entries allocated */
00130 static unsigned long upper_size;
00131 static unsigned long lower_used;
00132 static unsigned long lower_size;
00133 static unsigned long title_used;
00134 static unsigned long title_size;
00135 
00136 /*
00137  * Array used to collect case mappings before adding them to a list.
00138  */
00139 static unsigned long cases[3]; /* [0] = upper, [1] = lower, [2] = title */
00140 
00141 /*
00142  * An array to hold ranges for combining classes.
00143  */
00144 static unsigned long *ccl;     /* flat array of triples: start, end, combining class */
00145 static unsigned long ccl_used; /* array elements in use (grows by 3 per range) */
00146 static unsigned long ccl_size; /* array elements allocated (grows by 24) */
00147 
/*
 * Structures for handling numbers.
 */
typedef struct {
    unsigned long code;         /* the character that has a numeric value */
    unsigned long idx;          /* value returned by make_number() for it */
} _codeidx_t;

typedef struct {
    short numerator;
    short denominator;
} _num_t;

/*
 * Array to hold the mapping of codes to numbers, kept sorted by code.
 */
static _codeidx_t *ncodes;
static unsigned long ncodes_used;
static unsigned long ncodes_size;

/*
 * Array for holding the distinct numbers referenced from ncodes.
 */
static _num_t *nums;
static unsigned long nums_used;
static unsigned long nums_size;
00178 
00179 static void
00180 #ifdef __STDC__
00181 add_range(unsigned long start, unsigned long end, char *p1, char *p2)
00182 #else
00183 add_range(start, end, p1, p2)
00184 unsigned long start, end;
00185 char *p1, *p2;
00186 #endif
00187 {
00188     int i, j, k, len;
00189     _ranges_t *rlp;
00190     char *name;
00191 
00192     for (k = 0; k < 2; k++) {
00193         if (k == 0) {
00194             name = p1;
00195             len = 2;
00196         } else {
00197             if (p2 == 0)
00198               break;
00199 
00200             name = p2;
00201             len = 1;
00202         }
00203 
00204         for (i = 0; i < NUMPROPS; i++) {
00205             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
00206               break;
00207         }
00208 
00209         if (i == NUMPROPS)
00210           continue;
00211 
00212         rlp = &proptbl[i];
00213 
00214         /*
00215          * Resize the range list if necessary.
00216          */
00217         if (rlp->used == rlp->size) {
00218             if (rlp->size == 0)
00219               rlp->ranges = (unsigned long *)
00220                   malloc(sizeof(unsigned long) << 3);
00221             else
00222               rlp->ranges = (unsigned long *)
00223                   realloc((char *) rlp->ranges,
00224                           sizeof(unsigned long) * (rlp->size + 8));
00225             rlp->size += 8;
00226         }
00227 
00228         /*
00229          * If this is the first code for this property list, just add it
00230          * and return.
00231          */
00232         if (rlp->used == 0) {
00233             rlp->ranges[0] = start;
00234             rlp->ranges[1] = end;
00235             rlp->used += 2;
00236             continue;
00237         }
00238 
00239         /*
00240          * Optimize the case of adding the range to the end.
00241          */
00242         j = rlp->used - 1;
00243         if (start > rlp->ranges[j]) {
00244             j = rlp->used;
00245             rlp->ranges[j++] = start;
00246             rlp->ranges[j++] = end;
00247             rlp->used = j;
00248             continue;
00249         }
00250 
00251         /*
00252          * Need to locate the insertion point.
00253          */
00254         for (i = 0;
00255              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
00256 
00257         /*
00258          * If the start value lies in the current range, then simply set the
00259          * new end point of the range to the end value passed as a parameter.
00260          */
00261         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
00262             rlp->ranges[i + 1] = end;
00263             return;
00264         }
00265 
00266         /*
00267          * Shift following values up by two.
00268          */
00269         for (j = rlp->used; j > i; j -= 2) {
00270             rlp->ranges[j] = rlp->ranges[j - 2];
00271             rlp->ranges[j + 1] = rlp->ranges[j - 1];
00272         }
00273 
00274         /*
00275          * Add the new range at the insertion point.
00276          */
00277         rlp->ranges[i] = start;
00278         rlp->ranges[i + 1] = end;
00279         rlp->used += 2;
00280     }
00281 }
00282 
00283 static void
00284 #ifdef __STDC__
00285 ordered_range_insert(unsigned long c, char *name, int len)
00286 #else
00287 ordered_range_insert(c, name, len)
00288 unsigned long c;
00289 char *name;
00290 int len;
00291 #endif
00292 {
00293     int i, j;
00294     unsigned long s, e;
00295     _ranges_t *rlp;
00296 
00297     if (len == 0)
00298       return;
00299 
00300     for (i = 0; i < NUMPROPS; i++) {
00301         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
00302           break;
00303     }
00304 
00305     if (i == NUMPROPS)
00306       return;
00307 
00308     /*
00309      * Have a match, so insert the code in order.
00310      */
00311     rlp = &proptbl[i];
00312 
00313     /*
00314      * Resize the range list if necessary.
00315      */
00316     if (rlp->used == rlp->size) {
00317         if (rlp->size == 0)
00318           rlp->ranges = (unsigned long *)
00319               malloc(sizeof(unsigned long) << 3);
00320         else
00321           rlp->ranges = (unsigned long *)
00322               realloc((char *) rlp->ranges,
00323                       sizeof(unsigned long) * (rlp->size + 8));
00324         rlp->size += 8;
00325     }
00326 
00327     /*
00328      * If this is the first code for this property list, just add it
00329      * and return.
00330      */
00331     if (rlp->used == 0) {
00332         rlp->ranges[0] = rlp->ranges[1] = c;
00333         rlp->used += 2;
00334         return;
00335     }
00336 
00337     /*
00338      * Optimize the cases of extending the last range and adding new ranges to
00339      * the end.
00340      */
00341     j = rlp->used - 1;
00342     e = rlp->ranges[j];
00343     s = rlp->ranges[j - 1];
00344 
00345     if (c == e + 1) {
00346         /*
00347          * Extend the last range.
00348          */
00349         rlp->ranges[j] = c;
00350         return;
00351     }
00352 
00353     if (c > e + 1) {
00354         /*
00355          * Start another range on the end.
00356          */
00357         j = rlp->used;
00358         rlp->ranges[j] = rlp->ranges[j + 1] = c;
00359         rlp->used += 2;
00360         return;
00361     }
00362 
00363     if (c >= s)
00364       /*
00365        * The code is a duplicate of a code in the last range, so just return.
00366        */
00367       return;
00368 
00369     /*
00370      * The code should be inserted somewhere before the last range in the
00371      * list.  Locate the insertion point.
00372      */
00373     for (i = 0;
00374          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
00375 
00376     s = rlp->ranges[i];
00377     e = rlp->ranges[i + 1];
00378 
00379     if (c == e + 1)
00380       /*
00381        * Simply extend the current range.
00382        */
00383       rlp->ranges[i + 1] = c;
00384     else if (c < s) {
00385         /*
00386          * Add a new entry before the current location.  Shift all entries
00387          * before the current one up by one to make room.
00388          */
00389         for (j = rlp->used; j > i; j -= 2) {
00390             rlp->ranges[j] = rlp->ranges[j - 2];
00391             rlp->ranges[j + 1] = rlp->ranges[j - 1];
00392         }
00393         rlp->ranges[i] = rlp->ranges[i + 1] = c;
00394 
00395         rlp->used += 2;
00396     }
00397 }
00398 
00399 static void
00400 #ifdef __STDC__
00401 add_decomp(unsigned long code)
00402 #else
00403 add_decomp(code)
00404 unsigned long code;
00405 #endif
00406 {
00407     unsigned long i, j, size;
00408 
00409     /*
00410      * Add the code to the composite property.
00411      */
00412     ordered_range_insert(code, "Cm", 2);
00413 
00414     /*
00415      * Locate the insertion point for the code.
00416      */
00417     for (i = 0; i < decomps_used && code > decomps[i].code; i++) ;
00418 
00419     /*
00420      * Allocate space for a new decomposition.
00421      */
00422     if (decomps_used == decomps_size) {
00423         if (decomps_size == 0)
00424           decomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
00425         else
00426           decomps = (_decomp_t *)
00427               realloc((char *) decomps,
00428                       sizeof(_decomp_t) * (decomps_size + 8));
00429         (void) memset((char *) (decomps + decomps_size), 0,
00430                       sizeof(_decomp_t) << 3);
00431         decomps_size += 8;
00432     }
00433 
00434     if (i < decomps_used && code != decomps[i].code) {
00435         /*
00436          * Shift the decomps up by one if the codes don't match.
00437          */
00438         for (j = decomps_used; j > i; j--)
00439           (void) memcpy((char *) &decomps[j], (char *) &decomps[j - 1],
00440                         sizeof(_decomp_t));
00441     }
00442 
00443     /*
00444      * Insert or replace a decomposition.
00445      */
00446     size = dectmp_size + (4 - (dectmp_size & 3));
00447     if (decomps[i].size < size) {
00448         if (decomps[i].size == 0)
00449           decomps[i].decomp = (unsigned long *)
00450               malloc(sizeof(unsigned long) * size);
00451         else
00452           decomps[i].decomp = (unsigned long *)
00453               realloc((char *) decomps[i].decomp,
00454                       sizeof(unsigned long) * size);
00455         decomps[i].size = size;
00456     }
00457 
00458     if (decomps[i].code != code)
00459       decomps_used++;
00460 
00461     decomps[i].code = code;
00462     decomps[i].used = dectmp_size;
00463     (void) memcpy((char *) decomps[i].decomp, (char *) dectmp,
00464                   sizeof(unsigned long) * dectmp_size);
00465 
00466 }
00467 
00468 static void
00469 #ifdef __STDC__
00470 add_title(unsigned long code)
00471 #else
00472 add_title(code)
00473 unsigned long code;
00474 #endif
00475 {
00476     unsigned long i, j;
00477 
00478     /*
00479      * Always map the code to itself.
00480      */
00481     cases[2] = code;
00482 
00483     if (title_used == title_size) {
00484         if (title_size == 0)
00485           title = (_case_t *) malloc(sizeof(_case_t) << 3);
00486         else
00487           title = (_case_t *) realloc((char *) title,
00488                                       sizeof(_case_t) * (title_size + 8));
00489         title_size += 8;
00490     }
00491 
00492     /*
00493      * Locate the insertion point.
00494      */
00495     for (i = 0; i < title_used && code > title[i].key; i++) ;
00496 
00497     if (i < title_used) {
00498         /*
00499          * Shift the array up by one.
00500          */
00501         for (j = title_used; j > i; j--)
00502           (void) memcpy((char *) &title[j], (char *) &title[j - 1],
00503                         sizeof(_case_t));
00504     }
00505 
00506     title[i].key = cases[2];    /* Title */
00507     title[i].other1 = cases[0]; /* Upper */
00508     title[i].other2 = cases[1]; /* Lower */
00509 
00510     title_used++;
00511 }
00512 
00513 static void
00514 #ifdef __STDC__
00515 add_upper(unsigned long code)
00516 #else
00517 add_upper(code)
00518 unsigned long code;
00519 #endif
00520 {
00521     unsigned long i, j;
00522 
00523     /*
00524      * Always map the code to itself.
00525      */
00526     cases[0] = code;
00527 
00528     /*
00529      * If the title case character is not present, then make it the same as
00530      * the upper case.
00531      */
00532     if (cases[2] == 0)
00533       cases[2] = code;
00534 
00535     if (upper_used == upper_size) {
00536         if (upper_size == 0)
00537           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
00538         else
00539           upper = (_case_t *) realloc((char *) upper,
00540                                       sizeof(_case_t) * (upper_size + 8));
00541         upper_size += 8;
00542     }
00543 
00544     /*
00545      * Locate the insertion point.
00546      */
00547     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
00548 
00549     if (i < upper_used) {
00550         /*
00551          * Shift the array up by one.
00552          */
00553         for (j = upper_used; j > i; j--)
00554           (void) memcpy((char *) &upper[j], (char *) &upper[j - 1],
00555                         sizeof(_case_t));
00556     }
00557 
00558     upper[i].key = cases[0];    /* Upper */
00559     upper[i].other1 = cases[1]; /* Lower */
00560     upper[i].other2 = cases[2]; /* Title */
00561 
00562     upper_used++;
00563 }
00564 
00565 static void
00566 #ifdef __STDC__
00567 add_lower(unsigned long code)
00568 #else
00569 add_lower(code)
00570 unsigned long code;
00571 #endif
00572 {
00573     unsigned long i, j;
00574 
00575     /*
00576      * Always map the code to itself.
00577      */
00578     cases[1] = code;
00579 
00580     /*
00581      * If the title case character is empty, then make it the same as the
00582      * upper case.
00583      */
00584     if (cases[2] == 0)
00585       cases[2] = cases[0];
00586 
00587     if (lower_used == lower_size) {
00588         if (lower_size == 0)
00589           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
00590         else
00591           lower = (_case_t *) realloc((char *) lower,
00592                                       sizeof(_case_t) * (lower_size + 8));
00593         lower_size += 8;
00594     }
00595 
00596     /*
00597      * Locate the insertion point.
00598      */
00599     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
00600 
00601     if (i < lower_used) {
00602         /*
00603          * Shift the array up by one.
00604          */
00605         for (j = lower_used; j > i; j--)
00606           (void) memcpy((char *) &lower[j], (char *) &lower[j - 1],
00607                         sizeof(_case_t));
00608     }
00609 
00610     lower[i].key = cases[1];    /* Lower */
00611     lower[i].other1 = cases[0]; /* Upper */
00612     lower[i].other2 = cases[2]; /* Title */
00613 
00614     lower_used++;
00615 }
00616 
00617 static void
00618 #ifdef __STDC__
00619 ordered_ccl_insert(unsigned long c, unsigned long ccl_code)
00620 #else
00621 ordered_ccl_insert(c, ccl_code)
00622 unsigned long c, ccl_code;
00623 #endif
00624 {
00625     unsigned long i, j;
00626 
00627     if (ccl_used == ccl_size) {
00628         if (ccl_size == 0)
00629           ccl = (unsigned long *) malloc(sizeof(unsigned long) * 24);
00630         else
00631           ccl = (unsigned long *)
00632               realloc((char *) ccl, sizeof(unsigned long) * (ccl_size + 24));
00633         ccl_size += 24;
00634     }
00635 
00636     /*
00637      * Optimize adding the first item.
00638      */
00639     if (ccl_used == 0) {
00640         ccl[0] = ccl[1] = c;
00641         ccl[2] = ccl_code;
00642         ccl_used += 3;
00643         return;
00644     }
00645 
00646     /*
00647      * Handle the special case of extending the range on the end.  This
00648      * requires that the combining class codes are the same.
00649      */
00650     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
00651         ccl[ccl_used - 2] = c;
00652         return;
00653     }
00654 
00655     /*
00656      * Handle the special case of adding another range on the end.
00657      */
00658     if (c > ccl[ccl_used - 2] + 1 ||
00659         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
00660         ccl[ccl_used++] = c;
00661         ccl[ccl_used++] = c;
00662         ccl[ccl_used++] = ccl_code;
00663         return;
00664     }
00665 
00666     /*
00667      * Locate either the insertion point or range for the code.
00668      */
00669     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
00670 
00671     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
00672         /*
00673          * Extend an existing range.
00674          */
00675         ccl[i + 1] = c;
00676         return;
00677     } else if (c < ccl[i]) {
00678         /*
00679          * Start a new range before the current location.
00680          */
00681         for (j = ccl_used; j > i; j -= 3) {
00682             ccl[j] = ccl[j - 3];
00683             ccl[j - 1] = ccl[j - 4];
00684             ccl[j - 2] = ccl[j - 5];
00685         }
00686         ccl[i] = ccl[i + 1] = c;
00687         ccl[i + 2] = ccl_code;
00688     }
00689 }
00690 
00691 /*
00692  * Adds a number if it does not already exist and returns an index value
00693  * multiplied by 2.
00694  */
00695 static unsigned long
00696 #ifdef __STDC__
00697 make_number(short num, short denom)
00698 #else
00699 make_number(num, denom)
00700 short num, denom;
00701 #endif
00702 {
00703     unsigned long n;
00704 
00705     /*
00706      * Determine if the number already exists.
00707      */
00708     for (n = 0; n < nums_used; n++) {
00709         if (nums[n].numerator == num && nums[n].denominator == denom)
00710           return n << 1;
00711     }
00712 
00713     if (nums_used == nums_size) {
00714         if (nums_size == 0)
00715           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
00716         else
00717           nums = (_num_t *) realloc((char *) nums,
00718                                     sizeof(_num_t) * (nums_size + 8));
00719         nums_size += 8;
00720     }
00721 
00722     n = nums_used++;
00723     nums[n].numerator = num;
00724     nums[n].denominator = denom;
00725 
00726     return n << 1;
00727 }
00728 
00729 static void
00730 #ifdef __STDC__
00731 add_number(unsigned long code, short num, short denom)
00732 #else
00733 add_number(code, num, denom)
00734 unsigned long code;
00735 short num, denom;
00736 #endif
00737 {
00738     unsigned long i, j;
00739 
00740     /*
00741      * Insert the code in order.
00742      */
00743     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
00744 
00745     /*
00746      * Handle the case of the codes matching and simply replace the number
00747      * that was there before.
00748      */
00749     if (ncodes_used > 0 && code == ncodes[i].code) {
00750         ncodes[i].idx = make_number(num, denom);
00751         return;
00752     }
00753 
00754     /*
00755      * Resize the array if necessary.
00756      */
00757     if (ncodes_used == ncodes_size) {
00758         if (ncodes_size == 0)
00759           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
00760         else
00761           ncodes = (_codeidx_t *)
00762               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
00763 
00764         ncodes_size += 8;
00765     }
00766 
00767     /*
00768      * Shift things around to insert the code if necessary.
00769      */
00770     if (i < ncodes_used) {
00771         for (j = ncodes_used; j > i; j--) {
00772             ncodes[j].code = ncodes[j - 1].code;
00773             ncodes[j].idx = ncodes[j - 1].idx;
00774         }
00775     }
00776     ncodes[i].code = code;
00777     ncodes[i].idx = make_number(num, denom);
00778 
00779     ncodes_used++;
00780 }
00781 
00782 /*
00783  * This routine assumes that the line is a valid Unicode Character Database
00784  * entry.
00785  */
00786 static void
00787 #ifdef __STDC__
00788 read_cdata(FILE *in)
00789 #else
00790 read_cdata(in)
00791 FILE *in;
00792 #endif
00793 {
00794     unsigned long i, lineno, skip, code, ccl_code;
00795     short wnum, neg, number[2];
00796     char line[512], *s, *e;
00797 
00798     lineno = skip = 0;
00799     while (fscanf(in, "%[^\n]\n", line) != EOF) {
00800         lineno++;
00801 
00802         /*
00803          * Skip blank lines and lines that start with a '#'.
00804          */
00805         if (line[0] == 0 || line[0] == '#')
00806           continue;
00807 
00808         /*
00809          * If lines need to be skipped, do it here.
00810          */
00811         if (skip) {
00812             skip--;
00813             continue;
00814         }
00815 
00816         /*
00817          * Collect the code.  The code can be up to 6 hex digits in length to
00818          * allow surrogates to be specified.
00819          */
00820         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
00821             code <<= 4;
00822             if (*s >= '0' && *s <= '9')
00823               code += *s - '0';
00824             else if (*s >= 'A' && *s <= 'F')
00825               code += (*s - 'A') + 10;
00826             else if (*s >= 'a' && *s <= 'f')
00827               code += (*s - 'a') + 10;
00828         }
00829 
00830         /*
00831          * Handle the following special cases:
00832          * 1. 4E00-9FA5 CJK Ideographs.
00833          * 2. AC00-D7A3 Hangul Syllables.
00834          * 3. D800-DFFF Surrogates.
00835          * 4. E000-F8FF Private Use Area.
00836          * 5. F900-FA2D Han compatibility.
00837          */
00838         switch (code) {
00839           case 0x4e00:
00840             /*
00841              * The Han ideographs.
00842              */
00843             add_range(0x4e00, 0x9fff, "Lo", "L");
00844 
00845             /*
00846              * Add the characters to the defined category.
00847              */
00848             add_range(0x4e00, 0x9fa5, "Cp", 0);
00849 
00850             skip = 1;
00851             break;
00852           case 0xac00:
00853             /*
00854              * The Hangul syllables.
00855              */
00856             add_range(0xac00, 0xd7a3, "Lo", "L");
00857 
00858             /*
00859              * Add the characters to the defined category.
00860              */
00861             add_range(0xac00, 0xd7a3, "Cp", 0);
00862 
00863             skip = 1;
00864             break;
00865           case 0xd800:
00866             /*
00867              * Make a range of all surrogates and assume some default
00868              * properties.
00869              */
00870             add_range(0x010000, 0x10ffff, "Cs", "L");
00871             skip = 5;
00872             break;
00873           case 0xe000:
00874             /*
00875              * The Private Use area.  Add with a default set of properties.
00876              */
00877             add_range(0xe000, 0xf8ff, "Co", "L");
00878             skip = 1;
00879             break;
00880           case 0xf900:
00881             /*
00882              * The CJK compatibility area.
00883              */
00884             add_range(0xf900, 0xfaff, "Lo", "L");
00885 
00886             /*
00887              * Add the characters to the defined category.
00888              */
00889             add_range(0xf900, 0xfaff, "Cp", 0);
00890 
00891             skip = 1;
00892         }
00893 
00894         if (skip)
00895           continue;
00896 
00897         /*
00898          * Add the code to the defined category.
00899          */
00900         ordered_range_insert(code, "Cp", 2);
00901 
00902         /*
00903          * Locate the first character property field.
00904          */
00905         for (i = 0; *s != 0 && i < 2; s++) {
00906             if (*s == ';')
00907               i++;
00908         }
00909         for (e = s; *e && *e != ';'; e++) ;
00910     
00911         ordered_range_insert(code, s, e - s);
00912 
00913         /*
00914          * Locate the combining class code.
00915          */
00916         for (s = e; *s != 0 && i < 3; s++) {
00917             if (*s == ';')
00918               i++;
00919         }
00920 
00921         /*
00922          * Convert the combining class code from decimal.
00923          */
00924         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
00925           ccl_code = (ccl_code * 10) + (*e - '0');
00926 
00927         /*
00928          * Add the code if it not 0.
00929          */
00930         if (ccl_code != 0)
00931           ordered_ccl_insert(code, ccl_code);
00932 
00933         /*
00934          * Locate the second character property field.
00935          */
00936         for (s = e; *s != 0 && i < 4; s++) {
00937             if (*s == ';')
00938               i++;
00939         }
00940         for (e = s; *e && *e != ';'; e++) ;
00941 
00942         ordered_range_insert(code, s, e - s);
00943 
00944         /*
00945          * Check for a decomposition.
00946          */
00947         s = ++e;
00948         if (*s != ';' && *s != '<') {
00949             /*
00950              * Collect the codes of the decomposition.
00951              */
00952             for (dectmp_size = 0; *s != ';'; ) {
00953                 /*
00954                  * Skip all leading non-hex digits.
00955                  */
00956                 while (!ishdigit(*s))
00957                   s++;
00958 
00959                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
00960                     dectmp[dectmp_size] <<= 4;
00961                     if (*s >= '0' && *s <= '9')
00962                       dectmp[dectmp_size] += *s - '0';
00963                     else if (*s >= 'A' && *s <= 'F')
00964                       dectmp[dectmp_size] += (*s - 'A') + 10;
00965                     else if (*s >= 'a' && *s <= 'f')
00966                       dectmp[dectmp_size] += (*s - 'a') + 10;
00967                 }
00968                 dectmp_size++;
00969             }
00970 
00971             /*
00972              * If there is more than one code in the temporary decomposition
00973              * array, then add the character with its decomposition.
00974              */
00975             if (dectmp_size > 1)
00976               add_decomp(code);
00977         }
00978 
00979         /*
00980          * Skip to the number field.
00981          */
00982         for (i = 0; i < 3 && *s; s++) {
00983             if (*s == ';')
00984               i++;
00985         }
00986 
00987         /*
00988          * Scan the number in.
00989          */
00990         number[0] = number[1] = 0;
00991         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
00992             if (*e == '-') {
00993                 neg = 1;
00994                 continue;
00995             }
00996 
00997             if (*e == '/') {
00998                 /*
00999                  * Move the the denominator of the fraction.
01000                  */
01001                 if (neg)
01002                   number[wnum] *= -1;
01003                 neg = 0;
01004                 e++;
01005                 wnum++;
01006             }
01007             number[wnum] = (number[wnum] * 10) + (*e - '0');
01008         }
01009 
01010         if (e > s) {
01011             /*
01012              * Adjust the denominator in case of integers and add the number.
01013              */
01014             if (wnum == 0)
01015               number[1] = number[0];
01016 
01017             add_number(code, number[0], number[1]);
01018         }
01019 
01020         /*
01021          * Skip to the start of the possible case mappings.
01022          */
01023         for (s = e, i = 0; i < 4 && *s; s++) {
01024             if (*s == ';')
01025               i++;
01026         }
01027 
01028         /*
01029          * Collect the case mappings.
01030          */
01031         cases[0] = cases[1] = cases[2] = 0;
01032         for (i = 0; i < 3; i++) {
01033             while (ishdigit(*s)) {
01034                 cases[i] <<= 4;
01035                 if (*s >= '0' && *s <= '9')
01036                   cases[i] += *s - '0';
01037                 else if (*s >= 'A' && *s <= 'F')
01038                   cases[i] += (*s - 'A') + 10;
01039                 else if (*s >= 'a' && *s <= 'f')
01040                   cases[i] += (*s - 'a') + 10;
01041                 s++;
01042             }
01043             if (*s == ';')
01044               s++;
01045         }
01046         if (cases[0] && cases[1])
01047           /*
01048            * Add the upper and lower mappings for a title case character.
01049            */
01050           add_title(code);
01051         else if (cases[1])
01052           /*
01053            * Add the lower and title case mappings for the upper case
01054            * character.
01055            */
01056           add_upper(code);
01057         else if (cases[0])
01058           /*
01059            * Add the upper and title case mappings for the lower case
01060            * character.
01061            */
01062           add_lower(code);
01063     }
01064 }
01065 
01066 static _decomp_t *
01067 #ifdef __STDC__
01068 find_decomp(unsigned long code)
01069 #else
01070 find_decomp(code)
01071 unsigned long code;
01072 #endif
01073 {
01074     long l, r, m;
01075 
01076     l = 0;
01077     r = decomps_used - 1;
01078     while (l <= r) {
01079         m = (l + r) >> 1;
01080         if (code > decomps[m].code)
01081           l = m + 1;
01082         else if (code < decomps[m].code)
01083           r = m - 1;
01084         else
01085           return &decomps[m];
01086     }
01087     return 0;
01088 }
01089 
01090 static void
01091 #ifdef __STDC__
01092 decomp_it(_decomp_t *d)
01093 #else
01094 decomp_it(d)
01095 _decomp_t *d;
01096 #endif
01097 {
01098     unsigned long i;
01099     _decomp_t *dp;
01100 
01101     for (i = 0; i < d->used; i++) {
01102         if ((dp = find_decomp(d->decomp[i])) != 0)
01103           decomp_it(dp);
01104         else
01105           dectmp[dectmp_size++] = d->decomp[i];
01106     }
01107 }
01108 
01109 /*
01110  * Expand all decompositions by recursively decomposing each character
01111  * in the decomposition.
01112  */
01113 static void
01114 #ifdef __STDC__
01115 expand_decomp(void)
01116 #else
01117 expand_decomp()
01118 #endif
01119 {
01120     unsigned long i;
01121 
01122     for (i = 0; i < decomps_used; i++) {
01123         dectmp_size = 0;
01124         decomp_it(&decomps[i]);
01125         if (dectmp_size > 0)
01126           add_decomp(decomps[i].code);
01127     }
01128 }
01129 
01130 static void
01131 #ifdef __STDC__
01132 write_cdata(char *opath)
01133 #else
01134 write_cdata(opath)
01135 char *opath;
01136 #endif
01137 {
01138     FILE *out;
01139     unsigned long i, idx, bytes, nprops;
01140     unsigned short casecnt[2];
01141     char path[BUFSIZ];
01142 
01143     /*****************************************************************
01144      *
01145      * Generate the ctype data.
01146      *
01147      *****************************************************************/
01148 
01149     /*
01150      * Open the ctype.dat file.
01151      */
01152     sprintf(path, "%s/ctype.dat", opath);
01153     if ((out = fopen(path, "wb")) == 0)
01154       return;
01155 
01156     /*
01157      * Collect the offsets for the properties.  The offsets array is
01158      * on a 4-byte boundary to keep things efficient for architectures
01159      * that need such a thing.
01160      */
01161     for (i = idx = 0; i < NUMPROPS; i++) {
01162         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
01163         idx += proptbl[i].used;
01164     }
01165 
01166     /*
01167      * Add the sentinel index which is used by the binary search as the upper
01168      * bound for a search.
01169      */
01170     propcnt[i] = idx;
01171 
01172     /*
01173      * Record the actual number of property lists.  This may be different than
01174      * the number of offsets actually written because of aligning on a 4-byte
01175      * boundary.
01176      */
01177     hdr[1] = NUMPROPS;
01178 
01179     /*
01180      * Calculate the byte count needed and pad the property counts array to a
01181      * 4-byte boundary.
01182      */
01183     if ((bytes = sizeof(unsigned short) * (NUMPROPS + 1)) & 3)
01184       bytes += 4 - (bytes & 3);
01185     nprops = bytes / sizeof(unsigned short);
01186     bytes += sizeof(unsigned long) * idx;
01187         
01188     /*
01189      * Write the header.
01190      */
01191     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
01192 
01193     /*
01194      * Write the byte count.
01195      */
01196     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
01197 
01198     /*
01199      * Write the property list counts.
01200      */
01201     fwrite((char *) propcnt, sizeof(unsigned short), nprops, out);
01202 
01203     /*
01204      * Write the property lists.
01205      */
01206     for (i = 0; i < NUMPROPS; i++) {
01207         if (proptbl[i].used > 0)
01208           fwrite((char *) proptbl[i].ranges, sizeof(unsigned long),
01209                  proptbl[i].used, out);
01210     }
01211 
01212     fclose(out);
01213 
01214     /*****************************************************************
01215      *
01216      * Generate the case mapping data.
01217      *
01218      *****************************************************************/
01219 
01220     /*
01221      * Open the case.dat file.
01222      */
01223     sprintf(path, "%s/case.dat", opath);
01224     if ((out = fopen(path, "wb")) == 0)
01225       return;
01226 
01227     /*
01228      * Write the case mapping tables.
01229      */
01230     hdr[1] = upper_used + lower_used + title_used;
01231     casecnt[0] = upper_used;
01232     casecnt[1] = lower_used;
01233 
01234     /*
01235      * Write the header.
01236      */
01237     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
01238 
01239     /*
01240      * Write the upper and lower case table sizes.
01241      */
01242     fwrite((char *) casecnt, sizeof(unsigned short), 2, out);
01243 
01244     if (upper_used > 0)
01245       /*
01246        * Write the upper case table.
01247        */
01248       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
01249 
01250     if (lower_used > 0)
01251       /*
01252        * Write the lower case table.
01253        */
01254       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
01255 
01256     if (title_used > 0)
01257       /*
01258        * Write the title case table.
01259        */
01260       fwrite((char *) title, sizeof(_case_t), title_used, out);
01261 
01262     fclose(out);
01263 
01264     /*****************************************************************
01265      *
01266      * Generate the decomposition data.
01267      *
01268      *****************************************************************/
01269 
01270     /*
01271      * Fully expand all decompositions before generating the output file.
01272      */
01273     expand_decomp();
01274 
01275     /*
01276      * Open the decomp.dat file.
01277      */
01278     sprintf(path, "%s/decomp.dat", opath);
01279     if ((out = fopen(path, "wb")) == 0)
01280       return;
01281 
01282     hdr[1] = decomps_used;
01283 
01284     /*
01285      * Write the header.
01286      */
01287     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
01288 
01289     /*
01290      * Write a temporary byte count which will be calculated as the
01291      * decompositions are written out.
01292      */
01293     bytes = 0;
01294     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
01295 
01296     if (decomps_used) {
01297         /*
01298          * Write the list of decomp nodes.
01299          */
01300         for (i = idx = 0; i < decomps_used; i++) {
01301             fwrite((char *) &decomps[i].code, sizeof(unsigned long), 1, out);
01302             fwrite((char *) &idx, sizeof(unsigned long), 1, out);
01303             idx += decomps[i].used;
01304         }
01305 
01306         /*
01307          * Write the sentinel index as the last decomp node.
01308          */
01309         fwrite((char *) &idx, sizeof(unsigned long), 1, out);
01310 
01311         /*
01312          * Write the decompositions themselves.
01313          */
01314         for (i = 0; i < decomps_used; i++)
01315           fwrite((char *) decomps[i].decomp, sizeof(unsigned long),
01316                  decomps[i].used, out);
01317 
01318         /*
01319          * Seek back to the beginning and write the byte count.
01320          */
01321         bytes = (sizeof(unsigned long) * idx) +
01322             (sizeof(unsigned long) * ((hdr[1] << 1) + 1));
01323         fseek(out, sizeof(unsigned short) << 1, 0L);
01324         fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
01325 
01326         fclose(out);
01327     }
01328 
01329     /*****************************************************************
01330      *
01331      * Generate the combining class data.
01332      *
01333      *****************************************************************/
01334 
01335     /*
01336      * Open the cmbcl.dat file.
01337      */
01338     sprintf(path, "%s/cmbcl.dat", opath);
01339     if ((out = fopen(path, "wb")) == 0)
01340       return;
01341 
01342     /*
01343      * Set the number of ranges used.  Each range has a combining class which
01344      * means each entry is a 3-tuple.
01345      */
01346     hdr[1] = ccl_used / 3;
01347 
01348     /*
01349      * Write the header.
01350      */
01351     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
01352 
01353     /*
01354      * Write out the byte count to maintain header size.
01355      */
01356     bytes = ccl_used * sizeof(unsigned long);
01357     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
01358 
01359     if (ccl_used > 0)
01360       /*
01361        * Write the combining class ranges out.
01362        */
01363       fwrite((char *) ccl, sizeof(unsigned long), ccl_used, out);
01364 
01365     fclose(out);
01366 
01367     /*****************************************************************
01368      *
01369      * Generate the number data.
01370      *
01371      *****************************************************************/
01372 
01373     /*
01374      * Open the num.dat file.
01375      */
01376     sprintf(path, "%s/num.dat", opath);
01377     if ((out = fopen(path, "wb")) == 0)
01378       return;
01379 
01380     /*
01381      * The count part of the header will be the total number of codes that
01382      * have numbers.
01383      */
01384     hdr[1] = (unsigned short) (ncodes_used << 1);
01385     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
01386 
01387     /*
01388      * Write the header.
01389      */
01390     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
01391 
01392     /*
01393      * Write out the byte count to maintain header size.
01394      */
01395     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
01396 
01397     /*
01398      * Now, if number mappings exist, write them out.
01399      */
01400     if (ncodes_used > 0) {
01401         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
01402         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
01403     }
01404 
01405     fclose(out);
01406 }
01407 
/*
 * Program entry point.
 *
 * Arguments are processed left to right.  An argument starting with "-o"
 * makes the following argument the output directory; every other argument
 * is opened as an input file and passed to read_cdata.  A file that cannot
 * be opened is reported on stderr and skipped.  Input is taken only from
 * files named on the command line.
 *
 * After all arguments have been handled the tables are written with
 * write_cdata, using "." when no output directory was given, and the
 * program exits with status 0.
 */
int
#ifdef __STDC__
main(int argc, char *argv[])
#else
main(argc, argv)
int argc;
char *argv[];
#endif
{
    FILE *in;
    char *prog, *opath;

    /*
     * Use the last path component of argv[0] in diagnostics.
     */
    if ((prog = strrchr(argv[0], '/')) != 0)
      prog++;
    else
      prog = argv[0];

    opath = 0;

    argc--;
    argv++;

    while (argc > 0) {
        if (argv[0][0] == '-' && argv[0][1] == 'o') {
            /*
             * The next argument is the output path.  If "-o" is the last
             * argument, argv[0] is the terminating null pointer here and
             * the default path is used below.
             */
            argc--;
            argv++;
            opath = argv[0];
        } else {
            /*
             * Each input stream is opened and closed within this branch,
             * so there is never a stream left over from an earlier
             * argument that would need closing first.
             */
            if ((in = fopen(argv[0], "rb")) == 0)
              fprintf(stderr, "%s: unable to open ctype file %s\n",
                      prog, argv[0]);
            else {
                read_cdata(in);
                fclose(in);
            }
        }
        argc--;
        argv++;
    }

    if (opath == 0)
      opath = ".";
    write_cdata(opath);

    exit(0);

    /* Not reached; keeps compilers that do not know exit() quiet. */
    return 0;
}