Back to index

tetex-bin  3.0
CharCodeToUnicode.cc
Go to the documentation of this file.
00001 //========================================================================
00002 //
00003 // CharCodeToUnicode.cc
00004 //
00005 // Copyright 2001-2003 Glyph & Cog, LLC
00006 //
00007 //========================================================================
00008 
00009 #include <aconf.h>
00010 
00011 #ifdef USE_GCC_PRAGMAS
00012 #pragma implementation
00013 #endif
00014 
00015 #include <stdio.h>
00016 #include <string.h>
00017 #include "gmem.h"
00018 #include "gfile.h"
00019 #include "GString.h"
00020 #include "Error.h"
00021 #include "GlobalParams.h"
00022 #include "PSTokenizer.h"
00023 #include "CharCodeToUnicode.h"
00024 
00025 //------------------------------------------------------------------------
00026 
// Capacity of the u[] array in CharCodeToUnicodeString, i.e. the most
// Unicode values that can be stored for a single character code.
#define maxUnicodeString 8

// One entry of the multi-value mapping table (sMap): a character code
// together with the sequence of Unicode values it maps to.
struct CharCodeToUnicodeString {
  CharCode c;                    // character code this entry belongs to
  Unicode u[maxUnicodeString];   // Unicode values for that code
  int len;                       // number of values of u[] in use
};
00034 
00035 //------------------------------------------------------------------------
00036 
// Character-source callback (passed to PSTokenizer via parseCMap1) that
// reads from a NUL-terminated string held in memory.
//
// <data> points to a (char *) cursor.  Each call returns the character
// under the cursor and advances the cursor by one; once the terminating
// NUL is reached the cursor stays put and EOF is returned.
//
// Characters are returned as values in the range 0..255, exactly as
// fgetc() does for getCharFromFile().
static int getCharFromString(void *data) {
  char **cursor = (char **)data;
  char *p = *cursor;
  int c;

  if (*p) {
    // Go through unsigned char: where plain char is signed, a byte
    // >= 0x80 would otherwise be returned as a negative value, and 0xFF
    // would be indistinguishable from EOF and end the input early.
    c = (unsigned char)*p++;
    *cursor = p;
  } else {
    c = EOF;
  }
  return c;
}
00050 
// Character-source callback that reads from a stdio stream: <data> is
// the FILE * to read.  Returns whatever fgetc() returns for it (the next
// character, or EOF).
static int getCharFromFile(void *data) {
  FILE *stream = (FILE *)data;

  return fgetc(stream);
}
00054 
00055 //------------------------------------------------------------------------
00056 
00057 CharCodeToUnicode *CharCodeToUnicode::parseCIDToUnicode(GString *fileName,
00058                                                  GString *collection) {
00059   FILE *f;
00060   Unicode *mapA;
00061   CharCode size, mapLenA;
00062   char buf[64];
00063   Unicode u;
00064   CharCodeToUnicode *ctu;
00065 
00066   if (!(f = fopen(fileName->getCString(), "r"))) {
00067     error(-1, "Couldn't open cidToUnicode file '%s'",
00068          fileName->getCString());
00069     return NULL;
00070   }
00071 
00072   size = 32768;
00073   mapA = (Unicode *)gmalloc(size * sizeof(Unicode));
00074   mapLenA = 0;
00075 
00076   while (getLine(buf, sizeof(buf), f)) {
00077     if (mapLenA == size) {
00078       size *= 2;
00079       mapA = (Unicode *)grealloc(mapA, size * sizeof(Unicode));
00080     }
00081     if (sscanf(buf, "%x", &u) == 1) {
00082       mapA[mapLenA] = u;
00083     } else {
00084       error(-1, "Bad line (%d) in cidToUnicode file '%s'",
00085            (int)(mapLenA + 1), fileName->getCString());
00086       mapA[mapLenA] = 0;
00087     }
00088     ++mapLenA;
00089   }
00090   fclose(f);
00091 
00092   ctu = new CharCodeToUnicode(collection->copy(), mapA, mapLenA, gTrue,
00093                            NULL, 0, 0);
00094   gfree(mapA);
00095   return ctu;
00096 }
00097 
00098 CharCodeToUnicode *CharCodeToUnicode::parseUnicodeToUnicode(
00099                                               GString *fileName) {
00100   FILE *f;
00101   Unicode *mapA;
00102   CharCodeToUnicodeString *sMapA;
00103   CharCode size, oldSize, len, sMapSizeA, sMapLenA;
00104   char buf[256];
00105   char *tok;
00106   Unicode u0;
00107   Unicode uBuf[maxUnicodeString];
00108   CharCodeToUnicode *ctu;
00109   int line, n, i;
00110 
00111   if (!(f = fopen(fileName->getCString(), "r"))) {
00112     error(-1, "Couldn't open unicodeToUnicode file '%s'",
00113          fileName->getCString());
00114     return NULL;
00115   }
00116 
00117   size = 4096;
00118   mapA = (Unicode *)gmalloc(size * sizeof(Unicode));
00119   memset(mapA, 0, size * sizeof(Unicode));
00120   len = 0;
00121   sMapA = NULL;
00122   sMapSizeA = sMapLenA = 0;
00123 
00124   line = 0;
00125   while (getLine(buf, sizeof(buf), f)) {
00126     ++line;
00127     if (!(tok = strtok(buf, " \t\r\n")) ||
00128        sscanf(tok, "%x", &u0) != 1) {
00129       error(-1, "Bad line (%d) in unicodeToUnicode file '%s'",
00130            line, fileName->getCString());
00131       continue;
00132     }
00133     n = 0;
00134     while (n < maxUnicodeString) {
00135       if (!(tok = strtok(NULL, " \t\r\n"))) {
00136        break;
00137       }
00138       if (sscanf(tok, "%x", &uBuf[n]) != 1) {
00139        error(-1, "Bad line (%d) in unicodeToUnicode file '%s'",
00140              line, fileName->getCString());
00141        break;
00142       }
00143       ++n;
00144     }
00145     if (n < 1) {
00146       error(-1, "Bad line (%d) in unicodeToUnicode file '%s'",
00147            line, fileName->getCString());
00148       continue;
00149     }
00150     if (u0 >= size) {
00151       oldSize = size;
00152       while (u0 >= size) {
00153        size *= 2;
00154       }
00155       mapA = (Unicode *)grealloc(mapA, size * sizeof(Unicode));
00156       memset(mapA + oldSize, 0, (size - oldSize) * sizeof(Unicode));
00157     }
00158     if (n == 1) {
00159       mapA[u0] = uBuf[0];
00160     } else {
00161       mapA[u0] = 0;
00162       if (sMapLenA == sMapSizeA) {
00163        sMapSizeA += 16;
00164        sMapA = (CharCodeToUnicodeString *)
00165                  grealloc(sMapA, sMapSizeA * sizeof(CharCodeToUnicodeString));
00166       }
00167       sMapA[sMapLenA].c = u0;
00168       for (i = 0; i < n; ++i) {
00169        sMapA[sMapLenA].u[i] = uBuf[i];
00170       }
00171       sMapA[sMapLenA].len = n;
00172       ++sMapLenA;
00173     }
00174     if (u0 >= len) {
00175       len = u0 + 1;
00176     }
00177   }
00178   fclose(f);
00179 
00180   ctu = new CharCodeToUnicode(fileName->copy(), mapA, len, gTrue,
00181                            sMapA, sMapLenA, sMapSizeA);
00182   gfree(mapA);
00183   return ctu;
00184 }
00185 
00186 CharCodeToUnicode *CharCodeToUnicode::make8BitToUnicode(Unicode *toUnicode) {
00187   return new CharCodeToUnicode(NULL, toUnicode, 256, gTrue, NULL, 0, 0);
00188 }
00189 
00190 CharCodeToUnicode *CharCodeToUnicode::parseCMap(GString *buf, int nBits) {
00191   CharCodeToUnicode *ctu;
00192   char *p;
00193 
00194   ctu = new CharCodeToUnicode(NULL);
00195   p = buf->getCString();
00196   ctu->parseCMap1(&getCharFromString, &p, nBits);
00197   return ctu;
00198 }
00199 
00200 void CharCodeToUnicode::mergeCMap(GString *buf, int nBits) {
00201   char *p;
00202 
00203   p = buf->getCString();
00204   parseCMap1(&getCharFromString, &p, nBits);
00205 }
00206 
00207 void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data,
00208                                int nBits) {
00209   PSTokenizer *pst;
00210   char tok1[256], tok2[256], tok3[256];
00211   int nDigits, n1, n2, n3;
00212   CharCode i;
00213   CharCode code1, code2;
00214   GString *name;
00215   FILE *f;
00216 
00217   nDigits = nBits / 4;
00218   pst = new PSTokenizer(getCharFunc, data);
00219   pst->getToken(tok1, sizeof(tok1), &n1);
00220   while (pst->getToken(tok2, sizeof(tok2), &n2)) {
00221     if (!strcmp(tok2, "usecmap")) {
00222       if (tok1[0] == '/') {
00223        name = new GString(tok1 + 1);
00224        if ((f = globalParams->findToUnicodeFile(name))) {
00225          parseCMap1(&getCharFromFile, f, nBits);
00226          fclose(f);
00227        } else {
00228          error(-1, "Couldn't find ToUnicode CMap file for '%s'",
00229               name->getCString());
00230        }
00231        delete name;
00232       }
00233       pst->getToken(tok1, sizeof(tok1), &n1);
00234     } else if (!strcmp(tok2, "beginbfchar")) {
00235       while (pst->getToken(tok1, sizeof(tok1), &n1)) {
00236        if (!strcmp(tok1, "endbfchar")) {
00237          break;
00238        }
00239        if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
00240            !strcmp(tok2, "endbfchar")) {
00241          error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00242          break;
00243        }
00244        if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
00245              tok2[0] == '<' && tok2[n2 - 1] == '>')) {
00246          error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00247          continue;
00248        }
00249        tok1[n1 - 1] = tok2[n2 - 1] = '\0';
00250        if (sscanf(tok1 + 1, "%x", &code1) != 1) {
00251          error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00252          continue;
00253        }
00254        addMapping(code1, tok2 + 1, n2 - 1, 0);
00255       }
00256       pst->getToken(tok1, sizeof(tok1), &n1);
00257     } else if (!strcmp(tok2, "beginbfrange")) {
00258       while (pst->getToken(tok1, sizeof(tok1), &n1)) {
00259        if (!strcmp(tok1, "endbfrange")) {
00260          break;
00261        }
00262        if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
00263            !strcmp(tok2, "endbfrange") ||
00264            !pst->getToken(tok3, sizeof(tok3), &n3) ||
00265            !strcmp(tok3, "endbfrange")) {
00266          error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00267          break;
00268        }
00269        if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
00270              n2 == 2 + nDigits && tok2[0] == '<' && tok2[n2 - 1] == '>')) {
00271          error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00272          continue;
00273        }
00274        tok1[n1 - 1] = tok2[n2 - 1] = '\0';
00275        if (sscanf(tok1 + 1, "%x", &code1) != 1 ||
00276            sscanf(tok2 + 1, "%x", &code2) != 1) {
00277          error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00278          continue;
00279        }
00280        if (!strcmp(tok3, "[")) {
00281          i = 0;
00282          while (pst->getToken(tok1, sizeof(tok1), &n1) &&
00283                code1 + i <= code2) {
00284            if (!strcmp(tok1, "]")) {
00285              break;
00286            }
00287            if (tok1[0] == '<' && tok1[n1 - 1] == '>') {
00288              tok1[n1 - 1] = '\0';
00289              addMapping(code1 + i, tok1 + 1, n1 - 2, 0);
00290            } else {
00291              error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00292            }
00293            ++i;
00294          }
00295        } else if (tok3[0] == '<' && tok3[n3 - 1] == '>') {
00296          tok3[n3 - 1] = '\0';
00297          for (i = 0; code1 <= code2; ++code1, ++i) {
00298            addMapping(code1, tok3 + 1, n3 - 2, i);
00299          }
00300 
00301        } else {
00302          error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00303        }
00304       }
00305       pst->getToken(tok1, sizeof(tok1), &n1);
00306     } else {
00307       strcpy(tok1, tok2);
00308     }
00309   }
00310   delete pst;
00311 }
00312 
00313 void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n,
00314                                int offset) {
00315   CharCode oldLen, i;
00316   Unicode u;
00317   char uHex[5];
00318   int j;
00319 
00320   if (code >= mapLen) {
00321     oldLen = mapLen;
00322     mapLen = (code + 256) & ~255;
00323     map = (Unicode *)grealloc(map, mapLen * sizeof(Unicode));
00324     for (i = oldLen; i < mapLen; ++i) {
00325       map[i] = 0;
00326     }
00327   }
00328   if (n <= 4) {
00329     if (sscanf(uStr, "%x", &u) != 1) {
00330       error(-1, "Illegal entry in ToUnicode CMap");
00331       return;
00332     }
00333     map[code] = u + offset;
00334   } else {
00335     if (sMapLen >= sMapSize) {
00336       sMapSize = sMapSize + 16;
00337       sMap = (CharCodeToUnicodeString *)
00338               grealloc(sMap, sMapSize * sizeof(CharCodeToUnicodeString));
00339     }
00340     map[code] = 0;
00341     sMap[sMapLen].c = code;
00342     sMap[sMapLen].len = n / 4;
00343     for (j = 0; j < sMap[sMapLen].len && j < maxUnicodeString; ++j) {
00344       strncpy(uHex, uStr + j*4, 4);
00345       uHex[4] = '\0';
00346       if (sscanf(uHex, "%x", &sMap[sMapLen].u[j]) != 1) {
00347        error(-1, "Illegal entry in ToUnicode CMap");
00348       }
00349     }
00350     sMap[sMapLen].u[sMap[sMapLen].len - 1] += offset;
00351     ++sMapLen;
00352   }
00353 }
00354 
00355 CharCodeToUnicode::CharCodeToUnicode(GString *tagA) {
00356   CharCode i;
00357 
00358   tag = tagA;
00359   mapLen = 256;
00360   map = (Unicode *)gmalloc(mapLen * sizeof(Unicode));
00361   for (i = 0; i < mapLen; ++i) {
00362     map[i] = 0;
00363   }
00364   sMap = NULL;
00365   sMapLen = sMapSize = 0;
00366   refCnt = 1;
00367 #if MULTITHREADED
00368   gInitMutex(&mutex);
00369 #endif
00370 }
00371 
00372 CharCodeToUnicode::CharCodeToUnicode(GString *tagA, Unicode *mapA,
00373                                  CharCode mapLenA, GBool copyMap,
00374                                  CharCodeToUnicodeString *sMapA,
00375                                  int sMapLenA, int sMapSizeA) {
00376   tag = tagA;
00377   mapLen = mapLenA;
00378   if (copyMap) {
00379     map = (Unicode *)gmalloc(mapLen * sizeof(Unicode));
00380     memcpy(map, mapA, mapLen * sizeof(Unicode));
00381   } else {
00382     map = mapA;
00383   }
00384   sMap = sMapA;
00385   sMapLen = sMapLenA;
00386   sMapSize = sMapSizeA;
00387   refCnt = 1;
00388 #if MULTITHREADED
00389   gInitMutex(&mutex);
00390 #endif
00391 }
00392 
00393 CharCodeToUnicode::~CharCodeToUnicode() {
00394   if (tag) {
00395     delete tag;
00396   }
00397   gfree(map);
00398   if (sMap) {
00399     gfree(sMap);
00400   }
00401 #if MULTITHREADED
00402   gDestroyMutex(&mutex);
00403 #endif
00404 }
00405 
00406 void CharCodeToUnicode::incRefCnt() {
00407 #if MULTITHREADED
00408   gLockMutex(&mutex);
00409 #endif
00410   ++refCnt;
00411 #if MULTITHREADED
00412   gUnlockMutex(&mutex);
00413 #endif
00414 }
00415 
00416 void CharCodeToUnicode::decRefCnt() {
00417   GBool done;
00418 
00419 #if MULTITHREADED
00420   gLockMutex(&mutex);
00421 #endif
00422   done = --refCnt == 0;
00423 #if MULTITHREADED
00424   gUnlockMutex(&mutex);
00425 #endif
00426   if (done) {
00427     delete this;
00428   }
00429 }
00430 
00431 GBool CharCodeToUnicode::match(GString *tagA) {
00432   return tag && !tag->cmp(tagA);
00433 }
00434 
00435 void CharCodeToUnicode::setMapping(CharCode c, Unicode *u, int len) {
00436   int i;
00437 
00438   if (len == 1) {
00439     map[c] = u[0];
00440   } else {
00441     map[c] = 0;
00442     if (sMapLen == sMapSize) {
00443       sMapSize += 8;
00444       sMap = (CharCodeToUnicodeString *)
00445               grealloc(sMap, sMapSize * sizeof(CharCodeToUnicodeString));
00446     }
00447     sMap[sMapLen].c = c;
00448     sMap[sMapLen].len = len;
00449     for (i = 0; i < len && i < maxUnicodeString; ++i) {
00450       sMap[sMapLen].u[i] = u[i];
00451     }
00452     ++sMapLen;
00453   }
00454 }
00455 
00456 int CharCodeToUnicode::mapToUnicode(CharCode c, Unicode *u, int size) {
00457   int i, j;
00458 
00459   if (c >= mapLen) {
00460     return 0;
00461   }
00462   if (map[c]) {
00463     u[0] = map[c];
00464     return 1;
00465   }
00466   for (i = 0; i < sMapLen; ++i) {
00467     if (sMap[i].c == c) {
00468       for (j = 0; j < sMap[i].len && j < size; ++j) {
00469        u[j] = sMap[i].u[j];
00470       }
00471       return j;
00472     }
00473   }
00474   return 0;
00475 }
00476 
00477 //------------------------------------------------------------------------
00478 
00479 CharCodeToUnicodeCache::CharCodeToUnicodeCache(int sizeA) {
00480   int i;
00481 
00482   size = sizeA;
00483   cache = (CharCodeToUnicode **)gmalloc(size * sizeof(CharCodeToUnicode *));
00484   for (i = 0; i < size; ++i) {
00485     cache[i] = NULL;
00486   }
00487 }
00488 
00489 CharCodeToUnicodeCache::~CharCodeToUnicodeCache() {
00490   int i;
00491 
00492   for (i = 0; i < size; ++i) {
00493     if (cache[i]) {
00494       cache[i]->decRefCnt();
00495     }
00496   }
00497   gfree(cache);
00498 }
00499 
00500 CharCodeToUnicode *CharCodeToUnicodeCache::getCharCodeToUnicode(GString *tag) {
00501   CharCodeToUnicode *ctu;
00502   int i, j;
00503 
00504   if (cache[0] && cache[0]->match(tag)) {
00505     cache[0]->incRefCnt();
00506     return cache[0];
00507   }
00508   for (i = 1; i < size; ++i) {
00509     if (cache[i] && cache[i]->match(tag)) {
00510       ctu = cache[i];
00511       for (j = i; j >= 1; --j) {
00512        cache[j] = cache[j - 1];
00513       }
00514       cache[0] = ctu;
00515       ctu->incRefCnt();
00516       return ctu;
00517     }
00518   }
00519   return NULL;
00520 }
00521 
00522 void CharCodeToUnicodeCache::add(CharCodeToUnicode *ctu) {
00523   int i;
00524 
00525   if (cache[size - 1]) {
00526     cache[size - 1]->decRefCnt();
00527   }
00528   for (i = size - 1; i >= 1; --i) {
00529     cache[i] = cache[i - 1];
00530   }
00531   cache[0] = ctu;
00532   ctu->incRefCnt();
00533 }