Back to index

tetex-bin  3.0
Classes | Functions
encodings.h File Reference
#include "xdvi-config.h"
#include "xdvi.h"
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  adobe2unicode
struct  unicode2adobe

Functions

uint32_t get_accented_glyph (uint32_t accent, uint32_t base_glyph)
const char * unicode2adobe_name (uint32_t unicode)
uint32_t adobe2unicode_name (const char *adobe_name)
uint32_t guess_encoding (wide_ubyte ch, const char *fontname, char *retbuf)
const char * expand_ligature (uint32_t unicode)
const char * search_normalize_chars (uint32_t unicode)
Boolean utf8_lowercase (char *utf8)
int utf8_to_ucs4 (const char *utf8, uint32_t *ucs4, size_t len)
void ucs4_to_utf8 (uint32_t ucs4, char *utf8, size_t *len, Boolean do_lowercase)
unsigned char utf8_to_iso_8859_1 (const char *utf8, size_t *len)
char * str_utf8_to_iso_8859_1 (const char *utf8)
void iso_8859_1_to_utf8 (unsigned char iso_8859_1, char *utf8, size_t *len)
int str_iso_8859_1_to_utf8 (const char *latin1, char *utf8, size_t len)
Boolean is_hyphenchar (uint32_t unicode)
Boolean is_ideograph (uint32_t unicode)
void close_iconv (void)

Class Documentation

struct adobe2unicode

Definition at line 29 of file encodings.h.

Class Members
const char * adobe_name
uint32_t unicode
struct unicode2adobe

Definition at line 34 of file encodings.h.

Class Members
const char * adobe_name
uint32_t unicode

Function Documentation

uint32_t adobe2unicode_name ( const char *  adobe_name)

Definition at line 2965 of file encodings.c.

{
    /*
     * Map an Adobe glyph name to a Unicode code point.
     *
     * Names starting with "cjk" are treated as 'cjkXXXX', where XXXX is the
     * hex value of a two-byte GBK/GB18030 character; they are converted via
     * cjk2unicode() when iconv support is compiled in.  All other names are
     * looked up in adobe2unicode_table with bsearch().
     *
     * Returns the code point, or 0 if the name is unknown, malformed, or
     * cannot be converted.
     */
    struct adobe2unicode search_item;
    struct adobe2unicode *match_item;

    /* strncmp() stops at the terminator, so a name shorter than three bytes
       is never read past its end (memcmp() gave no such guarantee). */
    if (strncmp(adobe_name, "cjk", 3) == 0) {
#if HAVE_ICONV_H
        /* Special case for CJK fonts (Chinese) - ZLB: the Adobe names in
         * the Chinese T1 fonts are of the form 'cjkXXXX' where 'XXXX' are
         * the hex number of the GBK/GB18030 encoding */
        unsigned char cjk[2], xx[3];

        /* the four hex digits must actually be present before we index them */
        if (strlen(adobe_name) < 7)
            return 0;

        xx[0] = adobe_name[3];
        xx[1] = adobe_name[4];
        xx[2] = '\0';
        cjk[0] = strtoul((char *)xx, NULL, 16);
        xx[0] = adobe_name[5];
        xx[1] = adobe_name[6];
        cjk[1] = strtoul((char *)xx, NULL, 16);
        /* convert GBK ==> unicode */
        return cjk2unicode(cjk);
#else /* HAVE_ICONV_H */
        /* warn only once per run */
        if (!warned_about_cjk) {
            popup_message(globals.widgets.top_level,
                          MSG_WARN, NULL, "This version of xdvi has been compiled without iconv support - "
                          "cannot convert CJK character to UTF-8");
            warned_about_cjk = True;
        }
        return 0;
#endif /* HAVE_ICONV_H */
    }
    else {
        /* only the name field takes part in the comparison key */
        search_item.adobe_name = adobe_name;

        match_item = bsearch(&search_item, adobe2unicode_table,
                             sizeof adobe2unicode_table / sizeof adobe2unicode_table[0],
                             sizeof adobe2unicode_table[0],
                             adobe_name_cmp);
        if (match_item != NULL)
            return match_item->unicode;
        else
            return 0;
    }
}

Here is the call graph for this function:

Here is the caller graph for this function:

void close_iconv ( void  )

Definition at line 2912 of file encodings.c.

{
    /*
     * Close the iconv descriptor held in m_iconv_gb_ucs4 (if it is open) and
     * reset it to the "not open" value (iconv_t)(-1).  Compiles to an empty
     * function when HAVE_ICONV_H is not set.
     */
#if HAVE_ICONV_H
    if (m_iconv_gb_ucs4 == (iconv_t)(-1))
        return; /* nothing to release */

    iconv_close(m_iconv_gb_ucs4);
    m_iconv_gb_ucs4 = (iconv_t)(-1);
#endif /* HAVE_ICONV_H */
}

Here is the call graph for this function:

Here is the caller graph for this function:

const char* expand_ligature ( uint32_t  unicode)

Definition at line 3640 of file encodings.c.

{
    /*
     * Return an ASCII replacement string for the ligature / punctuation
     * character `unicode', or NULL if this function has no replacement for it.
     * The returned pointer refers to a string literal and must not be freed.
     */
    const char *expansion;

    switch (unicode) {
    /* f-ligatures and friends */
    case 0xFB00: expansion = "ff";  break;
    case 0xFB01: expansion = "fi";  break;
    case 0xFB02: expansion = "fl";  break;
    case 0xFB03: expansion = "ffi"; break;
    case 0xFB04: expansion = "ffl"; break;
    case 0xFB06: expansion = "st";  break;
    case 0x0133: expansion = "ij";  break;
    /* dashes */
    case 0x2013: expansion = "--";  break;
    case 0x2014: expansion = "---"; break;
    /* angle quotes / brackets */
    case 0x2039:
    case 0x2329: expansion = "<";   break;
    case 0x203A:
    case 0x232A: expansion = ">";   break;
    /* quotation marks */
    case 0x2018: expansion = "`";   break;
    case 0x2019: expansion = "'";   break;
    case 0x201C: expansion = "``";  break;
    case 0x201D: expansion = "''";  break;
    case 0x2026: expansion = "..."; break;
    case 0x10ff28: expansion = "{"; break;
    case 0x10ff29: expansion = "}"; break;
    default:
        /* no replacement known; nothing to trace */
        return NULL;
    }

    TRACE_FIND((stderr, "expand_ligature: 0x%X --> `%s'",
                (unsigned int)unicode, expansion));
    return expansion;
}

Here is the caller graph for this function:

uint32_t get_accented_glyph ( uint32_t  accent,
uint32_t  base_glyph 
)

Definition at line 3231 of file encodings.c.

{
    /*
     * Combine a spacing accent with a base letter.
     *
     * `accent' and `base_glyph' are Unicode code points.  Returns the code
     * point of the precomposed character, or 0 if the pair is not in the
     * tables below.  The last four outer cases handle the inverted order
     * (letter first, cedilla second).
     */
    TRACE_FIND((stderr, "get_accented_glyph: %lu, %lu",
                (unsigned long)accent, (unsigned long)base_glyph));
    switch(accent) {
    case 0x0060: /* grave */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C0;
        case 0x0045: /* E */ return 0x00C8;
        case 0x0049: /* I */ return 0x00CC;
        case 0x004F: /* O */ return 0x00D2;
        case 0x0055: /* U */ return 0x00D9;
        case 0x0057: /* W */ return 0x1E80;
        case 0x0059: /* Y */ return 0x1EF2;
        case 0x0061: /* a */ return 0x00E0;
        case 0x0065: /* e */ return 0x00E8;
        case 0x0069: /* i */ return 0x00EC;
        case 0x006F: /* o */ return 0x00F2;
        case 0x0075: /* u */ return 0x00F9;
        case 0x0077: /* w */ return 0x1E81;
        case 0x0079: /* y */ return 0x1EF3;
        default: return 0;
        }
    case 0x00B4: /* acute */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C1;
        case 0x0043: /* C */ return 0x0106;
        case 0x0045: /* E */ return 0x00C9;
        case 0x0049: /* I */ return 0x00CD;
        case 0x004C: /* L */ return 0x0139;
        case 0x004E: /* N */ return 0x0143;
        case 0x004F: /* O */ return 0x00D3;
        case 0x0052: /* R */ return 0x0154;
        case 0x0053: /* S */ return 0x015A;
        case 0x0055: /* U */ return 0x00DA;
        case 0x0057: /* W */ return 0x1E82;
        case 0x0059: /* Y */ return 0x00DD;
        case 0x005A: /* Z */ return 0x0179; /* was keyed on 0x0060 (grave) */
        case 0x0061: /* a */ return 0x00E1;
        case 0x0063: /* c */ return 0x0107;
        case 0x0065: /* e */ return 0x00E9;
        case 0x0069: /* i */ return 0x00ED;
        case 0x006C: /* l */ return 0x013A;
        case 0x006E: /* n */ return 0x0144;
        case 0x006F: /* o */ return 0x00F3;
        case 0x0072: /* r */ return 0x0155;
        case 0x0073: /* s */ return 0x015B;
        case 0x0075: /* u */ return 0x00FA;
        case 0x0077: /* w */ return 0x1E83;
        case 0x0079: /* y */ return 0x00FD;
        case 0x007A: /* z */ return 0x017A; /* was keyed on 0x0080 (a control char) */
        default: return 0;
        }
    case 0x02C6: /* circumflex */
    case 0x005E: /* asciicircum */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C2;
        case 0x0045: /* E */ return 0x00CA;
        case 0x0047: /* G */ return 0x011C;
        case 0x0048: /* H */ return 0x0124;
        case 0x0049: /* I */ return 0x00CE;
        case 0x004A: /* J */ return 0x0134; /* was keyed on 0x0050 (P) and returned H-circumflex */
        case 0x004F: /* O */ return 0x00D4;
        case 0x0055: /* U */ return 0x00DB;
        case 0x0061: /* a */ return 0x00E2;
        case 0x0065: /* e */ return 0x00EA;
        case 0x0067: /* g */ return 0x011D;
        case 0x0068: /* h */ return 0x0125;
        case 0x0069: /* i */ return 0x00EE;
        case 0x006A: /* j */ return 0x0135; /* was keyed on 0x0070 (p) */
        case 0x006F: /* o */ return 0x00F4;
        case 0x0075: /* u */ return 0x00FB;
        default: return 0;
        }
    case 0x02DC: /* tilde */
    case 0x007E: /* asciitilde */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C3;
        case 0x0049: /* I */ return 0x0128;
        case 0x004E: /* N */ return 0x00D1;
        case 0x004F: /* O */ return 0x00D5;
        case 0x0055: /* U */ return 0x0168;
        case 0x0061: /* a */ return 0x00E3;
        case 0x0069: /* i */ return 0x0129;
        case 0x006E: /* n */ return 0x00F1;
        case 0x006F: /* o */ return 0x00F5;
        case 0x0075: /* u */ return 0x0169;
        default: return 0;
        }
    case 0x00A8: /* dieresis */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C4;
        case 0x0045: /* E */ return 0x00CB;
        case 0x0049: /* I */ return 0x00CF;
        case 0x004F: /* O */ return 0x00D6;
        case 0x0055: /* U */ return 0x00DC;
        case 0x0057: /* W */ return 0x1E84;
        case 0x0061: /* a */ return 0x00E4;
        case 0x0065: /* e */ return 0x00EB;
        case 0x0069: /* i */ return 0x00EF;
        case 0x006F: /* o */ return 0x00F6;
        case 0x0075: /* u */ return 0x00FC;
        case 0x0077: /* w */ return 0x1E85;
        case 0x0079: /* y */ return 0x00FF;
        default: return 0;
        }
    case 0x02DA: /* ring */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C5;
        case 0x0061: /* a */ return 0x00E5;
        case 0x0055: /* U */ return 0x016E;
        case 0x0075: /* u */ return 0x016F;
        default: return 0;
        }
    case 0x00B8: /* cedilla */
        switch (base_glyph) {
        case 0x0043: /* C */ return 0x00C7;
        case 0x0063: /* c */ return 0x00E7;
        case 0x0053: /* S */ return 0x015E;
        case 0x0073: /* s */ return 0x015F;
        default: return 0;
        }
    case 0x02DB: /* ogonek */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x0104;
        case 0x0045: /* E */ return 0x0118;
        case 0x0049: /* I */ return 0x012E;
        case 0x0055: /* U */ return 0x0172;
        case 0x0061: /* a */ return 0x0105;
        case 0x0065: /* e */ return 0x0119;
        case 0x0069: /* i */ return 0x012F;
        /* NOTE(review): this returns the bare ogonek rather than an
           o-with-ogonek code point; left unchanged - confirm intent. */
        case 0x006F: /* o */ return 0x02DB;
        case 0x0075: /* u */ return 0x0173;
        default: return 0;
        }
    case 0x002F: /* solidus */
        switch (base_glyph) {
        case 0x004C: /* L */ return 0x0141;
        case 0x004F: /* O */ return 0x00D8;
        case 0x006C: /* l */ return 0x0142;
        case 0x006F: /* o */ return 0x00F8;
        default: return 0;
        }
    case 0x02C7: /* caron */
        switch (base_glyph) {
        case 0x0043: /* C */ return 0x010C;
        case 0x0044: /* D */ return 0x010E;
        case 0x0045: /* E */ return 0x011A;
        case 0x0047: /* G */ return 0x01E6;
        case 0x004C: /* L */ return 0x013D;
        case 0x004E: /* N */ return 0x0147;
        case 0x0052: /* R */ return 0x0158;
        case 0x0053: /* S */ return 0x0160;
        case 0x0054: /* T */ return 0x0164;
        case 0x005A: /* Z */ return 0x017D;
        case 0x0063: /* c */ return 0x010D;
        case 0x0064: /* d */ return 0x010F;
        case 0x0065: /* e */ return 0x011B;
        case 0x0067: /* g */ return 0x01E7;
        case 0x006C: /* l */ return 0x013E;
        case 0x006E: /* n */ return 0x0148;
        case 0x0072: /* r */ return 0x0159;
        case 0x0073: /* s */ return 0x0161;
        case 0x0074: /* t */ return 0x0165;
        case 0x007A: /* z */ return 0x017E;
        default: return 0;
        }
    case 0x02D8: /* breve */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x0102;
        case 0x0045: /* E */ return 0x0114;
        case 0x0047: /* G */ return 0x011E;
        case 0x0049: /* I */ return 0x012C;
        case 0x004F: /* O */ return 0x014E;
        case 0x0055: /* U */ return 0x016C;
        case 0x0061: /* a */ return 0x0103;
        case 0x0065: /* e */ return 0x0115;
        case 0x0067: /* g */ return 0x011F;
        case 0x0069: /* i */ return 0x012D;
        case 0x006F: /* o */ return 0x014F;
        case 0x0075: /* u */ return 0x016D;
        default: return 0;
        }
    case 0x02DD: /* hungarumlaut */
        switch (base_glyph) {
        case 0x004F: /* O */ return 0x0150;
        case 0x0055: /* U */ return 0x0170;
        case 0x006F: /* o */ return 0x0151;
        case 0x0075: /* u */ return 0x0171;
        default: return 0;
        }
    case 0x00AF: /* macron */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x0100;
        case 0x0045: /* E */ return 0x0112;
        case 0x0049: /* I */ return 0x012A;
        case 0x004F: /* O */ return 0x014C;
        case 0x0055: /* U */ return 0x016A;
        case 0x0061: /* a */ return 0x0101;
        case 0x0065: /* e */ return 0x0113;
        case 0x0069: /* i */ return 0x012B;
        /* NOTE(review): returns the bare macron for `m'; left unchanged -
           confirm intent. */
        case 0x006D: /* m */ return 0x00AF;
        case 0x006F: /* o */ return 0x014D;
        case 0x0075: /* u */ return 0x016B;
        default: return 0;
        }
        /* special cases: accent - char inverted */
    case 0x0043: /* C; special case: cedilla is set after C in OT1 */
        switch (base_glyph) {
        case 0x00B8: /* cedilla */ return 0x00C7;
        default: return 0;
        }
    case 0x0063: /* c; see above */
        switch (base_glyph) {
        case 0x00B8: /* cedilla */ return 0x00E7;
        default: return 0;
        }
    case 0x0053: /* S; see above */
        switch (base_glyph) {
        case 0x00B8: /* cedilla */ return 0x015E;
        default: return 0;
        }
    case 0x0073: /* s; see above */
        switch (base_glyph) {
        case 0x00B8: /* cedilla */ return 0x015F;
        default: return 0;
        }
    }
    /* unknown accent */
    return 0;
}

Here is the caller graph for this function:

uint32_t guess_encoding ( wide_ubyte  ch,
const char *  fontname,
char *  retbuf 
)

Definition at line 3463 of file encodings.c.

{
    /*
     * Guess the Unicode value of character `ch' in the font `fontname' from
     * the font-name prefix alone.
     *
     * Returns the code point taken from the matching encoding vector, or 0 if
     * `ch' is out of range or the font has no usable text representation.
     * For the ysmfrak/yswab ligatures (0x85, 0x86, 0xA7) the expansion is
     * copied into `retbuf' and 0 is returned; `retbuf' is not touched in any
     * other case.  Fonts matching no prefix fall back to the Cork vector, with
     * one warning per font name.
     *
     * All prefix tests use strncmp() so that a font name shorter than the
     * prefix is never read past its terminator.
     */
    uint32_t i;
    static hashTableT unknown_font_hash;
    static Boolean hash_initialized = False;
    size_t dummy = 0;

    TRACE_FIND_VERBOSE((stderr, "guess_encoding: |%s|, char 0x%.4X", fontname, ch));

    /* our encoding vectors only have size 256 */
    if (ch > 255) {
        XDVI_WARNING((stderr, "guess_encoding: font index %lu too large", (unsigned long)ch));
        return 0;
    }

    if (strncmp(fontname, "gbk", 3) == 0) {
        /* the prefix matched, so the name has at least 3 characters and
           indexing the last two is safe */
        i = strlen(fontname);
        /* <ctype.h> functions require an unsigned char value */
        if (isdigit((unsigned char)fontname[i - 1])
            && isdigit((unsigned char)fontname[i - 2])) {
#if HAVE_ICONV_H
            unsigned char cjk[2];
            TRACE_FIND_VERBOSE((stderr, "guess_encoding: CJK fonts (GBK encoding)"));
            i = atoi(fontname + i - 2);        /* font no */
            i = (i - 1) * 256 + (uint32_t)ch;  /* char index */
            cjk[0] = i / 190 + 129;
            cjk[1] = i % 190 + 64;
            if (cjk[1] >= 128)
                cjk[1]++;
            return cjk2unicode(cjk);
#else /* HAVE_ICONV_H */
            if (!warned_about_cjk) {
                popup_message(globals.widgets.top_level,
                              MSG_WARN, NULL, "This version of xdvi has been compiled without iconv support - "
                              "cannot convert CJK character to UTF-8");
                warned_about_cjk = True;
            }
            return 0;
#endif /* HAVE_ICONV_H */
        }
    }

    if (strncmp(fontname, "cmsy", 4) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_cm_symbol", fontname));
        return m_cm_symbol_encoding[ch];
    }
    if (strncmp(fontname, "cmmi", 4) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_cm_math_italics", fontname));
        return m_cm_math_italics_encoding[ch];
    }
    if (strncmp(fontname, "cmex", 4) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_cm_math_extended", fontname));
        return m_cm_math_extended_encoding[ch];
    }
    if (strncmp(fontname, "cmtt", 4) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_cm_typewriter", fontname));
        return m_cm_typewriter_encoding[ch];
    }
    /* following to cover cmsl, cmb, cmbx, cmti, cmdunghill, whatever ...
       hope it doesn't overgenerate ;-) */
    if (strncmp(fontname, "cm", 2) == 0
        || strncmp(fontname, "lcmss", strlen("lcmss")) == 0 /* lcmss8 etc. */
        || strncmp(fontname, "ygoth", strlen("ygoth")) == 0
        || strncmp(fontname, "yinit", strlen("yinit")) == 0
        || strncmp(fontname, "logo", strlen("logo")) == 0
        || strncmp(fontname, "rsfs", strlen("rsfs")) == 0
        ) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_ot1", fontname));
        return m_ot1_encoding[ch];
    }
    /* cyrillic fonts */
    if (strncmp(fontname, "la", 2) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_t2", fontname));
        return m_t2_encoding[ch];
    }
    if (strncmp(fontname, "ec", 2) == 0
        || strncmp(fontname, "eb", 2) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_cork", fontname));
        /* FIXME: why cork and not EC? What font actually uses EC?
           The only difference seems that dvips' EC.enc has `ldot' at 0xb8,
           whereas cork.enc has `ydieresis' there. A document with
           \usepackage[T1]{fontenc}
           also produces a ydieresis.
         */
        return m_cork_encoding[ch];
    }
    if (strncmp(fontname, "tc", 2) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_ts1", fontname));
        return m_ts1_encoding[ch];
    }
    /* blackletter fonts with funny encoding */
    if (strncmp(fontname, "ysmfrak", 7) == 0
        || strncmp(fontname, "yswab", 5) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_yfrak", fontname));
        /* special cases for ligatures */
        switch (ch) {
        case 0x85: strcpy(retbuf, "ch"); return 0;
        case 0x86: strcpy(retbuf, "ck"); return 0;
        case 0xA7: strcpy(retbuf, "sz"); return 0;
        default: return m_yfrak_encoding[ch];
        }
    }

    /* euler mathematical */
    if (strncmp(fontname, "eufm", strlen("eufm")) == 0
        || strncmp(fontname, "eusm", strlen("eusm")) == 0
        ) {
        switch (ch) {
        case 0x0: case 0x1: return 'd';
        case 0x2: case 0x3: return 'f';
        case 0x4: return 'g';
        case 0x5: return 'k';
        case 0x6: return 't';
        case 0x7: return 'u';
        default: return m_ot1_encoding[ch];
        }
    }

    /* stuff that doesn't have a good ASCII representation */
    if (strncmp(fontname, "lcircle", strlen("lcircle")) == 0
        || strncmp(fontname, "line", strlen("line")) == 0
        || strncmp(fontname, "fmvr8x", strlen("fmvr8x")) == 0
        || strncmp(fontname, "feymr", strlen("feymr")) == 0
        || strncmp(fontname, "msbm", strlen("msbm")) == 0
        || strncmp(fontname, "msam", strlen("msam")) == 0
        || strncmp(fontname, "wasy", strlen("wasy")) == 0
        || strncmp(fontname, "txsy", strlen("txsy")) == 0
        ) {
        return 0;
    }

    /* TODO:
       txfonts
    */

    /* default: assume cork encoding, and print out a warning for each font */
    if (!hash_initialized) {
        unknown_font_hash = hash_create(1031);
        hash_initialized = True;
    }
    /* the hash only records which font names have already been warned about */
    if (!find_str_int_hash(&unknown_font_hash, fontname, &dummy)) {
        XDVI_WARNING((stderr,
                      "guess_encoding(): nothing suitable for \"%s\", assuming Cork encoding.\n"
                      "(Please tell us about this at "
                      "http://sourceforge.net/tracker/?group_id=23164&atid=377580)", fontname));
        put_str_int_hash(&unknown_font_hash, fontname, dummy);
    }
    return m_cork_encoding[ch];
}

Here is the call graph for this function:

Here is the caller graph for this function:

Boolean is_hyphenchar ( uint32_t  unicode)

Definition at line 3929 of file encodings.c.

{
    /* True exactly for U+002D and U+00AD; False for everything else. */
    switch (u) {
    case 0x002D:
    case 0x00AD:
        return True;
    default:
        return False;
    }
}

Here is the caller graph for this function:

Boolean is_ideograph ( uint32_t  unicode)

Definition at line 3956 of file encodings.c.

{
    /*
     * Non-zero if `u' lies in one of the inclusive code-point ranges below,
     * zero otherwise.
     */
    static const struct {
        uint32_t first;
        uint32_t last;
    } ranges[] = {
        { 0x3000,  0x3002  }, /* IDEOGRAPHIC SPACE, COMMA, FULL STOP */
        { 0xFF61,  0xFF61  }, /* HALFWIDTH IDEOGRAPHIC FULL STOP */
        { 0xFF64,  0xFF64  }, /* HALFWIDTH IDEOGRAPHIC COMMA */
        { 0x3006,  0x3007  },
        { 0x3021,  0x3029  },
        { 0x3038,  0x303A  },
        { 0x3400,  0x4DB5  },
        { 0x4E00,  0x9FA5  },
        { 0xF900,  0xFA2D  },
        { 0x20000, 0x2A6D6 },
        { 0x2F800, 0x2FA1D }
    };
    size_t k;

    for (k = 0; k < sizeof ranges / sizeof ranges[0]; k++) {
        if (u >= ranges[k].first && u <= ranges[k].last)
            return 1;
    }
    return 0;
}

Here is the caller graph for this function:

void iso_8859_1_to_utf8 ( unsigned char  iso_8859_1,
char *  utf8,
size_t *  len 
)

Definition at line 3885 of file encodings.c.

{
    /*
     * Encode the single byte `c' as UTF-8 into `utf8' (no terminator is
     * written) and store the number of bytes produced (1 or 2) in `*len'.
     */
    if (c >= 0x80) {
        /* two-byte form: 110xxxxx 10xxxxxx */
        *len = 2;
        utf8[0] = 0xc0 | ((c >> 6) & 0x1f);
        utf8[1] = 0x80 | (c & 0x3f);
        return;
    }

    /* 7-bit values are copied through unchanged */
    *len = 1;
    utf8[0] = c;
}

Here is the caller graph for this function:

const char* search_normalize_chars ( uint32_t  unicode)

Definition at line 3616 of file encodings.c.

{
    /*
     * Return the replacement string used for `unicode', or NULL if the
     * character has no replacement here.  The result points to a string
     * literal.
     */
    const char *replacement;

    switch (unicode) {
    case 0x2212:
        replacement = "-";
        break;
    case 0x2022:
        replacement = "\xb7"; /* middle dot */
        break;
    default:
        /* nothing to replace, nothing to trace */
        return NULL;
    }

    TRACE_FIND((stderr, "expand_searchchars: 0x%X --> `%s'",
                (unsigned int)unicode, replacement));
    return replacement;
}

Here is the caller graph for this function:

int str_iso_8859_1_to_utf8 ( const char *  latin1,
char *  utf8,
size_t  len 
)

Definition at line 3905 of file encodings.c.

{
    /*
     * Convert the NUL-terminated string `latin1' to UTF-8 in `utf8', a buffer
     * of `len' bytes.  Returns the number of bytes written including the
     * terminating NUL, or -1 if the buffer is too small.
     */
    size_t written = 0;
    const char *src;

    for (src = latin1; written < len && *src != '\0'; src++) {
        char encoded[2];
        size_t encoded_len = 0;

        iso_8859_1_to_utf8((unsigned char)*src, encoded, &encoded_len);
        /* `>=' keeps one byte in reserve for the terminator */
        if (written + encoded_len >= len)
            return -1;
        memcpy(utf8 + written, encoded, encoded_len);
        written += encoded_len;
    }

    /* terminate utf8 */
    if (written >= len)
        return -1;
    utf8[written++] = '\0';

    return written;
}

Here is the call graph for this function:

Here is the caller graph for this function:

char* str_utf8_to_iso_8859_1 ( const char *  utf8)

Definition at line 3972 of file encodings.c.

{
    /*
     * Convert the UTF-8 string `utf8' into a newly xmalloc()ed string.
     *
     *  - characters with a search_normalize_chars() replacement are replaced;
     *  - other code points <= 0xff are stored as a single byte;
     *  - everything else is written as a backslash followed by at least four
     *    upper-case hex digits;
     *  - a byte that does not start a valid UTF-8 sequence is replaced by `?'.
     *
     * Nothing in this function frees the result, so presumably the caller
     * owns it.
     */
    size_t utf8_len = strlen(utf8), i = 0, offset = 0;
    char *buf = xmalloc(4 * utf8_len + 1); /* worst case of non-printables */

    while (i < utf8_len) {
        uint32_t ucs4 = 0;
        const char *ret;
        /* keep the result signed so the -1 error return can be detected, and
           pass the number of bytes that are really left */
        int len = utf8_to_ucs4(utf8 + i, &ucs4, utf8_len - i);

        if (len < 0) {
            /* malformed input: emit a placeholder and resynchronise on the
               next byte instead of adding -1 to the index */
            buf[offset++] = '?';
            i++;
            continue;
        }

        /* first apply normalization heuristics also used by search */
        if ((ret = search_normalize_chars(ucs4)) != NULL) {
            size_t len_ret = strlen(ret);
            memcpy(buf + offset, ret, len_ret);
            offset += len_ret;
        }
        else if (ucs4 <= 0xff) { /* in iso-latin1 range */
            buf[offset++] = (unsigned char)ucs4;
        }
        else {
            /* sprintf() writes the backslash plus >= 4 hex digits; advance by
               what was really written so the last digit is not overwritten */
            offset += (size_t)sprintf(buf + offset, "\\%.4lX", (unsigned long)ucs4);
        }
        i += (size_t)len;
    }
    buf[offset] = '\0';

    return buf;
}

Here is the call graph for this function:

Here is the caller graph for this function:

void ucs4_to_utf8 ( uint32_t  ucs4,
char *  utf8,
size_t *  len,
Boolean  do_lowercase 
)

Definition at line 3807 of file encodings.c.

{
    /*
     * Encode `ucs4' as UTF-8 into `utf8' (no terminator is written) and store
     * the number of bytes produced (1..6) in `*len'.  If `do_lowercase' is
     * set, the value is first passed through ucs4_lowercase().
     *
     * Values above 0x7fffffff cannot be encoded: `*len' is set to 0 and
     * nothing is written.
     */
    if (do_lowercase)
        ucs4 = ucs4_lowercase(ucs4);

    if (ucs4 < 0x80)
        *len = 1;
    else if (ucs4 < 0x800)
        *len = 2;
    else if (ucs4 < 0x10000)
        *len = 3;
    else if (ucs4 < 0x200000)
        *len = 4;
    else if (ucs4 < 0x4000000)
        *len = 5;
    else if (ucs4 <= 0x7fffffff)
        *len = 6;
    else {
        /* previously *len stayed unset here and the switch below read it */
        *len = 0;
        return;
    }

    /* Emit the continuation bytes from last to first; each step ORs in the
       marker bits that end up in the lead byte after the remaining shifts. */
    switch(*len) { /* note: code falls through cases! */
    case 6: utf8[5] = 0x80 | (ucs4 & 0x3f); ucs4 = ucs4 >> 6; ucs4 |= 0x4000000;
        /* fall through */
    case 5: utf8[4] = 0x80 | (ucs4 & 0x3f); ucs4 = ucs4 >> 6; ucs4 |= 0x200000;
        /* fall through */
    case 4: utf8[3] = 0x80 | (ucs4 & 0x3f); ucs4 = ucs4 >> 6; ucs4 |= 0x10000;
        /* fall through */
    case 3: utf8[2] = 0x80 | (ucs4 & 0x3f); ucs4 = ucs4 >> 6; ucs4 |= 0x800;
        /* fall through */
    case 2: utf8[1] = 0x80 | (ucs4 & 0x3f); ucs4 = ucs4 >> 6; ucs4 |= 0xc0;
        /* fall through */
    case 1: utf8[0] = ucs4;
    }
}

Here is the call graph for this function:

Here is the caller graph for this function:

const char* unicode2adobe_name ( uint32_t  unicode)
Boolean utf8_lowercase ( char *  utf8)

Definition at line 3679 of file encodings.c.

{
    /*
     * Lowercase the UTF-8 string `utf8' in place, one character at a time.
     *
     * Returns True on success.  Returns False (after reporting via
     * XDVI_ERROR) if the input contains an illegal UTF-8 sequence or if a
     * lower-cased character would need a different number of bytes than the
     * original; characters before the offending one have already been
     * converted at that point.
     */
    while (*utf8 != '\0') {
        uint32_t ucs4;
        char lowered[6]; /* scratch buffer: ucs4_to_utf8() emits at most 6 bytes */
        size_t conv_len = 0;
        int i = utf8_to_ucs4(utf8, &ucs4, 6 /* don't care about character len */);

        if (i < 0) {
            XDVI_ERROR((stderr, "Error in utf8_lowercase: Illegal UTF-8 sequence"));
            return False;
        }

        /* Convert into the scratch buffer first: writing straight into the
           string would overwrite the following bytes whenever the lower-case
           form is longer than the original. */
        ucs4_to_utf8(ucs4, lowered, &conv_len, True); /* lowercases it */
        if ((int)conv_len != i) {
            XDVI_ERROR((stderr, "Error in utf8_lowercase: length after UCS4 conversion (%lu)\n"
                        "differs from length after utf8 conversion(%lu) (string: %s)\n",
                        (unsigned long)conv_len, (unsigned long)i, utf8));
            return False;
        }
        memcpy(utf8, lowered, conv_len);
        utf8 += i;
    }
    return True;
}

Here is the call graph for this function:

Here is the caller graph for this function:

unsigned char utf8_to_iso_8859_1 ( const char *  utf8,
size_t *  len 
)

Definition at line 3843 of file encodings.c.

{
    /*
     * Convert the UTF-8 sequence starting at `utf8' to one ISO-8859-1 byte.
     *
     * `*len' receives the sequence length implied by the lead byte, so the
     * caller can advance past it.  Returns the Latin-1 byte, or `?' when the
     * character is outside the Latin-1 range.  For the bytes 0xFE/0xFF, which
     * never start a sequence, `*len' is 1 so the caller always makes progress.
     */
    unsigned char c = *utf8;
    uint32_t wc;

    if (c < 0x80) {
        *len = 1;
        return c;
    }
    else if (c < 0xe0) {
        /* two-byte sequence; the second byte is not validated here */
        *len = 2;
        wc = ((unsigned char)(c & 0x1f) << 6) | (unsigned char)(utf8[1] ^ 0x80);
        if (wc <= 0xff)
            return (unsigned char)wc;
        else
            return '?';
    }
    else if (c < 0xf0) {
        *len = 3;
        return '?';
    }
    else if (c < 0xf8) {
        *len = 4;
        return '?';
    }
    else if (c < 0xfc) {
        *len = 5;
        return '?';
    }
    else if (c < 0xfe) {
        *len = 6;
        return '?';
    }
    else {
        /* previously *len was left unset on this path */
        *len = 1;
        return '?';
    }
}
int utf8_to_ucs4 ( const char *  utf8,
uint32_t *  ucs4,
size_t  len 
)

Definition at line 3713 of file encodings.c.

{
    /*
     * Decode one UTF-8 sequence (1 to 6 bytes) starting at `utf8'.
     *
     * `len' is the number of bytes the caller allows to be consumed.  On
     * success the code point is stored in `*ucs4' and the sequence length is
     * returned.  On failure (illegal lead byte, bad continuation byte,
     * overlong form, or `len' too short) -1 is returned and `*ucs4' is left
     * untouched.
     */
    const unsigned char *bytes = (const unsigned char *)utf8;
    const unsigned char lead = bytes[0];
    size_t seq_len;           /* total bytes in the sequence */
    size_t k;
    unsigned char first_lead; /* smallest lead byte of this length class */
    unsigned char min_second; /* smallest 2nd byte allowed after first_lead */
    uint32_t value;

    if (lead < 0x80) {
        *ucs4 = lead;
        return 1;
    }
    if (lead < 0xc2)
        return -1; /* illegal UTF8; shouldn't happen */

    if (lead < 0xe0) {
        /* leads below 0xc2 were rejected above, so first_lead never matches */
        seq_len = 2; first_lead = 0xc0; min_second = 0x80;
    }
    else if (lead < 0xf0) {
        seq_len = 3; first_lead = 0xe0; min_second = 0xa0;
    }
    else if (lead < 0xf8) {
        seq_len = 4; first_lead = 0xf0; min_second = 0x90;
    }
    else if (lead < 0xfc) {
        seq_len = 5; first_lead = 0xf8; min_second = 0x88;
    }
    else if (lead < 0xfe) {
        seq_len = 6; first_lead = 0xfc; min_second = 0x84;
    }
    else {
        return -1; /* illegal UTF8 */
    }

    if (len < seq_len)
        return -1; /* len too short */

    /* payload bits of the lead byte: 0x1f, 0x0f, 0x07, 0x03, 0x01 */
    value = lead & (0xff >> (seq_len + 1));

    /* continuation bytes must look like 10xxxxxx; stop at the first bad one
       so that no byte after it is read */
    for (k = 1; k < seq_len; k++) {
        unsigned char payload = bytes[k] ^ 0x80;
        if (payload >= 0x40)
            return -1; /* illegal UTF8; shouldn't happen */
        value = (value << 6) | payload;
    }

    /* reject overlong encodings */
    if (lead == first_lead && bytes[1] < min_second)
        return -1; /* illegal UTF8; shouldn't happen */

    *ucs4 = value;
    return (int)seq_len;
}

Here is the caller graph for this function: