Back to index

tetex-bin  3.0
Defines | Functions | Variables
encodings.c File Reference
#include "xdvi-config.h"
#include "xdvi.h"
#include <ctype.h>
#include "util.h"
#include "encodings.h"
#include "my-snprintf.h"
#include "message-window.h"

Go to the source code of this file.

Defines

#define MY_DEBUG   0
#define TRACE_FIND_VERBOSE(x)   /* as nothing */

Functions

static int adobe_name_cmp (const void *s1, const void *s2)
void close_iconv (void)
uint32_t adobe2unicode_name (const char *adobe_name)
static uint32_t ucs4_lowercase (uint32_t c)
uint32_t get_accented_glyph (uint32_t accent, uint32_t base_glyph)
uint32_t guess_encoding (wide_ubyte ch, const char *fontname, char *retbuf)
const char * search_normalize_chars (uint32_t unicode)
const char * expand_ligature (uint32_t unicode)
Boolean utf8_lowercase (char *utf8)
int utf8_to_ucs4 (const char *utf8, uint32_t *ucs4, size_t len)
void ucs4_to_utf8 (uint32_t ucs4, char *utf8, size_t *len, Boolean do_lowercase)
unsigned char utf8_to_iso_8859_1 (const char *utf8, size_t *len)
void iso_8859_1_to_utf8 (unsigned char c, char *utf8, size_t *len)
int str_iso_8859_1_to_utf8 (const char *latin1, char *utf8, size_t len)
Boolean is_hyphenchar (uint32_t u)
Boolean is_ideograph (uint32_t u)
char * str_utf8_to_iso_8859_1 (const char *utf8)

Variables

static Boolean warned_about_cjk = False
static uint32_t m_cm_symbol_encoding [256]
static uint32_t m_cm_math_italics_encoding [256]
static uint32_t m_cm_math_extended_encoding [256]
static uint32_t m_cm_typewriter_encoding [256]
static uint32_t m_ot1_encoding [256]
static uint32_t m_t2_encoding [256]
static uint32_t m_cork_encoding [256]
static uint32_t m_ts1_encoding [256]
static uint32_t m_yfrak_encoding [176]
static struct adobe2unicode []

Define Documentation

#define MY_DEBUG   0

Definition at line 83 of file encodings.c.

#define TRACE_FIND_VERBOSE (   x)    /* as nothing */

Definition at line 88 of file encodings.c.


Function Documentation

uint32_t adobe2unicode_name ( const char *  adobe_name)

Definition at line 2965 of file encodings.c.

{
    /*
     * Map an Adobe glyph name to a Unicode code point.
     *
     * Names starting with "cjk" are expected to have the form 'cjkXXXX',
     * where XXXX is the hex value of the GBK/GB18030 encoding; they are
     * converted via cjk2unicode() if iconv support is compiled in.
     * All other names are looked up in adobe2unicode_table with bsearch(),
     * using adobe_name_cmp() as comparator.
     *
     * Returns 0 if no mapping could be found (or the name is malformed).
     */
    struct adobe2unicode search_item;
    struct adobe2unicode *match_item;

    /* strncmp() instead of memcmp(): stops at the terminating NUL of
       names shorter than 3 characters instead of reading past it */
    if (strncmp(adobe_name, "cjk", 3) == 0) {
#if HAVE_ICONV_H
        /* Special case for CJK fonts (Chinese) - ZLB: the Adobe names in
         * the Chinese T1 fonts are of the form 'cjkXXXX' where 'XXXX' are
         * the hex number of the GBK/GB18030 encoding */
        unsigned char cjk[2], xx[3];

        /* Make sure all four hex digits are really there; the checks
           short-circuit at the terminating NUL, so a truncated name
           such as "cjk" or "cjk8" is never read out of bounds. */
        if (!isxdigit((unsigned char)adobe_name[3])
            || !isxdigit((unsigned char)adobe_name[4])
            || !isxdigit((unsigned char)adobe_name[5])
            || !isxdigit((unsigned char)adobe_name[6])) {
            return 0;
        }

        xx[0] = adobe_name[3];
        xx[1] = adobe_name[4];
        xx[2] = '\0';
        cjk[0] = strtoul((char *)xx, NULL, 16);
        xx[0] = adobe_name[5];
        xx[1] = adobe_name[6];
        cjk[1] = strtoul((char *)xx, NULL, 16);
        /* convert GBK ==> unicode */
        return cjk2unicode(cjk);
#else /* HAVE_ICONV_H */
        /* warn only once per session */
        if (!warned_about_cjk) {
            popup_message(globals.widgets.top_level,
                          MSG_WARN, NULL, "This version of xdvi has been compiled without iconv support - "
                          "cannot convert CJK character to UTF-8");
            warned_about_cjk = True;
        }
        return 0;
#endif /* HAVE_ICONV_H */
    }
    else {
        /* only the name field of the search key is compared */
        search_item.adobe_name = adobe_name;

        match_item = bsearch(&search_item, adobe2unicode_table,
                             sizeof adobe2unicode_table / sizeof adobe2unicode_table[0],
                             sizeof adobe2unicode_table[0],
                             adobe_name_cmp);
        if (match_item != NULL)
            return match_item->unicode;
        else
            return 0;
    }
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int adobe_name_cmp ( const void *  s1,
const void *  s2 
) [static]

Definition at line 2892 of file encodings.c.

{
    /* Comparator for bsearch(): both arguments point to
       struct adobe2unicode items, which are ordered by their
       adobe_name field using strcmp(). */
    const char *lhs_name = ((const struct adobe2unicode *)s1)->adobe_name;
    const char *rhs_name = ((const struct adobe2unicode *)s2)->adobe_name;

    return strcmp(lhs_name, rhs_name);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void close_iconv ( void )

Definition at line 2912 of file encodings.c.

{
#if HAVE_ICONV_H
    /* Nothing to do while the descriptor holds the invalid value (iconv_t)(-1). */
    if (m_iconv_gb_ucs4 == (iconv_t)(-1))
        return;

    iconv_close(m_iconv_gb_ucs4);
    /* reset to the invalid value so that a repeated call is a no-op */
    m_iconv_gb_ucs4 = (iconv_t)(-1);
#endif /* HAVE_ICONV_H */
}

Here is the call graph for this function:

Here is the caller graph for this function:

const char* expand_ligature ( uint32_t  unicode)

Definition at line 3640 of file encodings.c.

{
    /*
     * Return the plain-text expansion for a ligature (or similar
     * typographic glyph) given by its code point, or NULL if there
     * is no expansion for `unicode'.
     */
    static const struct {
        uint32_t code;
        const char *expansion;
    } ligatures[] = {
        { 0xFB00, "ff" },
        { 0xFB01, "fi" },
        { 0xFB02, "fl" },
        { 0xFB03, "ffi" },
        { 0xFB04, "ffl" },
        { 0xFB06, "st" },
        { 0x0133, "ij" },
        { 0x2013, "--" },
        { 0x2014, "---" },
        { 0x2039, "<" },
        { 0x2329, "<" },
        { 0x203A, ">" },
        { 0x232A, ">" },
        { 0x2018, "`" },
        { 0x2019, "'" },
        { 0x201C, "``" },
        { 0x201D, "''" },
        { 0x2026, "..." },
        { 0x10ff28, "{" },
        { 0x10ff29, "}" }
    };
    size_t k;

    for (k = 0; k < sizeof ligatures / sizeof ligatures[0]; k++) {
        if (ligatures[k].code != unicode)
            continue;
        TRACE_FIND((stderr, "expand_ligature: 0x%X --> `%s'",
                    (unsigned int)unicode, ligatures[k].expansion));
        return ligatures[k].expansion;
    }
    return NULL;
}

Here is the caller graph for this function:

uint32_t get_accented_glyph ( uint32_t  accent,
uint32_t  base_glyph 
)

Definition at line 3231 of file encodings.c.

{
    /*
     * Combine an accent and a base glyph (both given as Unicode code
     * points) into the corresponding precomposed character.
     *
     * Returns the code point of the precomposed character, or 0 if this
     * table has no entry for the accent/base combination.
     *
     * For C/c/S/s with cedilla the two arguments may also arrive in
     * inverted order (see the special cases at the end).
     */
    TRACE_FIND((stderr, "get_accented_glyph: %lu, %lu",
                (unsigned long)accent, (unsigned long)base_glyph));
    switch(accent) {
    case 0x0060: /* grave */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C0;
        case 0x0045: /* E */ return 0x00C8;
        case 0x0049: /* I */ return 0x00CC;
        case 0x004F: /* O */ return 0x00D2;
        case 0x0055: /* U */ return 0x00D9;
        case 0x0057: /* W */ return 0x1E80;
        case 0x0059: /* Y */ return 0x1EF2;
        case 0x0061: /* a */ return 0x00E0;
        case 0x0065: /* e */ return 0x00E8;
        case 0x0069: /* i */ return 0x00EC;
        case 0x006F: /* o */ return 0x00F2;
        case 0x0075: /* u */ return 0x00F9;
        case 0x0077: /* w */ return 0x1E81;
        case 0x0079: /* y */ return 0x1EF3;
        default: return 0;
        }
    case 0x00B4: /* acute */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C1;
        case 0x0043: /* C */ return 0x0106;
        case 0x0045: /* E */ return 0x00C9;
        case 0x0049: /* I */ return 0x00CD;
        case 0x004C: /* L */ return 0x0139;
        case 0x004E: /* N */ return 0x0143;
        case 0x004F: /* O */ return 0x00D3;
        case 0x0052: /* R */ return 0x0154;
        case 0x0053: /* S */ return 0x015A;
        case 0x0055: /* U */ return 0x00DA;
        case 0x0057: /* W */ return 0x1E82;
        case 0x0059: /* Y */ return 0x00DD;
        case 0x005A: /* Z */ return 0x0179; /* was 0x0060, which is the grave accent, not Z */
        case 0x0061: /* a */ return 0x00E1;
        case 0x0063: /* c */ return 0x0107;
        case 0x0065: /* e */ return 0x00E9;
        case 0x0069: /* i */ return 0x00ED;
        case 0x006C: /* l */ return 0x013A;
        case 0x006E: /* n */ return 0x0144;
        case 0x006F: /* o */ return 0x00F3;
        case 0x0072: /* r */ return 0x0155;
        case 0x0073: /* s */ return 0x015B;
        case 0x0075: /* u */ return 0x00FA;
        case 0x0077: /* w */ return 0x1E83;
        case 0x0079: /* y */ return 0x00FD;
        case 0x007A: /* z */ return 0x017A; /* was 0x0080, which is not z */
        default: return 0;
        }
    case 0x02C6: /* circumflex */
    case 0x005E: /* asciicircum */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C2;
        case 0x0045: /* E */ return 0x00CA;
        case 0x0047: /* G */ return 0x011C;
        case 0x0048: /* H */ return 0x0124;
        case 0x0049: /* I */ return 0x00CE;
        case 0x004A: /* J */ return 0x0134; /* was a duplicate of the H entry on 0x0050 (P) */
        case 0x004F: /* O */ return 0x00D4;
        case 0x0055: /* U */ return 0x00DB;
        case 0x0061: /* a */ return 0x00E2;
        case 0x0065: /* e */ return 0x00EA;
        case 0x0067: /* g */ return 0x011D;
        case 0x0068: /* h */ return 0x0125;
        case 0x0069: /* i */ return 0x00EE;
        case 0x006A: /* j */ return 0x0135; /* was 0x0070, which is p */
        case 0x006F: /* o */ return 0x00F4;
        case 0x0075: /* u */ return 0x00FB;
        default: return 0;
        }
    case 0x02DC: /* tilde */
    case 0x007E: /* asciitilde */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C3;
        case 0x0049: /* I */ return 0x0128;
        case 0x004E: /* N */ return 0x00D1;
        case 0x004F: /* O */ return 0x00D5;
        case 0x0055: /* U */ return 0x0168;
        case 0x0061: /* a */ return 0x00E3;
        case 0x0069: /* i */ return 0x0129;
        case 0x006E: /* n */ return 0x00F1;
        case 0x006F: /* o */ return 0x00F5;
        case 0x0075: /* u */ return 0x0169;
        default: return 0;
        }
    case 0x00A8: /* dieresis */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C4;
        case 0x0045: /* E */ return 0x00CB;
        case 0x0049: /* I */ return 0x00CF;
        case 0x004F: /* O */ return 0x00D6;
        case 0x0055: /* U */ return 0x00DC;
        case 0x0057: /* W */ return 0x1E84;
        case 0x0061: /* a */ return 0x00E4;
        case 0x0065: /* e */ return 0x00EB;
        case 0x0069: /* i */ return 0x00EF;
        case 0x006F: /* o */ return 0x00F6;
        case 0x0075: /* u */ return 0x00FC;
        case 0x0077: /* w */ return 0x1E85;
        case 0x0079: /* y */ return 0x00FF;
        default: return 0;
        }
    case 0x02DA: /* ring */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x00C5;
        case 0x0061: /* a */ return 0x00E5;
        case 0x0055: /* U */ return 0x016E;
        case 0x0075: /* u */ return 0x016F;
        default: return 0;
        }
    case 0x00B8: /* cedilla */
        switch (base_glyph) {
        case 0x0043: /* C */ return 0x00C7;
        case 0x0063: /* c */ return 0x00E7;
        case 0x0053: /* S */ return 0x015E;
        case 0x0073: /* s */ return 0x015F;
        default: return 0;
        }
    case 0x02DB: /* ogonek */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x0104;
        case 0x0045: /* E */ return 0x0118;
        case 0x0049: /* I */ return 0x012E;
        case 0x0055: /* U */ return 0x0172;
        case 0x0061: /* a */ return 0x0105;
        case 0x0065: /* e */ return 0x0119;
        case 0x0069: /* i */ return 0x012F;
        /* NOTE(review): this returns the bare ogonek instead of a
           precomposed o-with-ogonek; left unchanged since it may be
           intentional - please confirm. */
        case 0x006F: /* o */ return 0x02DB;
        case 0x0075: /* u */ return 0x0173;
        default: return 0;
        }
    case 0x002F: /* solidus */
        switch (base_glyph) {
        case 0x004C: /* L */ return 0x0141;
        case 0x004F: /* O */ return 0x00D8;
        case 0x006C: /* l */ return 0x0142;
        case 0x006F: /* o */ return 0x00F8;
        default: return 0;
        }
    case 0x02C7: /* caron */
        switch (base_glyph) {
        case 0x0043: /* C */ return 0x010C;
        case 0x0044: /* D */ return 0x010E;
        case 0x0045: /* E */ return 0x011A;
        case 0x0047: /* G */ return 0x01E6;
        case 0x004C: /* L */ return 0x013D;
        case 0x004E: /* N */ return 0x0147;
        case 0x0052: /* R */ return 0x0158;
        case 0x0053: /* S */ return 0x0160;
        case 0x0054: /* T */ return 0x0164;
        case 0x005A: /* Z */ return 0x017D;
        case 0x0063: /* c */ return 0x010D;
        case 0x0064: /* d */ return 0x010F;
        case 0x0065: /* e */ return 0x011B;
        case 0x0067: /* g */ return 0x01E7;
        case 0x006C: /* l */ return 0x013E;
        case 0x006E: /* n */ return 0x0148;
        case 0x0072: /* r */ return 0x0159;
        case 0x0073: /* s */ return 0x0161;
        case 0x0074: /* t */ return 0x0165;
        case 0x007A: /* z */ return 0x017E;
        default: return 0;
        }
    case 0x02D8: /* breve */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x0102;
        case 0x0045: /* E */ return 0x0114;
        case 0x0047: /* G */ return 0x011E;
        case 0x0049: /* I */ return 0x012C;
        case 0x004F: /* O */ return 0x014E;
        case 0x0055: /* U */ return 0x016C;
        case 0x0061: /* a */ return 0x0103;
        case 0x0065: /* e */ return 0x0115;
        case 0x0067: /* g */ return 0x011F;
        case 0x0069: /* i */ return 0x012D;
        case 0x006F: /* o */ return 0x014F;
        case 0x0075: /* u */ return 0x016D;
        default: return 0;
        }
    case 0x02DD: /* hungarumlaut */
        switch (base_glyph) {
        case 0x004F: /* O */ return 0x0150;
        case 0x0055: /* U */ return 0x0170;
        case 0x006F: /* o */ return 0x0151;
        case 0x0075: /* u */ return 0x0171;
        default: return 0;
        }
    case 0x00AF: /* macron */
        switch (base_glyph) {
        case 0x0041: /* A */ return 0x0100;
        case 0x0045: /* E */ return 0x0112;
        case 0x0049: /* I */ return 0x012A;
        case 0x004F: /* O */ return 0x014C;
        case 0x0055: /* U */ return 0x016A;
        case 0x0061: /* a */ return 0x0101;
        case 0x0065: /* e */ return 0x0113;
        case 0x0069: /* i */ return 0x012B;
        /* NOTE(review): this returns the bare macron for base `m';
           left unchanged since it may be intentional - please confirm. */
        case 0x006D: /* m */ return 0x00AF;
        case 0x006F: /* o */ return 0x014D;
        case 0x0075: /* u */ return 0x016B;
        default: return 0;
        }
        /* special cases: accent - char inverted */
    case 0x0043: /* C; special case: cedilla is set after C in OT1 */
        switch (base_glyph) {
        case 0x00B8: /* cedilla */ return 0x00C7;
        default: return 0;
        }
    case 0x0063: /* c; see above */
        switch (base_glyph) {
        case 0x00B8: /* cedilla */ return 0x00E7;
        default: return 0;
        }
    case 0x0053: /* S; see above */
        switch (base_glyph) {
        case 0x00B8: /* cedilla */ return 0x015E;
        default: return 0;
        }
    case 0x0073: /* s; see above */
        switch (base_glyph) {
        case 0x00B8: /* cedilla */ return 0x015F;
        default: return 0;
        }
    }
    /* unknown accent */
    return 0;
}

Here is the caller graph for this function:

uint32_t guess_encoding ( wide_ubyte  ch,
const char *  fontname,
char *  retbuf 
)

Definition at line 3463 of file encodings.c.

{
    /*
     * Guess the Unicode code point for character `ch' of the font named
     * `fontname', based on the prefix of the font name.
     *
     * Returns the code point taken from the matching encoding vector, or 0
     * if there is no mapping.  For the ligatures 0x85, 0x86 and 0xA7 of the
     * ysmfrak/yswab fonts the two-letter expansion is copied into `retbuf'
     * (NUL-terminated, 3 bytes) and 0 is returned; `retbuf' is not touched
     * in any other case.
     *
     * Font names without a known prefix are assumed to be Cork-encoded; a
     * warning is printed once per such font name.
     */
    uint32_t i;
    static hashTableT unknown_font_hash;
    static Boolean hash_initialized = False;
    size_t dummy = 0;
    
    TRACE_FIND_VERBOSE((stderr, "guess_encoding: |%s|, char 0x%.4X", fontname, ch));

    /* our encoding vectors only have size 256 */
    if (ch > 255) {
        XDVI_WARNING((stderr, "guess_encoding: font index %lu too large", (unsigned long)ch));
        return 0;
    }

    /* Prefix tests use strncmp() instead of memcmp() so that font names
       shorter than the prefix are never read past their terminating NUL.
       The isdigit() arguments are cast to unsigned char, since passing a
       negative char value is undefined. */
    if (strncmp(fontname, "gbk", 3) == 0
        && isdigit((unsigned char)fontname[(i = strlen(fontname)) - 1])
        && isdigit((unsigned char)fontname[i - 2])) {
#if HAVE_ICONV_H
        unsigned char cjk[2];
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: CJK fonts (GBK encoding)"));
        i = atoi(fontname + i - 2);        /* font no */
        i = (i - 1) * 256 + (uint32_t)ch;  /* char index */
        cjk[0] = i / 190 + 129;
        cjk[1] = i % 190 + 64;
        if (cjk[1] >= 128)
            cjk[1]++;
        return cjk2unicode(cjk);
#else /* HAVE_ICONV_H */
        /* warn only once per session */
        if (!warned_about_cjk) {
            popup_message(globals.widgets.top_level,
                          MSG_WARN, NULL, "This version of xdvi has been compiled without iconv support - "
                          "cannot convert CJK character to UTF-8");
            warned_about_cjk = True;
        }
        return 0;
#endif /* HAVE_ICONV_H */
    }

    if (strncmp(fontname, "cmsy", 4) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_cm_symbol", fontname));
        return m_cm_symbol_encoding[ch];
    }
    if (strncmp(fontname, "cmmi", 4) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_cm_math_italics", fontname));
        return m_cm_math_italics_encoding[ch];
    }
    if (strncmp(fontname, "cmex", 4) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_cm_math_extended", fontname));
        return m_cm_math_extended_encoding[ch];
    }
    if (strncmp(fontname, "cmtt", 4) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_cm_typewriter", fontname));
        return m_cm_typewriter_encoding[ch];
    }
    /* following to cover cmsl, cmb, cmbx, cmti, cmdunghill, whatever ...
       hope it doesn't overgenerate ;-) */
    if (strncmp(fontname, "cm", 2) == 0
        || strncmp(fontname, "lcmss", strlen("lcmss")) == 0 /* lcmss8 etc. */
        || strncmp(fontname, "ygoth", strlen("ygoth")) == 0
        || strncmp(fontname, "yinit", strlen("yinit")) == 0
        || strncmp(fontname, "logo", strlen("logo")) == 0
        || strncmp(fontname, "rsfs", strlen("rsfs")) == 0
        ) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_ot1", fontname));
        return m_ot1_encoding[ch];
    }
    /* cyrillic fonts */
    if (strncmp(fontname, "la", 2) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_t2", fontname));
        return m_t2_encoding[ch];
    }
    if (strncmp(fontname, "ec", 2) == 0
        || strncmp(fontname, "eb", 2) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_cork", fontname));
        /* FIXME: why cork and not EC? What font actually uses EC?
           The only difference seems that dvips' EC.enc has `ldot' at 0xb8,
           whereas cork.enc has `ydieresis' there. A document with
           \usepackage[T1]{fontenc}
           also produces a ydieresis.
         */
        return m_cork_encoding[ch];
    }
    if (strncmp(fontname, "tc", 2) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_ts1", fontname));
        return m_ts1_encoding[ch];
    }
    /* blackletter fonts with funny encoding */
    if (strncmp(fontname, "ysmfrak", 7) == 0
        || strncmp(fontname, "yswab", 5) == 0) {
        TRACE_FIND_VERBOSE((stderr, "guess_encoding: %s => m_yfrak", fontname));
        /* special cases for ligatures */
        switch (ch) {
        case 0x85: strcpy(retbuf, "ch"); return 0;
        case 0x86: strcpy(retbuf, "ck"); return 0;
        case 0xA7: strcpy(retbuf, "sz"); return 0;
        default:
            /* m_yfrak_encoding has fewer than 256 entries, so the
               `ch > 255' test above is not sufficient here */
            if (ch >= sizeof m_yfrak_encoding / sizeof m_yfrak_encoding[0])
                return 0;
            return m_yfrak_encoding[ch];
        }
    }

    /* euler mathematical */
    if (strncmp(fontname, "eufm", strlen("eufm")) == 0
        || strncmp(fontname, "eusm", strlen("eusm")) == 0
        ) {
        switch (ch) {
        case 0x0: case 0x1: return 'd';
        case 0x2: case 0x3: return 'f';
        case 0x4: return 'g';
        case 0x5: return 'k';
        case 0x6: return 't';
        case 0x7: return 'u';
        default: return m_ot1_encoding[ch];
        }
    }

    /* stuff that doesn't have a good ASCII representation */
    if (strncmp(fontname, "lcircle", strlen("lcircle")) == 0
        || strncmp(fontname, "line", strlen("line")) == 0
        || strncmp(fontname, "fmvr8x", strlen("fmvr8x")) == 0
        || strncmp(fontname, "feymr", strlen("feymr")) == 0
        || strncmp(fontname, "msbm", strlen("msbm")) == 0
        || strncmp(fontname, "msam", strlen("msam")) == 0
        || strncmp(fontname, "wasy", strlen("wasy")) == 0
        || strncmp(fontname, "txsy", strlen("txsy")) == 0
        ) {
        return 0;
    }

    /* TODO:
       txfonts
    */
    
    /* default: assume cork encoding, and print out a warning for each font */
    if (!hash_initialized) {
        unknown_font_hash = hash_create(1031);
        hash_initialized = True;
    }
    /* the hash only records which font names have been warned about already */
    if (!find_str_int_hash(&unknown_font_hash, fontname, &dummy)) {
        XDVI_WARNING((stderr,
                      "guess_encoding(): nothing suitable for \"%s\", assuming Cork encoding.\n"
                      "(Please tell us about this at "
                      "http://sourceforge.net/tracker/?group_id=23164&atid=377580)", fontname));
        put_str_int_hash(&unknown_font_hash, fontname, dummy);
    }
    return m_cork_encoding[ch];
}

Here is the call graph for this function:

Here is the caller graph for this function:

Boolean is_hyphenchar ( uint32_t  u)

Definition at line 3929 of file encodings.c.

{
    /* True for the two code points treated as hyphens, False otherwise. */
    switch (u) {
    case 0x002D: /* HYPHEN-MINUS */
    case 0x00AD: /* SOFT HYPHEN */
        return True;
    default:
        return False;
    }
}

Here is the caller graph for this function:

Boolean is_ideograph ( uint32_t  u)

Definition at line 3956 of file encodings.c.

{
    /*
     * Check whether `u' lies in one of the code point ranges below
     * (both bounds inclusive).
     */
    static const struct {
        uint32_t first;
        uint32_t last;
    } ranges[] = {
        { 0x3000, 0x3002 },   /* IDEOGRAPHIC SPACE, COMMA, FULL STOP */
        { 0xFF61, 0xFF61 },   /* HALFWIDTH IDEOGRAPHIC FULL STOP */
        { 0xFF64, 0xFF64 },   /* HALFWIDTH IDEOGRAPHIC COMMA */
        { 0x3006, 0x3007 },
        { 0x3021, 0x3029 },
        { 0x3038, 0x303A },
        { 0x3400, 0x4DB5 },
        { 0x4E00, 0x9FA5 },
        { 0xF900, 0xFA2D },
        { 0x20000, 0x2A6D6 },
        { 0x2F800, 0x2FA1D }
    };
    size_t k;

    for (k = 0; k < sizeof ranges / sizeof ranges[0]; k++) {
        if (u >= ranges[k].first && u <= ranges[k].last)
            return True;
    }
    return False;
}

Here is the caller graph for this function:

void iso_8859_1_to_utf8 ( unsigned char  c,
char *  utf8,
size_t *  len 
)

Definition at line 3885 of file encodings.c.

{
    /*
     * Encode the ISO-8859-1 character `c' as UTF-8 into `utf8' (which is
     * not NUL-terminated here) and store the number of bytes written,
     * 1 or 2, in `*len'.
     */
    if (c >= 0x80) {
        /* two-byte sequence: lead byte carries the upper bits,
           continuation byte the lower six */
        utf8[0] = 0xc0 | ((c >> 6) & 0x1f);
        utf8[1] = 0x80 | (c & 0x3f);
        *len = 2;
        return;
    }
    /* plain ASCII is copied unchanged */
    utf8[0] = c;
    *len = 1;
}

Here is the caller graph for this function:

const char* search_normalize_chars ( uint32_t  unicode)

Definition at line 3616 of file encodings.c.

{
    /*
     * Return the replacement string used for `unicode' when normalizing
     * text, or NULL if the character has no replacement.
     */
    const char *replacement;

    if (unicode == 0x2212)
        replacement = "-";
    else if (unicode == 0x2022)
        replacement = "\xb7"; /* middle dot */
    else
        return NULL;

    TRACE_FIND((stderr, "expand_searchchars: 0x%X --> `%s'",
                (unsigned int)unicode, replacement));
    return replacement;
}

Here is the caller graph for this function:

int str_iso_8859_1_to_utf8 ( const char *  latin1,
char *  utf8,
size_t  len 
)

Definition at line 3905 of file encodings.c.

{
    /*
     * Convert the NUL-terminated ISO-8859-1 string `latin1' to UTF-8,
     * writing at most `len' bytes (including the terminating NUL) to `utf8'.
     *
     * Returns the number of bytes written including the terminating NUL,
     * or -1 if `utf8' is too small.
     */
    size_t used = 0;
    const char *src;

    for (src = latin1; used < len && *src != '\0'; src++) {
        char encoded[2];
        size_t n_encoded = 0;

        iso_8859_1_to_utf8((unsigned char)*src, encoded, &n_encoded);
        /* `>=' rather than `>': one byte must remain for the terminator */
        if (used + n_encoded >= len)
            return -1;
        memcpy(utf8 + used, encoded, n_encoded);
        used += n_encoded;
    }

    /* terminate utf8 */
    if (used >= len)
        return -1;
    utf8[used++] = '\0';

    return used;
}

Here is the call graph for this function:

Here is the caller graph for this function:

char* str_utf8_to_iso_8859_1 ( const char *  utf8)

Definition at line 3972 of file encodings.c.

{
    /*
     * Convert the UTF-8 string `utf8' to a newly allocated ISO-8859-1 string.
     *
     * Each character is first passed through search_normalize_chars();
     * characters in the range 0..0xFF are copied as a single byte; all
     * other characters are written as a backslash followed by the code
     * point in hex (at least 4 digits).  Bytes that cannot be decoded are
     * replaced by `?'.
     *
     * The result is obtained from xmalloc(); presumably the caller is
     * responsible for free()ing it - confirm against the callers.
     */
    size_t utf8_len = strlen(utf8), i = 0, offset = 0;
    /* Worst case per consumed input byte: a backslash plus up to 8 hex
       digits of a 32-bit value.  (The replacement strings returned by
       search_normalize_chars() are assumed not to be longer than that.) */
    char *buf = xmalloc(9 * utf8_len + 1);

    while (i < utf8_len) {
        uint32_t ucs4 = 0;
        const char *ret;

        /*  fprintf(stderr, "offset: %d\n", (int)offset); */
        /* Keep the result as int: utf8_to_ucs4() signals an illegal sequence
           with a negative value, which must not be added to `i' as size_t. */
        int len = utf8_to_ucs4(utf8 + i, &ucs4, utf8_len + 1);

        if (len <= 0) {
            /* undecodable byte: emit a placeholder and resynchronize at the
               next byte, so that the loop always makes progress */
            buf[offset++] = '?';
            i++;
            continue;
        }

        /* first apply normalization heuristics also used by search */
        if ((ret = search_normalize_chars(ucs4)) != NULL) {
            size_t len_ret = strlen(ret);
            memcpy(buf + offset, ret, len_ret);
            offset += len_ret;
        }
        else if (ucs4 <= 0xff) { /* in iso-latin1 range */
            buf[offset++] = (unsigned char)ucs4;
        }
        else {
            /* advance by what sprintf() actually wrote (backslash + at least
               4 hex digits); a fixed increment of 4 let the next character
               overwrite the last digit */
            offset += (size_t)sprintf(buf + offset, "\\%.4lX", (unsigned long)ucs4);
        }
        i += (size_t)len;
    }
    buf[offset] = '\0';
    
    return buf;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static uint32_t ucs4_lowercase ( uint32_t  c) [static]

Definition at line 3035 of file encodings.c.

{
    /*
      Return the lowercase counterpart of the code point `c' according
      to the table below, or `c' itself if the table has no entry for it.
      The names in the comments are the glyph names of the respective
      code points.
    */
    /*
      This table was produced by:
      
      cat uni2adobe | while read a b; do lc_b=`echo $b | tr 'A-Z' 'a-z'`; \
      if [ "$lc_b" != "$b" ] ; then res=`egrep " $lc_b\$" uni2adobe` ; \
      if [ -n "$res" ]; then echo "RES: $a $b -> $res"; fi; fi; done | grep 'RES' > uni2adobe-map

      And then, some Emacs keyboard macros.
     */
    switch (c) {
    case 0x0041: /* A */                return 0x0061; /* a */
    case 0x0042: /* B */                return 0x0062; /* b */
    case 0x0043: /* C */                return 0x0063; /* c */
    case 0x0044: /* D */                return 0x0064; /* d */
    case 0x0045: /* E */                return 0x0065; /* e */
    case 0x0046: /* F */                return 0x0066; /* f */
    case 0x0047: /* G */                return 0x0067; /* g */
    case 0x0048: /* H */                return 0x0068; /* h */
    case 0x0049: /* I */                return 0x0069; /* i */
    case 0x004A: /* J */                return 0x006A; /* j */
    case 0x004B: /* K */                return 0x006B; /* k */
    case 0x004C: /* L */                return 0x006C; /* l */
    case 0x004D: /* M */                return 0x006D; /* m */
    case 0x004E: /* N */                return 0x006E; /* n */
    case 0x004F: /* O */                return 0x006F; /* o */
    case 0x0050: /* P */                return 0x0070; /* p */
    case 0x0051: /* Q */                return 0x0071; /* q */
    case 0x0052: /* R */                return 0x0072; /* r */
    case 0x0053: /* S */                return 0x0073; /* s */
    case 0x0054: /* T */                return 0x0074; /* t */
    case 0x0055: /* U */                return 0x0075; /* u */
    case 0x0056: /* V */                return 0x0076; /* v */
    case 0x0057: /* W */                return 0x0077; /* w */
    case 0x0058: /* X */                return 0x0078; /* x */
    case 0x0059: /* Y */                return 0x0079; /* y */
    case 0x005A: /* Z */                return 0x007A; /* z */
    case 0x00C0: /* Agrave */           return 0x00E0; /* agrave */
    case 0x00C1: /* Aacute */           return 0x00E1; /* aacute */
    case 0x00C2: /* Acircumflex */      return 0x00E2; /* acircumflex */
    case 0x00C3: /* Atilde */           return 0x00E3; /* atilde */
    case 0x00C4: /* Adieresis */        return 0x00E4; /* adieresis */
    case 0x00C5: /* Aring */            return 0x00E5; /* aring */
    case 0x00C6: /* AE */               return 0x00E6; /* ae */
    case 0x00C7: /* Ccedilla */         return 0x00E7; /* ccedilla */
    case 0x00C8: /* Egrave */           return 0x00E8; /* egrave */
    case 0x00C9: /* Eacute */           return 0x00E9; /* eacute */
    case 0x00CA: /* Ecircumflex */      return 0x00EA; /* ecircumflex */
    case 0x00CB: /* Edieresis */        return 0x00EB; /* edieresis */
    case 0x00CC: /* Igrave */           return 0x00EC; /* igrave */
    case 0x00CD: /* Iacute */           return 0x00ED; /* iacute */
    case 0x00CE: /* Icircumflex */      return 0x00EE; /* icircumflex */
    case 0x00CF: /* Idieresis */        return 0x00EF; /* idieresis */
    case 0x00D0: /* Eth */              return 0x00F0; /* eth */
    case 0x00D1: /* Ntilde */           return 0x00F1; /* ntilde */
    case 0x00D2: /* Ograve */           return 0x00F2; /* ograve */
    case 0x00D3: /* Oacute */           return 0x00F3; /* oacute */
    case 0x00D4: /* Ocircumflex */      return 0x00F4; /* ocircumflex */
    case 0x00D5: /* Otilde */           return 0x00F5; /* otilde */
    case 0x00D6: /* Odieresis */        return 0x00F6; /* odieresis */
    case 0x00D8: /* Oslash */           return 0x00F8; /* oslash */
    case 0x00D9: /* Ugrave */           return 0x00F9; /* ugrave */
    case 0x00DA: /* Uacute */           return 0x00FA; /* uacute */
    case 0x00DB: /* Ucircumflex */      return 0x00FB; /* ucircumflex */
    case 0x00DC: /* Udieresis */        return 0x00FC; /* udieresis */
    case 0x00DD: /* Yacute */           return 0x00FD; /* yacute */
    case 0x00DE: /* Thorn */            return 0x00FE; /* thorn */
    case 0x0100: /* Amacron */          return 0x0101; /* amacron */
    case 0x0102: /* Abreve */           return 0x0103; /* abreve */
    case 0x0104: /* Aogonek */          return 0x0105; /* aogonek */
    case 0x0106: /* Cacute */           return 0x0107; /* cacute */
    case 0x0108: /* Ccircumflex */      return 0x0109; /* ccircumflex */
    case 0x010A: /* Cdotaccent */       return 0x010B; /* cdotaccent */
    case 0x010C: /* Ccaron */           return 0x010D; /* ccaron */
    case 0x010E: /* Dcaron */           return 0x010F; /* dcaron */
    case 0x0110: /* Dcroat */           return 0x0111; /* dcroat */
    case 0x0112: /* Emacron */          return 0x0113; /* emacron */
    case 0x0114: /* Ebreve */           return 0x0115; /* ebreve */
    case 0x0116: /* Edotaccent */       return 0x0117; /* edotaccent */
    case 0x0118: /* Eogonek */          return 0x0119; /* eogonek */
    case 0x011A: /* Ecaron */           return 0x011B; /* ecaron */
    case 0x011C: /* Gcircumflex */      return 0x011D; /* gcircumflex */
    case 0x011E: /* Gbreve */           return 0x011F; /* gbreve */
    case 0x0120: /* Gdotaccent */       return 0x0121; /* gdotaccent */
    case 0x0122: /* Gcommaaccent */     return 0x0123; /* gcommaaccent */
    case 0x0124: /* Hcircumflex */      return 0x0125; /* hcircumflex */
    case 0x0126: /* Hbar */             return 0x0127; /* hbar */
    case 0x0128: /* Itilde */           return 0x0129; /* itilde */
    case 0x012A: /* Imacron */          return 0x012B; /* imacron */
    case 0x012C: /* Ibreve */           return 0x012D; /* ibreve */
    case 0x012E: /* Iogonek */          return 0x012F; /* iogonek */
    case 0x0132: /* IJ */               return 0x0133; /* ij */
    case 0x0134: /* Jcircumflex */      return 0x0135; /* jcircumflex */
    case 0x0136: /* Kcommaaccent */     return 0x0137; /* kcommaaccent */
    case 0x0139: /* Lacute */           return 0x013A; /* lacute */
    case 0x013B: /* Lcommaaccent */     return 0x013C; /* lcommaaccent */
    case 0x013D: /* Lcaron */           return 0x013E; /* lcaron */
    case 0x013F: /* Ldot */             return 0x0140; /* ldot */
    case 0x0141: /* Lslash */           return 0x0142; /* lslash */
    case 0x0143: /* Nacute */           return 0x0144; /* nacute */
    case 0x0145: /* Ncommaaccent */     return 0x0146; /* ncommaaccent */
    case 0x0147: /* Ncaron */           return 0x0148; /* ncaron */
    case 0x014A: /* Eng */              return 0x014B; /* eng */
    case 0x014C: /* Omacron */          return 0x014D; /* omacron */
    case 0x014E: /* Obreve */           return 0x014F; /* obreve */
    case 0x0150: /* Ohungarumlaut */    return 0x0151; /* ohungarumlaut */
    case 0x0152: /* OE */               return 0x0153; /* oe */
    case 0x0154: /* Racute */           return 0x0155; /* racute */
    case 0x0156: /* Rcommaaccent */     return 0x0157; /* rcommaaccent */
    case 0x0158: /* Rcaron */           return 0x0159; /* rcaron */
    case 0x015A: /* Sacute */           return 0x015B; /* sacute */
    case 0x015C: /* Scircumflex */      return 0x015D; /* scircumflex */
    case 0x015E: /* Scedilla */         return 0x015F; /* scedilla */
    case 0x0160: /* Scaron */           return 0x0161; /* scaron */
    case 0x0162: /* Tcommaaccent */     return 0x0163; /* tcommaaccent */
    case 0x0164: /* Tcaron */           return 0x0165; /* tcaron */
    case 0x0166: /* Tbar */             return 0x0167; /* tbar */
    case 0x0168: /* Utilde */           return 0x0169; /* utilde */
    case 0x016A: /* Umacron */          return 0x016B; /* umacron */
    case 0x016C: /* Ubreve */           return 0x016D; /* ubreve */
    case 0x016E: /* Uring */            return 0x016F; /* uring */
    case 0x0170: /* Uhungarumlaut */    return 0x0171; /* uhungarumlaut */
    case 0x0172: /* Uogonek */          return 0x0173; /* uogonek */
    case 0x0174: /* Wcircumflex */      return 0x0175; /* wcircumflex */
    case 0x0176: /* Ycircumflex */      return 0x0177; /* ycircumflex */
    case 0x0178: /* Ydieresis */        return 0x00FF; /* ydieresis */
    case 0x0179: /* Zacute */           return 0x017A; /* zacute */
    case 0x017B: /* Zdotaccent */       return 0x017C; /* zdotaccent */
    case 0x017D: /* Zcaron */           return 0x017E; /* zcaron */
    case 0x01A0: /* Ohorn */            return 0x01A1; /* ohorn */
    case 0x01AF: /* Uhorn */            return 0x01B0; /* uhorn */
    case 0x01E6: /* Gcaron */           return 0x01E7; /* gcaron */
    case 0x01FA: /* Aringacute */       return 0x01FB; /* aringacute */
    case 0x01FC: /* AEacute */          return 0x01FD; /* aeacute */
    case 0x01FE: /* Oslashacute */      return 0x01FF; /* oslashacute */
    case 0x0218: /* Scommaaccent */     return 0x0219; /* scommaaccent */
    case 0x0386: /* Alphatonos */       return 0x03AC; /* alphatonos */
    case 0x0388: /* Epsilontonos */     return 0x03AD; /* epsilontonos */
    case 0x0389: /* Etatonos */         return 0x03AE; /* etatonos */
    case 0x038A: /* Iotatonos */        return 0x03AF; /* iotatonos */
    case 0x038C: /* Omicrontonos */     return 0x03CC; /* omicrontonos */
    case 0x038E: /* Upsilontonos */     return 0x03CD; /* upsilontonos */
    case 0x038F: /* Omegatonos */       return 0x03CE; /* omegatonos */
    case 0x0391: /* Alpha */            return 0x03B1; /* alpha */
    case 0x0392: /* Beta */             return 0x03B2; /* beta */
    case 0x0393: /* Gamma */            return 0x03B3; /* gamma */
    case 0x0395: /* Epsilon */          return 0x03B5; /* epsilon */
    case 0x0396: /* Zeta */             return 0x03B6; /* zeta */
    case 0x0397: /* Eta */              return 0x03B7; /* eta */
    case 0x0398: /* Theta */            return 0x03B8; /* theta */
    case 0x0399: /* Iota */             return 0x03B9; /* iota */
    case 0x039A: /* Kappa */            return 0x03BA; /* kappa */
    case 0x039B: /* Lambda */           return 0x03BB; /* lambda */
    /* note: the glyph name `mu' maps to 0x00B5 here, outside the Greek block used by the neighbouring entries */
    case 0x039C: /* Mu */               return 0x00B5; /* mu */
    case 0x039D: /* Nu */               return 0x03BD; /* nu */
    case 0x039E: /* Xi */               return 0x03BE; /* xi */
    case 0x039F: /* Omicron */          return 0x03BF; /* omicron */
    case 0x03A0: /* Pi */               return 0x03C0; /* pi */
    case 0x03A1: /* Rho */              return 0x03C1; /* rho */
    case 0x03A3: /* Sigma */            return 0x03C3; /* sigma */
    case 0x03A4: /* Tau */              return 0x03C4; /* tau */
    case 0x03A5: /* Upsilon */          return 0x03C5; /* upsilon */
    case 0x03A6: /* Phi */              return 0x03C6; /* phi */
    case 0x03A7: /* Chi */              return 0x03C7; /* chi */
    case 0x03A8: /* Psi */              return 0x03C8; /* psi */
    case 0x03AA: /* Iotadieresis */     return 0x03CA; /* iotadieresis */
    case 0x03AB: /* Upsilondieresis */  return 0x03CB; /* upsilondieresis */
    case 0x1E80: /* Wgrave */           return 0x1E81; /* wgrave */
    case 0x1E82: /* Wacute */           return 0x1E83; /* wacute */
    case 0x1E84: /* Wdieresis */        return 0x1E85; /* wdieresis */
    case 0x1EF2: /* Ygrave */           return 0x1EF3; /* ygrave */
    case 0x2126: /* Omega */            return 0x03C9; /* omega */
    case 0x2206: /* Delta */            return 0x03B4; /* delta */
    case 0xF6BF: /* LL */               return 0xF6C0; /* ll */
    case 0xF6C9: /* Acute */            return 0x00B4; /* acute */
    case 0xF6CA: /* Caron */            return 0x02C7; /* caron */
    case 0xF6CB: /* Dieresis */         return 0x00A8; /* dieresis */
    case 0xF6CC: /* DieresisAcute */    return 0xF6D7; /* dieresisacute */
    case 0xF6CD: /* DieresisGrave */    return 0xF6D8; /* dieresisgrave */
    case 0xF6CE: /* Grave */            return 0x0060; /* grave */
    case 0xF6CF: /* Hungarumlaut */     return 0x02DD; /* hungarumlaut */
    case 0xF6D0: /* Macron */           return 0x00AF; /* macron */
    case 0xF6D1: /* cyrBreve */         return 0xF6D4; /* cyrbreve */
    case 0xF6D2: /* cyrFlex */          return 0xF6D5; /* cyrflex */
    case 0xF6D3: /* dblGrave */         return 0xF6D6; /* dblgrave */
    /* no entry in the table: return the character unchanged */
    default: return c;
    }
}

Here is the caller graph for this function:

void ucs4_to_utf8 ( uint32_t  ucs4,
char *  utf8,
size_t *  len,
Boolean  do_lowercase 
)

Definition at line 3807 of file encodings.c.

{
    /*
     * Encode the UCS-4 value `ucs4' as a UTF-8 byte sequence.
     *
     * ucs4:         code point to encode; values up to 0x7fffffff are
     *               representable (using the historical 5- and 6-byte forms
     *               above 0x1fffff).
     * utf8:         output buffer; between 1 and 6 bytes are written, so it
     *               must have room for 6. No terminating NUL is appended.
     * len:          receives the number of bytes written, or 0 if `ucs4'
     *               cannot be encoded (in which case `utf8' is left untouched).
     * do_lowercase: if set, `ucs4' is first mapped through ucs4_lowercase().
     */
    if (do_lowercase)
        ucs4 = ucs4_lowercase(ucs4);

    if (ucs4 < 0x80)
        *len = 1;
    else if (ucs4 < 0x800)
        *len = 2;
    else if (ucs4 < 0x10000)
        *len = 3;
    else if (ucs4 < 0x200000)
        *len = 4;
    else if (ucs4 < 0x4000000)
        *len = 5;
    else if (ucs4 <= 0x7fffffff)
        *len = 6;
    else {
        /*
         * Value does not fit into 31 bits. Previously *len was left unset
         * here and the switch below read an indeterminate value; report
         * "nothing written" instead.
         */
        *len = 0;
        return;
    }

    /*
     * Emit the continuation bytes from last to first. After each 6-bit
     * chunk has been stored, a marker bit is OR-ed in that, after the
     * remaining shifts, ends up as the length prefix of the lead byte
     * (0xc0, 0xe0, 0xf0, 0xf8 or 0xfc).
     */
    switch (*len) { /* note: code falls through cases! */
    case 6:
        utf8[5] = (char)(0x80 | (ucs4 & 0x3f));
        ucs4 = (ucs4 >> 6) | 0x4000000;
        /* fall through */
    case 5:
        utf8[4] = (char)(0x80 | (ucs4 & 0x3f));
        ucs4 = (ucs4 >> 6) | 0x200000;
        /* fall through */
    case 4:
        utf8[3] = (char)(0x80 | (ucs4 & 0x3f));
        ucs4 = (ucs4 >> 6) | 0x10000;
        /* fall through */
    case 3:
        utf8[2] = (char)(0x80 | (ucs4 & 0x3f));
        ucs4 = (ucs4 >> 6) | 0x800;
        /* fall through */
    case 2:
        utf8[1] = (char)(0x80 | (ucs4 & 0x3f));
        ucs4 = (ucs4 >> 6) | 0xc0;
        /* fall through */
    case 1:
        utf8[0] = (char)ucs4;
        break;
    default:
        break;
    }
}

Here is the call graph for this function:

Here is the caller graph for this function:

Boolean utf8_lowercase ( char *  utf8)

Definition at line 3679 of file encodings.c.

{
    /*
     * Lowercase the NUL-terminated UTF-8 string `utf8' in place.
     *
     * Each character is decoded with utf8_to_ucs4(), re-encoded in lowercase
     * by ucs4_to_utf8(), and written back only if the lowercase form occupies
     * exactly as many bytes as the original.
     *
     * Returns True on success. Returns False (after reporting via XDVI_ERROR)
     * if an illegal UTF-8 sequence is found or if a lowercase form has a
     * different encoded length; characters before the offending one have
     * already been lowercased, the offending one and everything after it are
     * left unmodified.
     */
    int i = 0;

    for (; *utf8 != '\0'; utf8 += i) {
        uint32_t ucs4;
        char lowered[6];        /* longest sequence ucs4_to_utf8() can produce */
        size_t conv_len = 0;
        size_t k;

        /*
         * 6 is passed as available length regardless of what is left in the
         * string: utf8_to_ucs4() checks continuation bytes in order and stops
         * at the first one that is not valid, which includes the terminating NUL.
         */
        if ((i = utf8_to_ucs4(utf8, &ucs4, 6 /* don't care about character len */)) < 0) {
            XDVI_ERROR((stderr, "Error in utf8_lowercase: Illegal UTF-8 sequence"));
            return False;
        }

        /*
         * Encode into a scratch buffer first: some lowercase mappings yield
         * a shorter encoding than the original, and writing such a result
         * straight into the string would corrupt it before the mismatch is
         * detected.
         */
        ucs4_to_utf8(ucs4, lowered, &conv_len, True); /* lowercases it */
        if ((int)conv_len != i) {
            XDVI_ERROR((stderr, "Error in utf8_lowercase: length after UCS4 conversion (%lu)\n"
                        "differs from length after utf8 conversion(%lu) (string: %s)\n",
                        (unsigned long)conv_len, (unsigned long)i, utf8));
            return False;
        }

        for (k = 0; k < conv_len; k++)
            utf8[k] = lowered[k];
    }
    return True;
}

Here is the call graph for this function:

Here is the caller graph for this function:

unsigned char utf8_to_iso_8859_1 ( const char *  utf8,
size_t *  len 
)

Definition at line 3843 of file encodings.c.

{
    /*
     * Convert the UTF-8 sequence starting at `utf8' to a single ISO-8859-1
     * character.
     *
     * utf8: points at the first byte of a sequence inside a NUL-terminated
     *       string.
     * len:  always receives the number of input bytes consumed (at least 1
     *       and never extending past a terminating NUL), so the caller can
     *       advance by it.
     *
     * Returns the Latin-1 character, or '?' if the sequence is malformed or
     * encodes a value above 0xff.
     *
     * Note: the overlong lead bytes 0xc0/0xc1 are still decoded (to a value
     * below 0x80), as before.
     */
    const unsigned char *str = (const unsigned char *)utf8;
    unsigned char c = str[0];
    size_t seq_len;
    size_t i;
    uint32_t wc;

    if (c < 0x80) {
        *len = 1;
        return c;
    }

    /*
     * A continuation byte (0x80..0xbf) cannot start a sequence, and 0xfe/0xff
     * never occur in UTF-8. Skip exactly that one byte. (The 0xfe/0xff case
     * used to leave *len unset.)
     */
    if (c < 0xc0 || c >= 0xfe) {
        *len = 1;
        return '?';
    }

    if (c < 0xe0)
        seq_len = 2;
    else if (c < 0xf0)
        seq_len = 3;
    else if (c < 0xf8)
        seq_len = 4;
    else if (c < 0xfc)
        seq_len = 5;
    else
        seq_len = 6;

    /*
     * Make sure the announced continuation bytes are really there. For a
     * truncated sequence, only the bytes seen so far are consumed, so the
     * caller is never told to step over the terminating NUL.
     */
    for (i = 1; i < seq_len; i++) {
        if ((str[i] & 0xc0) != 0x80) {
            *len = i;
            return '?';
        }
    }
    *len = seq_len;

    /* only two-byte sequences can encode values in the Latin-1 range */
    if (seq_len != 2)
        return '?';

    wc = ((uint32_t)(c & 0x1f) << 6) | (uint32_t)(str[1] & 0x3f);
    if (wc <= 0xff)
        return (unsigned char)wc;
    return '?';
}
int utf8_to_ucs4 ( const char *  utf8,
uint32_t *  ucs4,
size_t  len 
)

Definition at line 3713 of file encodings.c.

{
    /*
     * Decode one UTF-8 sequence starting at `utf8' into *ucs4.
     *
     * `len' is the number of bytes the caller allows to be examined.
     * Returns the number of bytes the sequence occupies (1..6), or -1 if
     * the sequence is illegal, overlong, or longer than `len'. *ucs4 is
     * only written on success.
     */
    const unsigned char *bytes = (const unsigned char *)utf8;
    const unsigned char lead = bytes[0];
    size_t need;                /* sequence length announced by the lead byte */
    unsigned char payload_mask; /* bits of the lead byte carrying code point data */
    unsigned char min_second;   /* lowest non-overlong second byte when the lead
                                   byte carries no payload bits */
    uint32_t value;
    size_t k;

    /* plain ASCII */
    if (lead < 0x80) {
        *ucs4 = lead;
        return 1;
    }

    /* continuation byte or overlong 2-byte lead: illegal UTF8; shouldn't happen */
    if (lead < 0xc2)
        return -1;

    if (lead < 0xe0) {
        need = 2;
        payload_mask = 0x1f;
        min_second = 0x80;      /* unused: lead >= 0xc2 always has payload bits */
    }
    else if (lead < 0xf0) {
        need = 3;
        payload_mask = 0x0f;
        min_second = 0xa0;
    }
    else if (lead < 0xf8) {
        need = 4;
        payload_mask = 0x07;
        min_second = 0x90;
    }
    else if (lead < 0xfc) {
        need = 5;
        payload_mask = 0x03;
        min_second = 0x88;
    }
    else if (lead < 0xfe) {
        need = 6;
        payload_mask = 0x01;
        min_second = 0x84;
    }
    else {
        return -1;              /* illegal UTF8 */
    }

    /* sequences of 4 and more bytes are only accepted if the result type can hold them */
    if (need >= 4 && !(sizeof(uint32_t) * 8 >= 32))
        return -1;

    if (len < need)
        return -1;              /* len too short */

    /* every trailing byte must look like 10xxxxxx; stop at the first that doesn't */
    for (k = 1; k < need; k++) {
        if (!((bytes[k] ^ 0x80) < 0x40))
            return -1;          /* illegal UTF8; shouldn't happen */
    }

    /* reject overlong forms: no payload in the lead byte and too small a second byte */
    if ((lead & payload_mask) == 0 && bytes[1] < min_second)
        return -1;              /* illegal UTF8; shouldn't happen */

    /* assemble: payload of the lead byte first, then 6 bits per trailing byte */
    value = (uint32_t)(lead & payload_mask);
    for (k = 1; k < need; k++)
        value = (value << 6) | (uint32_t)(bytes[k] ^ 0x80);

    *ucs4 = value;
    return (int)need;
}

Here is the caller graph for this function:


Variable Documentation

struct adobe2unicode[] [static]

Definition at line 569 of file encodings.c.

uint32_t m_cm_math_extended_encoding[256] [static]

Definition at line 201 of file encodings.c.

uint32_t m_cm_math_italics_encoding[256] [static]

Definition at line 149 of file encodings.c.

uint32_t m_cm_symbol_encoding[256] [static]

Definition at line 98 of file encodings.c.

uint32_t m_cm_typewriter_encoding[256] [static]

Definition at line 253 of file encodings.c.

uint32_t m_cork_encoding[256] [static]

Definition at line 409 of file encodings.c.

uint32_t m_ot1_encoding[256] [static]

Definition at line 305 of file encodings.c.

uint32_t m_t2_encoding[256] [static]

Definition at line 358 of file encodings.c.

uint32_t m_ts1_encoding[256] [static]

Definition at line 465 of file encodings.c.

uint32_t m_yfrak_encoding[176] [static]

Definition at line 518 of file encodings.c.

Boolean warned_about_cjk = False [static]

Definition at line 73 of file encodings.c.