Back to index

openldap  2.4.31
Defines | Functions | Variables
utf-8.c File Reference
#include "portable.h"
#include <stdio.h>
#include <ac/stdlib.h>
#include <ac/socket.h>
#include <ac/string.h>
#include <ac/time.h>
#include "ldap_utf8.h"
#include "ldap-int.h"
#include "ldap_defaults.h"

Go to the source code of this file.

Defines

#define c   const char
#define LDAP_UCS_UTF8LEN(c)

Functions

ber_len_t ldap_utf8_bytes (const char *p)
ber_len_t ldap_utf8_chars (const char *p)
int ldap_utf8_offset (const char *p)
int ldap_utf8_charlen (const char *p)
int ldap_utf8_charlen2 (const char *p)
ldap_ucs4_t ldap_x_utf8_to_ucs4 (const char *p)
int ldap_x_ucs4_to_utf8 (ldap_ucs4_t c, char *buf)
int ldap_ucs_to_utf8s (struct berval *ucs, int csize, struct berval *utf8s)
char * ldap_utf8_next (const char *p)
char * ldap_utf8_prev (const char *p)
int ldap_utf8_copy (char *dst, const char *src)
int ldap_utf8_isascii (const char *p)
int ldap_utf8_isdigit (const char *p)
int ldap_utf8_isxdigit (const char *p)
int ldap_utf8_isspace (const char *p)
int ldap_utf8_isalpha (const char *p)
int ldap_utf8_isalnum (const char *p)
int ldap_utf8_islower (const char *p)
int ldap_utf8_isupper (const char *p)
char * ldap_utf8_strchr (const char *str, const char *chr)
ber_len_t ldap_utf8_strcspn (const char *str, const char *set)
ber_len_t ldap_utf8_strspn (const char *str, const char *set)
char * ldap_utf8_strpbrk (const char *str, const char *set)
char * ldap_utf8_strtok (char *str, const char *sep, char **last)

Variables

const char ldap_utf8_lentab []
c ldap_utf8_mintab []

Define Documentation

#define c   const char

Definition at line 120 of file utf-8.c.

#define LDAP_UCS_UTF8LEN (   c)
Value:
c < 0 ? 0 : (c < 0x80 ? 1 : (c < 0x800 ? 2 : (c < 0x10000 ? 3 : \
       (c < 0x200000 ? 4 : (c < 0x4000000 ? 5 : 6)))))

Definition at line 222 of file utf-8.c.


Function Documentation

int ldap_ucs_to_utf8s ( struct berval *  ucs,
int  csize,
struct berval *  utf8s 
)

Definition at line 236 of file utf-8.c.

{
       /*
        * Convert a buffer of fixed-width UCS characters (csize bytes each,
        * most significant byte first) into a newly allocated,
        * NUL-terminated UTF-8 string returned through utf8s.
        *
        * Returns LDAP_SUCCESS, LDAP_INVALID_SYNTAX when LDAP_UCS_UTF8LEN
        * yields 0 for a character, or LDAP_NO_MEMORY when LDAP_MALLOC
        * fails.  On either failure utf8s is left with bv_val == NULL and
        * bv_len == 0.
        */
       unsigned char *cur, *stop;
       char *out;
       ldap_ucs4_t code;
       int n, total = 0;

       /* Start from an empty result so the early error return is clean */
       utf8s->bv_len = 0;
       utf8s->bv_val = NULL;

       cur = (unsigned char *)ucs->bv_val;

       /* Drop any trailing partial character: this mask rounds the input
        * length down to a multiple of csize when csize is a power of two */
       stop = cur + ( ucs->bv_len & ~(csize-1) );

       /* Pass 1: validate every character and add up the UTF-8 size */
       while ( cur < stop ) {
              code = *cur++;
              if ( csize > 1 ) {
                     code = ( code << 8 ) | *cur++;
              }
              /* any csize above 2 is read as four bytes per character */
              if ( csize > 2 ) {
                     code = ( code << 8 ) | *cur++;
                     code = ( code << 8 ) | *cur++;
              }
              n = LDAP_UCS_UTF8LEN(code);
              if ( n == 0 ) {
                     return LDAP_INVALID_SYNTAX;
              }
              total += n;
       }

       /* One extra byte for the terminating NUL */
       utf8s->bv_val = LDAP_MALLOC( total+1 );
       if ( utf8s->bv_val == NULL ) {
              return LDAP_NO_MEMORY;
       }
       utf8s->bv_len = total;

       /* Pass 2: decode the same characters again and emit UTF-8 */
       out = utf8s->bv_val;
       cur = (unsigned char *)ucs->bv_val;
       while ( cur < stop ) {
              code = *cur++;
              if ( csize > 1 ) {
                     code = ( code << 8 ) | *cur++;
              }
              if ( csize > 2 ) {
                     code = ( code << 8 ) | *cur++;
                     code = ( code << 8 ) | *cur++;
              }
              out += ldap_x_ucs4_to_utf8(code, out);
       }
       *out = '\0';

       return LDAP_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

ber_len_t ldap_utf8_bytes ( const char *  p)

Definition at line 46 of file utf-8.c.

{
       /* Count the bytes (not characters) before the terminating NUL */
       ber_len_t n = 0;

       while ( p[n] ) {
              n++;
       }

       return n;
}
int ldap_utf8_charlen ( const char *  p)

Definition at line 88 of file utf-8.c.

{
       /*
        * Length in bytes of the character starting at p, judged from its
        * first byte only: 1 for a byte without the high bit, otherwise
        * the ldap_utf8_lentab entry for that byte (which may be 0).
        */
       const unsigned char lead = *(const unsigned char *)p;

       if ( *p & 0x80 ) {
              /* table is indexed with the high bit stripped */
              return ldap_utf8_lentab[lead ^ 0x80];
       }

       return 1;
}
int ldap_utf8_charlen2 ( const char *  p)

Definition at line 128 of file utf-8.c.

{
       /*
        * Like LDAP_UTF8_CHARLEN, but for lengths above 2 the second byte
        * is also checked against ldap_utf8_mintab; when none of the
        * required bits is set the length is reported as 0.
        */
       int len = LDAP_UTF8_CHARLEN( p );

       /* p[1] is only read when the first byte announces 3+ bytes */
       if ( len > 2 && !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) ) {
              len = 0;
       }

       return len;
}

ber_len_t ldap_utf8_chars ( const char *  p)

Definition at line 57 of file utf-8.c.

{
       /* Count characters up to the terminating NUL by stepping with
        * LDAP_UTF8_INCR; invalid sequences are not detected here */
       ber_len_t count = 0;

       while ( *p ) {
              count++;
              LDAP_UTF8_INCR(p);
       }

       return count;
}

Here is the caller graph for this function:

int ldap_utf8_copy ( char *  dst,
const char *  src 
)

Definition at line 353 of file utf-8.c.

{
       /*
        * Copy one character from src to dst and return the number of
        * bytes copied.  The first byte is always copied; if it is not
        * ASCII, following bytes are copied while they look like
        * continuation bytes (10xxxxxx), up to 6 bytes in total.
        * dst is not NUL-terminated.
        */
       const unsigned char *bytes = (const unsigned char *) src;
       int n = 1;

       dst[0] = src[0];

       if( LDAP_UTF8_ISASCII(bytes) ) {
              return 1;
       }

       while ( n < 6 && ( bytes[n] & 0xc0 ) == 0x80 ) {
              dst[n] = src[n];
              n++;
       }

       return n;
}
int ldap_utf8_isalnum ( const char *  p)

Definition at line 436 of file utf-8.c.

{
       /* Only the first byte is examined; a non-ASCII byte yields 0,
        * otherwise the result of LDAP_ALNUM is returned */
       const unsigned ch = * (const unsigned char *) p;

       return LDAP_ASCII(ch) ? ( LDAP_ALNUM(ch) ) : 0;
}
int ldap_utf8_isalpha ( const char *  p)

Definition at line 427 of file utf-8.c.

{
       /* Only the first byte is examined; a non-ASCII byte yields 0,
        * otherwise the result of LDAP_ALPHA is returned */
       const unsigned ch = * (const unsigned char *) p;

       if( LDAP_ASCII(ch) ) {
              return LDAP_ALPHA(ch);
       }

       return 0;
}
int ldap_utf8_isascii ( const char *  p)

Definition at line 380 of file utf-8.c.

{
       /* Report LDAP_ASCII for the first byte at p */
       const unsigned first = * (const unsigned char *) p;

       return LDAP_ASCII(first);
}
int ldap_utf8_isdigit ( const char *  p)

Definition at line 386 of file utf-8.c.

{
       /* Only the first byte is examined; a non-ASCII byte yields 0,
        * otherwise the result of LDAP_DIGIT is returned */
       const unsigned ch = * (const unsigned char *) p;

       return LDAP_ASCII(ch) ? ( LDAP_DIGIT( ch ) ) : 0;
}
int ldap_utf8_islower ( const char *  p)

Definition at line 445 of file utf-8.c.

{
       /* Only the first byte is examined; a non-ASCII byte yields 0,
        * otherwise the result of LDAP_LOWER is returned */
       const unsigned ch = * (const unsigned char *) p;

       if( LDAP_ASCII(ch) ) {
              return LDAP_LOWER(ch);
       }

       return 0;
}
int ldap_utf8_isspace ( const char *  p)

Definition at line 404 of file utf-8.c.

{
       /*
        * Returns 1 when the first byte at p is one of the ASCII
        * whitespace characters space, \t, \n, \r, \v or \f; 0 otherwise
        * (including for any non-ASCII byte).
        */
       const unsigned ch = * (const unsigned char *) p;

       if( !LDAP_ASCII(ch) ) {
              return 0;
       }

       return ch == ' '  || ch == '\t' || ch == '\n' ||
              ch == '\r' || ch == '\v' || ch == '\f';
}
int ldap_utf8_isupper ( const char *  p)

Definition at line 454 of file utf-8.c.

{
       /* Only the first byte is examined; a non-ASCII byte yields 0,
        * otherwise the result of LDAP_UPPER is returned */
       const unsigned ch = * (const unsigned char *) p;

       return LDAP_ASCII(ch) ? ( LDAP_UPPER(ch) ) : 0;
}
int ldap_utf8_isxdigit ( const char *  p)

Definition at line 395 of file utf-8.c.

{
       /* Only the first byte is examined; a non-ASCII byte yields 0,
        * otherwise the result of LDAP_HEX is returned */
       const unsigned ch = * (const unsigned char *) p;

       if( LDAP_ASCII(ch) ) {
              return LDAP_HEX(ch);
       }

       return 0;
}
char* ldap_utf8_next ( const char *  p)

Definition at line 302 of file utf-8.c.

{
       /*
        * Return a pointer to the character after the one at p.  An ASCII
        * byte advances by one; otherwise following bytes are skipped
        * while they look like continuation bytes (10xxxxxx), advancing
        * by at most 6 bytes in total.
        */
       const unsigned char *bytes = (const unsigned char *) p;
       int n = 1;

       if( !LDAP_UTF8_ISASCII(bytes) ) {
              while ( n < 6 && ( bytes[n] & 0xc0 ) == 0x80 ) {
                     n++;
              }
       }

       return (char *) &p[n];
}
int ldap_utf8_offset ( const char *  p)

Definition at line 70 of file utf-8.c.

{
       /* Distance in bytes from p to the next character, as computed
        * by LDAP_UTF8_NEXT */
       const char *after = LDAP_UTF8_NEXT(p);

       return after - p;
}
char* ldap_utf8_prev ( const char *  p)

Definition at line 329 of file utf-8.c.

{
       /*
        * Return a pointer to the character preceding p by stepping back
        * over bytes that look like continuation bytes (10xxxxxx).  At
        * most five bytes are inspected; if all five are continuation
        * bytes, p - 6 is returned without being examined.  Bytes before
        * p are read, so p must not be the start of the buffer.
        */
       const unsigned char *bytes = (const unsigned char *) p;
       int back = -1;

       while ( back > -6 && ( bytes[back] & 0xc0 ) == 0x80 ) {
              back--;
       }

       return (char *) &p[back];
}
char* ldap_utf8_strchr ( const char *  str,
const char *  chr 
)

Definition at line 470 of file utf-8.c.

{
       /*
        * Find the first character in str whose decoded value equals the
        * decoded first character of chr.  Returns a pointer into str, or
        * NULL when no character matches.
        */
       const char *cur = str;

       while ( *cur != '\0' ) {
              if( ldap_x_utf8_to_ucs4( cur ) == ldap_x_utf8_to_ucs4( chr ) ) {
                     return (char *) cur;
              }
              LDAP_UTF8_INCR(cur);
       }

       return NULL;
}

Here is the call graph for this function:

ber_len_t ldap_utf8_strcspn ( const char *  str,
const char *  set 
)

Definition at line 482 of file utf-8.c.

{
       /*
        * Return the length in bytes of the leading part of str that
        * contains no character from set.  Characters are compared by
        * their decoded values.
        */
       const char *pos = str;

       while ( *pos != '\0' ) {
              const char *member;

              for( member = set; *member != '\0'; LDAP_UTF8_INCR(member) ) {
                     if( ldap_x_utf8_to_ucs4( pos ) == ldap_x_utf8_to_ucs4( member ) ) {
                            /* first character that belongs to set */
                            return pos - str;
                     }
              }

              LDAP_UTF8_INCR(pos);
       }

       /* no member of set found: the whole string counts */
       return pos - str;
}

Here is the call graph for this function:

Here is the caller graph for this function:

char* ldap_utf8_strpbrk ( const char *  str,
const char *  set 
)

Definition at line 520 of file utf-8.c.

{
       /*
        * Return a pointer to the first character in str that also occurs
        * in set (compared by decoded value), or NULL if there is none.
        */
       const char *pos = str;

       while ( *pos != '\0' ) {
              const char *member = set;

              while ( *member != '\0' ) {
                     if( ldap_x_utf8_to_ucs4( pos ) == ldap_x_utf8_to_ucs4( member ) ) {
                            return (char *) pos;
                     }
                     LDAP_UTF8_INCR(member);
              }

              LDAP_UTF8_INCR(pos);
       }

       return NULL;
}

Here is the call graph for this function:

Here is the caller graph for this function:

ber_len_t ldap_utf8_strspn ( const char *  str,
const char *  set 
)

Definition at line 499 of file utf-8.c.

{
       /*
        * Return the length in bytes of the leading part of str that
        * consists only of characters from set.  Characters are compared
        * by their decoded values.
        */
       const char *pos;

       for( pos = str; *pos != '\0'; LDAP_UTF8_INCR(pos) ) {
              const char *member = set;

              /* scan set until the current character is found or set ends */
              while ( *member != '\0' &&
                     ldap_x_utf8_to_ucs4( pos ) != ldap_x_utf8_to_ucs4( member ) ) {
                     LDAP_UTF8_INCR(member);
              }

              if( *member == '\0' ) {
                     /* current character is not in set: the span ends here */
                     break;
              }
       }

       return pos - str;
}

Here is the call graph for this function:

Here is the caller graph for this function:

char* ldap_utf8_strtok ( char *  str,
const char *  sep,
char **  last 
)

Definition at line 536 of file utf-8.c.

{
       /*
        * Reentrant tokenizer for UTF-8 strings.
        *
        * str   string to tokenize on the first call, NULL to continue
        *       from the position saved in *last
        * sep   set of separator characters
        * last  caller-provided state; must not be NULL
        *
        * Returns a pointer to the next token (the input is modified: the
        * first byte of the separator that ends the token is overwritten
        * with NUL), or NULL when there are no more tokens.
        */
       char *begin;
       char *end;

       if( last == NULL ) return NULL;

       begin = str ? str : *last;

       /* *last is set to NULL once the input is exhausted; a further
        * continuation call must keep returning NULL rather than hand
        * that NULL pointer to ldap_utf8_strspn(). */
       if( begin == NULL ) return NULL;

       /* skip any leading separators */
       begin += ldap_utf8_strspn( begin, sep );

       if( *begin == '\0' ) {
              *last = NULL;
              return NULL;
       }

       end = &begin[ ldap_utf8_strcspn( begin, sep ) ];

       if( *end != '\0' ) {
              /* locate the character after the separator before the
               * separator's first byte is overwritten */
              char *next = LDAP_UTF8_NEXT( end );
              *end = '\0';
              end = next;
       }

       *last = end;
       return begin;
}

Here is the call graph for this function:

Here is the caller graph for this function:

int ldap_x_ucs4_to_utf8 ( ldap_ucs4_t  c,
char *  buf 
)

Definition at line 167 of file utf-8.c.

{
       /*
        * Encode the code point c as UTF-8 into buf and return the number
        * of bytes written (1 to 6).  When buf is NULL nothing is written
        * and only the required length is returned.  A negative c is not
        * a valid character and yields 0.
        */
       static const unsigned char lead[] = {
              0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
       unsigned char *out = (unsigned char *) buf;
       int len, i;

       /* not a valid Unicode character */
       if ( c < 0 ) return 0;

       /* number of bytes the encoding needs */
       if( c < 0x80 ) len = 1;
       else if( c < 0x800 ) len = 2;
       else if( c < 0x10000 ) len = 3;
       else if( c < 0x200000 ) len = 4;
       else if( c < 0x4000000 ) len = 5;
       else len = 6;

       /* Just return length, don't convert */
       if( buf == NULL ) {
              return len;
       }

       /* trailing bytes carry six payload bits each, lowest bits last */
       for( i = len - 1; i > 0; i-- ) {
              out[i] = 0x80 | ( c & 0x3f );
              c >>= 6;
       }

       /* remaining high bits go into the first byte with its length marker */
       out[0] = lead[len] | c;

       return len;
}

Here is the caller graph for this function:

ldap_ucs4_t ldap_x_utf8_to_ucs4 ( const char *  p)

Definition at line 140 of file utf-8.c.

{
       /*
        * Decode the UTF-8 character at p and return its value, or
        * LDAP_UCS4_INVALID when LDAP_UTF8_CHARLEN2 reports length 0 or a
        * trailing byte is not a continuation byte (10xxxxxx).
        */
       /* payload bits of the first byte, indexed by sequence length */
       static const unsigned char lead_mask[] = {
              0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
       const unsigned char *bytes = (const unsigned char *) p;
       ldap_ucs4_t value;
       int len, k;

       len = LDAP_UTF8_CHARLEN2(p, len);

       if( len == 0 ) {
              return LDAP_UCS4_INVALID;
       }

       value = bytes[0] & lead_mask[len];

       k = 1;
       while ( k < len ) {
              if ( (bytes[k] & 0xc0) != 0x80 ) {
                     return LDAP_UCS4_INVALID;
              }

              /* append the six payload bits of this continuation byte */
              value = ( value << 6 ) | ( bytes[k] & 0x3f );
              k++;
       }

       return value;
}

Here is the caller graph for this function:


Variable Documentation

const char ldap_utf8_lentab []

Initial value:
 {
       /* Character length by first byte, indexed by (first byte ^ 0x80) as
        * in ldap_utf8_charlen(), so entry 0 corresponds to byte 0x80.  An
        * entry of 0 is reported as length 0, which ldap_x_utf8_to_ucs4()
        * treats as invalid. */
       /* bytes 0x80-0xBF (the 10xxxxxx continuation pattern): 0 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       /* bytes 0xC0-0xC1: 0; bytes 0xC2-0xDF: 2 */
       0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       /* bytes 0xE0-0xEF: 3 */
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       /* bytes 0xF0-0xF7: 4; 0xF8-0xFB: 5; 0xFC-0xFD: 6; 0xFE-0xFF: 0 */
       4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 }

Definition at line 78 of file utf-8.c.

c ldap_utf8_mintab []

Initial value:
 {
       /* Second-byte masks, indexed by (first byte & 0x1f) as in
        * ldap_utf8_charlen2(): for a sequence longer than 2 bytes,
        * (entry & second byte) must be non-zero or the length is
        * reported as 0.  `c` is the "const char" macro defined just
        * before this table. */
       /* index 0x00: 0x20; indexes 0x01-0x07: 0x80 */
       (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
       /* indexes 0x08-0x0f: 0x80 */
       (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
       /* index 0x10: 0x30; indexes 0x11-0x17: 0x80 */
       (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
       /* 0x18: 0x38; 0x19-0x1b: 0x80; 0x1c: 0x3c; 0x1d: 0x80; 0x1e-0x1f: 0x00 */
       (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 }

Definition at line 121 of file utf-8.c.