Back to index

courier  0.68.2
unicode.h
Go to the documentation of this file.
00001 #ifndef       unicode_h
00002 #define       unicode_h
00003 
00004 /*
00005 ** Copyright 2000-2011 Double Precision, Inc.
00006 ** See COPYING for distribution information.
00007 **
00008 */
00009 
00010 #ifdef __cplusplus
00011 
00012 #include <string>
00013 #include <vector>
00014 #include <list>
00015 
00016 extern "C" {
00017 #endif
00018 
00019 #if 0
00020 }
00021 #endif
00022 
00023 #include      "../unicode/unicode_config.h" /* VPATH build */
00024 
00025 #include      <stdlib.h>
00026 
00027 #include      <stdio.h>
00028 #if HAVE_WCHAR_H
00029 #include      <wchar.h>
00030 #endif
00031 
00032 #if HAVE_STDDEF_H
00033 #include      <stddef.h>
00034 #endif
00035 #include      <stdint.h>
00036 
00037 #include      <sys/types.h>
00038 
00039 typedef uint32_t unicode_char; /* One unicode character, held in a 32-bit unsigned integer */
00040 
00041 /*
00042 ** The system default character set, from the locale.
00043 */
00044 
00045 extern const char *unicode_default_chset(void); /* Takes no arguments; returns the system default character set, from the locale */
00046 
00047 /* Unicode upper/lower/title case conversion functions */
00048 
00049 extern unicode_char unicode_uc(unicode_char); /* Upper case conversion */
00050 extern unicode_char unicode_lc(unicode_char); /* Lower case conversion */
00051 extern unicode_char unicode_tc(unicode_char); /* Title case conversion */
00052 
00053 /*
00054 ** Look up HTML 4.0/XHTML entity.
00055 **
00056 ** n="amp", etc...
00057 **
00058 ** Returns the unicode entity value, or 0 if no such entity is defined.
00059 */
00060 
00061 unicode_char unicode_html40ent_lookup(const char *n); /* n: entity name, e.g. "amp"; 0 is returned when no such entity is defined */
00062 
00063 /*
00064 **
00065 ** Return "width" of unicode character.
00066 **
00067 ** This is defined as follows: for characters having the F or W property in
00068 ** tr11 (EastAsianWidth), unicode_wcwidth() returns 2.
00069 **
00070 ** Otherwise, for characters having the BK, CR, LF, CM, NL, WJ, or ZW line
00071 ** breaking property as per tr14, unicode_wcwidth() returns 0. In all other
00072 ** cases, it returns 1.
00073 **
00074 ** This provides a rough estimate of the "width" of the character if it's
00075 ** shown on a text console.
00076 */
00077 
00078 extern int unicode_wcwidth(unicode_char c); /* 0, 1 or 2, per the rules in the comment above */
00079 extern size_t unicode_wcwidth_str(const unicode_char *c); /* NOTE(review): presumably the total width of a 0-terminated string -- confirm */
00080 
00081 /*
00082 ** The unicode-ish isspace()
00083 */
00084 extern int unicode_isspace(unicode_char ch); /* NOTE(review): presumably non-zero for whitespace, like isspace() -- confirm */
00085 
00086 /* Internal unicode table lookup function */
00087 
00088 extern uint8_t unicode_tab_lookup(unicode_char ch, /* NOTE(review): presumably the character being looked up -- confirm */
00089                               const size_t *unicode_indextab,
00090                               size_t unicode_indextab_sizeof,
00091                               const uint8_t (*unicode_rangetab)[2],
00092                               const uint8_t *unicode_classtab,
00093                               uint8_t uclass); /* NOTE(review): the table arguments and uclass are not documented in this header; consult the implementation */
00094 
00095 /*
00096 ** Implementation of grapheme cluster boundary rules, as per tr29,
00097 ** including  GB9a and GB9b.
00098 **
00099 ** Returns non-zero if there's a grapheme break between the two referenced
00100 ** characters.
00101 */
00102 
00103 int unicode_grapheme_break(unicode_char a, unicode_char b); /* Non-zero: there is a grapheme break between a and b */
00104 
00105 /*
00106 ** Implementation of line break rules, as per tr14.
00107 **
00108 ** Invoke unicode_lb_init() to initialize the linebreaking algorithm. The
00109 ** first parameter is a callback function that gets invoked with two
00110 ** arguments: UNICODE_LB_{MANDATORY|NONE|ALLOWED}, and a passthrough argument.
00111 ** The second parameter to unicode_lb_init() is the opaque passthrough
00112 ** pointer, that is passed as the second argument to the callback function
00113 ** with no further interpretation.
00114 **
00115 ** unicode_lb_init() returns an opaque handle. Invoke unicode_lb_next(),
00116 ** passing the handle and one unicode character. Repeatedly invoke
00117 ** unicode_lb_next() to specify the input string for the linebreaking
00118 ** algorithm, then invoke unicode_lb_end() to finish calculating the
00119 ** linebreaking algorithm, and deallocate the opaque linebreaking handle.
00120 **
00121 ** The callback function gets invoked once for each invocation of
00122 ** unicode_lb_next(). The contract is that before unicode_lb_end() returns,
00123 ** the callback function will get invoked the exact number of times that
00124 ** unicode_lb_next() was invoked, as long as each invocation of the callback
00125 ** function returned 0; nothing more, nothing less. The first parameter to the
00126 ** callback function will be one of the following values:
00127 **
00128 ** UNICODE_LB_MANDATORY - a linebreak is MANDATORY before the corresponding
00129 ** character.
00130 ** UNICODE_LB_NONE - a linebreak is PROHIBITED before the corresponding
00131 ** character.
00132 ** UNICODE_LB_ALLOWED - a linebreak is OPTIONAL before the corresponding
00133 ** character (the preceding character is a space, or an equivalent).
00134 **
00135 ** The callback function should return 0. A non-zero value indicates an
00136 ** error, which gets propagated up to the caller. Once that happens, the
00137 ** contract that the callback function gets invoked the same number of times
00138 ** that unicode_lb_next() gets invoked no longer holds.
00139 */
00140 
00141 #define UNICODE_LB_MANDATORY       (-1)   /* A linebreak is mandatory before the character; parenthesized so the minus sign cannot bind to neighbouring tokens */
00142 #define UNICODE_LB_NONE            0      /* A linebreak is prohibited before the character */
00143 #define UNICODE_LB_ALLOWED  1      /* A linebreak is optional before the character */
00144 
00145 struct unicode_lb_info; /* Declared only; the members are not visible in this header */
00146 
00147 typedef struct unicode_lb_info *unicode_lb_info_t; /* The opaque linebreaking handle */
00148 
00149 /*
00150 ** Allocate a linebreaking handle.
00151 */
00152 extern unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *), /* Called with a UNICODE_LB_* value and the passthrough argument */
00153                                     void *cb_arg); /* Passed to cb_func with no further interpretation */
00154 
00155 /*
00156 ** Feed the next character through the linebreaking algorithm.
00157 ** A non-zero return code indicates that the callback function was invoked
00158 ** and it returned a non-zero return code (which is propagated as a return
00159 ** value). unicode_lb_end() must still be invoked, in this case.
00160 **
00161 ** A zero return code indicates that if the callback function was invoked,
00162 ** it returned 0.
00163 */
00164 
00165 extern int unicode_lb_next(unicode_lb_info_t i, unicode_char ch); /* i: handle from unicode_lb_init(); ch: the next input character */
00166 
00167 /*
00168 ** Convenience function that invokes unicode_lb_next() with a list of
00169 ** unicode chars. Returns 0 if all invocations of unicode_lb_next() returned
00170 ** 0, or the first non-zero return value from unicode_lb_next().
00171 */
00172 
00173 extern int unicode_lb_next_cnt(unicode_lb_info_t i, /* Handle from unicode_lb_init() */
00174                             const unicode_char *chars, /* Characters fed through unicode_lb_next() */
00175                             size_t cnt); /* NOTE(review): presumably the number of entries in chars -- confirm */
00176 
00177 /*
00178 ** Finish the linebreaking algorithm.
00179 **
00180 ** A non-zero return code indicates that the callback function was invoked
00181 ** and it returned a non-zero return code (which is propagated as a return
00182 ** value).
00183 **
00184 ** A zero return code indicates that if the callback function was invoked,
00185 ** it returned 0, and that the callback function was invoked exactly the same
00186 ** number of times that unicode_lb_next() was invoked.
00187 **
00188 ** In all cases, the linebreak handle will no longer be valid when this
00189 ** function returns.
00190 */
00191 
00192 extern int unicode_lb_end(unicode_lb_info_t i); /* The handle i is no longer valid once this returns */
00193 
00194 /*
00195 ** An alternative linebreak API where the callback function receives the
00196 ** original unicode character in addition to its linebreak value.
00197 **
00198 ** Use unicode_lbc_init(), unicode_lbc_next(), and unicode_lbc_end(), whose
00199 ** semantics are the same as their _lb_ counterparts.
00200 */
00201 
00202 struct unicode_lbc_info; /* Declared only; the members are not visible in this header */
00203 
00204 typedef struct unicode_lbc_info *unicode_lbc_info_t; /* Handle type for the unicode_lbc_*() functions */
00205 
00206 extern unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, /* The linebreak value, then the original unicode character */
00207                                                    void *), /* NOTE(review): presumably receives cb_arg, as with unicode_lb_init() -- confirm */
00208                                       void *cb_arg);
00209 extern int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch); /* Counterpart of unicode_lb_next() */
00210 extern int unicode_lbc_end(unicode_lbc_info_t i); /* Counterpart of unicode_lb_end() */
00211 
00212 /*
00213 ** Set linebreaking options.
00214 **
00215 ** OPTIONS SUBJECT TO CHANGE.
00216 */
00217 
00218 extern void unicode_lb_set_opts(unicode_lb_info_t i, int opts); /* NOTE(review): opts is presumably an OR of UNICODE_LB_OPT_* values -- confirm */
00219 
00220 extern void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts); /* Same, for a handle from unicode_lbc_init() */
00221 
00222 /*
00223 ** Tailoring of LB24: Prevent pluses, as in "C++", from breaking.
00224 **
00225 ** Adds the following to LB24:
00226 **
00227 **            PR x PR
00228 **
00229 **            AL x PR
00230 **
00231 **            ID x PR
00232 **/
00233 #define UNICODE_LB_OPT_PRBREAK 0x0001 /* Selects the LB24 additions listed above */
00234 
00235 
00236 /*
00237 ** Tailored / breaking rules.
00238 **
00239 ** Adds the following rule to LB13:
00240 **
00241 **            SY x EX
00242 **
00243 **            SY x AL
00244 **
00245 **            SY x ID
00246 **
00247 **            SP ÷ SY, which takes precedence over "x SY".
00248 */
00249 #define UNICODE_LB_OPT_SYBREAK 0x0002 /* Selects the LB13 additions listed above */
00250 
00251 /*
00252 ** Tailored / breaking rules.
00253 **
00254 ** This reclassifies U+2013 and U+2014 as class WJ, prohibiting breaks before
00255 ** and after mdash and ndash.
00256 */
00257 #define UNICODE_LB_OPT_DASHWJ 0x0004 /* Selects the reclassification of U+2013 and U+2014 as class WJ */
00258 
00259 /*
00260 ** Implementation of word break rules, as per tr29.
00261 **
00262 ** Invoke unicode_wb_init() to initialize the wordbreaking algorithm. The
00263 ** first parameter is a callback function that gets invoked with two
00264 ** arguments: an int flag, and a passthrough argument. The second parameter to
00265 ** unicode_wb_init() is the opaque passthrough pointer, that is passed as the
00266 ** second argument to the callback function with no further interpretation.
00267 **
00268 ** unicode_wb_init() returns an opaque handle. Invoke unicode_wb_next(),
00269 ** passing the handle and one unicode character. Repeatedly invoke
00270 ** unicode_wb_next() to specify the input string for the wordbreaking
00271 ** algorithm, then invoke unicode_wb_end() to finish calculating the
00272 ** wordbreaking algorithm, and deallocate the opaque wordbreaking handle.
00273 **
00274 ** The callback function gets invoked once for each invocation of
00275 ** unicode_wb_next(). The contract is that before unicode_wb_end() returns,
00276 ** the callback function will get invoked the exact number of times that
00277 ** unicode_wb_next() was invoked, as long as each invocation of the callback
00278 ** function returned 0; nothing more, nothing less. The first parameter to the
00279 ** callback function will be an int. A non-zero value indicates that there is
00280 ** a word break between this character and the preceding one.
00281 **
00282 ** The callback function should return 0. A non-zero value indicates an
00283 ** error, which gets propagated up to the caller. Once that happens, the
00284 ** contract that the callback function gets invoked the same number of times
00285 ** that unicode_wb_next() gets invoked no longer holds.
00286 */
00287 
00288 struct unicode_wb_info; /* Declared only; the members are not visible in this header */
00289 
00290 typedef struct unicode_wb_info *unicode_wb_info_t; /* The opaque wordbreaking handle */
00291 
00292 /*
00293 ** Allocate a wordbreaking handle.
00294 */
00295 extern unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *), /* Called with the word break flag and the passthrough argument */
00296                                     void *cb_arg); /* Passed to cb_func with no further interpretation */
00297 
00298 /*
00299 ** Feed the next character through the wordbreaking algorithm.
00300 ** A non-zero return code indicates that the callback function was invoked
00301 ** and it returned a non-zero return code (which is propagated as a return
00302 ** value). unicode_wb_end() must still be invoked, in this case.
00303 **
00304 ** A zero return code indicates that if the callback function was invoked,
00305 ** it returned 0.
00306 */
00307 
00308 extern int unicode_wb_next(unicode_wb_info_t i, unicode_char ch); /* i: handle from unicode_wb_init(); ch: the next input character */
00309 
00310 /*
00311 ** Convenience function that invokes unicode_wb_next() with a list of
00312 ** unicode chars. Returns 0 if all invocations of unicode_wb_next() returned
00313 ** 0, or the first non-zero return value from unicode_wb_next().
00314 */
00315 
00316 extern int unicode_wb_next_cnt(unicode_wb_info_t i, /* Handle from unicode_wb_init() */
00317                             const unicode_char *chars, /* Characters fed through unicode_wb_next() */
00318                             size_t cnt); /* NOTE(review): presumably the number of entries in chars -- confirm */
00319 
00320 /*
00321 ** Finish the wordbreaking algorithm.
00322 **
00323 ** A non-zero return code indicates that the callback function was invoked
00324 ** and it returned a non-zero return code (which is propagated as a return
00325 ** value).
00326 **
00327 ** A zero return code indicates that if the callback function was invoked,
00328 ** it returned 0, and that the callback function was invoked exactly the same
00329 ** number of times that unicode_wb_next() was invoked.
00330 **
00331 ** In all cases, the wordbreak handle will no longer be valid when this
00332 ** function returns.
00333 */
00334 
00335 extern int unicode_wb_end(unicode_wb_info_t i); /* The handle i is no longer valid once this returns */
00336 
00337 /*
00338 ** Search for a word boundary.
00339 **
00340 ** Obtain a handle by calling unicode_wbscan_init(), then invoke
00341 ** unicode_wbscan_next() to provide a unicode stream, then invoke
00342 ** unicode_wbscan_end(). unicode_wbscan_end() returns the number of unicode
00343 ** characters from the beginning of the stream until the first word boundary.
00344 **
00345 ** You may prematurely stop calling unicode_wbscan_next() once it returns a
00346 ** non-0 value, which means that there is sufficient context to compute the
00347 ** first word boundary, and all further calls to unicode_wbscan_next() will
00348 ** be internal no-ops.
00349 */
00350 
00351 struct unicode_wbscan_info; /* Declared only; the members are not visible in this header */
00352 
00353 typedef struct unicode_wbscan_info *unicode_wbscan_info_t; /* Handle type for the unicode_wbscan_*() functions */
00354 
00355 unicode_wbscan_info_t unicode_wbscan_init(void); /* Takes no arguments; returns the handle used by unicode_wbscan_next() and unicode_wbscan_end() */
00356 
00357 int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch); /* Non-zero: enough context was seen, further calls are internal no-ops */
00358 
00359 size_t unicode_wbscan_end(unicode_wbscan_info_t i); /* Number of characters from the start of the stream to the first word boundary */
00360 
00361 /*
00362 ** A buffer that holds unicode characters, and dynamically grows as needed.
00363 */
00364 
00365 struct unicode_buf {
00366        unicode_char *ptr;   /* Storage holding the unicode characters */
00367        size_t size;         /* Size of the buffer */
00368        size_t len;          /* Count of characters in ptr that are initialized */
00369        size_t max;          /* Largest size the buffer is allowed to grow to */
00370 };
00371 
00372 /*
00373 ** Initialize a buffer. Constructor.
00374 */
00375 
00376 void unicode_buf_init(/* The structure to initialize; its ptr, size and len get cleared */
00377                     struct unicode_buf *p,
00378 
00379                     /*
00380                     ** Upper limit on how large the buffer may grow. Pass
00381                     ** (size_t)-1 for an unlimited buffer.
00382                     */
00383                     size_t max);
00384 /*
00385 ** Like unicode_buf_init, and initialize the new buffer with the contents of
00386 ** another buffer. The maximum size of the initialized buffer is exactly the
00387 ** number of characters in the existing buffer. This copies a buffer using
00388 ** the minimum amount of heap space.
00389 */
00390 
00391 #define unicode_buf_init_copy(a,b)                      \
00392        do {                                             \
00393               unicode_buf_init((a), unicode_buf_len(b));       \
00394               unicode_buf_append_buf((a),(b));          \
00395        } while (0) /* NOTE(review): (a) and (b) are each expanded more than once, and the value of unicode_buf_append_buf() is discarded */
00396 
00397 /*
00398 ** Deinitialize the buffer. Destructor. Frees memory.
00399 */
00400 
00401 void unicode_buf_deinit(struct unicode_buf *p); /* Frees the memory held by the buffer p */
00402 
00403 /*
00404 ** Official way to access the characters in the unicode buffer.
00405 */
00406 #define unicode_buf_ptr(p) ((p)->ptr) /* p: pointer to a struct unicode_buf; expands to its ptr member */
00407 
00408 /*
00409 ** Official way of obtaining the number of characters in the unicode buffer.
00410 */
00411 #define unicode_buf_len(p) ((p)->len) /* p: pointer to a struct unicode_buf; expands to its len member */
00412 
00413 /*
00414 ** Remove all existing characters from an initialized buffer. Sets len to 0.
00415 */
00416 
00417 #define unicode_buf_clear(p) ((p)->len=0) /* Only len is reset; ptr, size and max are not touched */
00418 
00419 /*
00420 ** Append characters to the existing characters in the unicode buffer.
00421 ** The buffer grows, if needed. If the buffer would exceed its maximum size,
00422 ** the extra characters get truncated.
00423 **
00424 ** Returns 0 if the characters were appended. -1 for a malloc failure.
00425 */
00426 
00427 int unicode_buf_append(struct unicode_buf *p,    /* The buffer that gets appended to */
00428                      const unicode_char *uc,     /* The characters to append */
00429                      size_t l);           /* How many characters uc holds */
00430 
00431 /*
00432 ** Convert an iso-8859-1 char string and invoke unicode_buf_append().
00433 */
00434 
00435 void unicode_buf_append_char(struct unicode_buf *dst, /* The buffer that gets appended to */
00436                           const char *str, /* The iso-8859-1 characters */
00437                           size_t cnt); /* NOTE(review): presumably the number of chars in str; nothing is returned, so confirm how an append failure surfaces */
00438 
00439 /*
00440 ** Remove some portion of the unicode buffer
00441 */
00442 
00443 void unicode_buf_remove(struct unicode_buf *p, /* The buffer to remove characters from */
00444                      size_t pos, /* Offset in the buffer */
00445                      size_t cnt); /* How many characters to remove */
00446 
00447 /*
00448 ** Append the contents of an existing buffer to another one.
00449 */
00450 
00451 #define unicode_buf_append_buf(a,b)                                   \
00452        unicode_buf_append((a), unicode_buf_ptr(b), unicode_buf_len(b)) /* (b) is expanded twice; the expression has unicode_buf_append()'s value */
00453 
00454 
00455 /*
00456 ** The equivalent of strcmp() for unicode buffers.
00457 */
00458 
00459 int unicode_buf_cmp(const struct unicode_buf *a, /* First buffer to compare */
00460                   const struct unicode_buf *b); /* Second buffer to compare */
00461 
00462 /*
00463 ** The equivalent of unicode_buf_cmp, except that the second buffer is an
00464 ** iso-8859-1 string.
00465 */
00466 
00467 int unicode_buf_cmp_str(const struct unicode_buf *p, /* The unicode buffer */
00468                      const char *c,       /* The iso-8859-1 string to compare it with */
00469                      size_t cl);   /* Number of chars in c */
00470 
00471 /*
00472 ** A wrapper for iconv(3). This wrapper provides a different API for iconv(3).
00473 ** A handle gets created by libmail_u_convert_init().
00474 ** libmail_u_convert_init() receives a pointer to the output function
00475 ** which receives converted character text.
00476 **
00477 ** The output function receives a pointer to the converted character text, and
00478 ** the number of characters in the converted text.
00479 **
00480 ** The character text to convert gets passed, repeatedly, to
00481 ** libmail_u_convert(). Each call to libmail_u_convert() results in
00482 ** the output function being invoked, zero or more times, with the converted
00483 ** text. Finally, libmail_u_convert_deinit() stops the conversion and
00484 ** deallocates the conversion handle.
00485 **
00486 ** Internal buffering takes place. libmail_u_convert_deinit() may result
00487 ** in the output function being called one or more times, to receive the final
00488 ** part of the converted character stream.
00489 **
00490 ** The output function should return 0. A non-0 value causes
00491 ** libmail_u_convert() and/or libmail_u_convert_deinit() returning
00492 ** non-0.
00493 */
00494 
00495 struct libmail_u_convert_hdr; /* Declared only; the members are not visible in this header */
00496 
00497 typedef struct libmail_u_convert_hdr *libmail_u_convert_handle_t; /* The conversion handle */
00498 
00499 /*
00500 ** libmail_u_convert_init() returns a non-NULL handle for the requested
00501 ** conversion, or NULL if the requested conversion is not available.
00502 */
00503 
00504 libmail_u_convert_handle_t
00505 libmail_u_convert_init(/* Source character set: convert from this one */
00506                      const char *src_chset,
00507 
00508                      /* Destination character set: convert to this one */
00509                      const char *dst_chset,
00510 
00511                      /* Output function: receives converted text and its character count */
00512 
00513                      int (*output_func)(const char *, size_t, void *),
00514 
00515                      /* Passthrough arg (presumably output_func's void * parameter -- confirm) */
00516                      void *convert_arg);
00517 
00518 /*
00519 ** Repeatedly pass the character text to convert to libmail_u_convert().
00520 **
00521 ** Returns non-0 if the output function returned non-0, or 0 if all invocations
00522 ** of the output function returned 0.
00523 */
00524 
00525 int libmail_u_convert(/* The conversion handle, as returned by an init function */
00526                     libmail_u_convert_handle_t handle,
00527 
00528                     /* The next piece of text to convert */
00529                     const char *text,
00530 
00531                     /* Number of bytes of text to convert */
00532                     size_t cnt);
00533 
00534 /*
00535 ** Finish character set conversion. The handle gets deallocated.
00536 **
00537 ** May still result in one or more invocations of the output function.
00538 ** Returns non-zero if any previous invocation of the output function returned
00539 ** non-zero (this includes any invocations of the output function resulting
00540 ** from this call, or prior libmail_u_convert() calls), or 0 if all
00541 ** invocations of the output function returned 0.
00542 **
00543 ** If the errptr is not NULL, *errptr is set to non-zero if there were any
00544 ** conversion errors -- if there was any text that could not be converted to
00545 ** the destination character text.
00546 */
00547 
00548 int libmail_u_convert_deinit(libmail_u_convert_handle_t handle, /* Gets deallocated by this call */
00549                           int *errptr); /* May be NULL; otherwise *errptr is set to non-zero if there were conversion errors */
00550 
00551 
00552 /*
00553 ** Specialization: save converted character text in a buffer.
00554 **
00555 ** Implementation: call libmail_u_convert_tocbuf_init() instead of
00556 ** libmail_u_convert_init(), then call libmail_u_convert() and
00557 ** libmail_u_convert_deinit(), as usual.
00558 **
00559 ** If libmail_u_convert_deinit() returns 0, *cbufptr_ret gets initialized to a
00560 ** malloc()ed buffer, and the number of converted characters (the size of the
00561 ** malloc()ed buffer) is placed into *cbufsize_ret. Both are the arguments
00562 ** that were passed to libmail_u_convert_tocbuf_init().
00563 **
00564 ** Note: if the converted string is an empty string, *cbufsize_ret is set to 0,
00565 ** but *cbufptr_ret still gets initialized (to a dummy malloced buffer).
00566 **
00567 ** The optional nullterminate places a trailing \0 character after the
00568 ** converted string (this is included in *cbufsize_ret).
00569 */
00570 
00571 libmail_u_convert_handle_t
00572 libmail_u_convert_tocbuf_init(/* Source character set: convert from this one */
00573                            const char *src_chset,
00574 
00575                            /* Destination character set: convert to this one */
00576                            const char *dst_chset,
00577 
00578                            /* Receives the malloc()ed buffer */
00579                            char **cbufptr_ret,
00580 
00581                            /* Receives the size of the malloc()ed buffer */
00582                            size_t *cbufsize_ret,
00583 
00584                            /* If true, place a trailing \0 after the converted string */
00585                            int nullterminate
00586                            );
00587 
00588 
00589 /*
00590 ** Specialization: convert some character text to a unicode_char array.
00591 **
00592 ** This is like libmail_u_convert_tocbuf_init(), but converts to a unicode_char
00593 ** array.
00594 **
00595 ** The returned *ucsize_ret is initialized with the number of unicode_chars,
00596 ** rather than the byte count.
00597 **
00598 ** In all other ways, this function behaves identically to
00599 ** libmail_u_convert_tocbuf_init().
00600 */
00601 
00602 libmail_u_convert_handle_t
00603 libmail_u_convert_tou_init(/* Source character set: convert from this one */
00604                         const char *src_chset,
00605 
00606                         /* Receives the malloc()ed unicode_char buffer, on exit */
00607                         unicode_char **ucptr_ret,
00608 
00609                         /* Receives the number of unicode_chars (not bytes), on exit */
00610                         size_t *ucsize_ret,
00611 
00612                         /* If true, terminate the array with U+0000, for convenience */
00613                         int nullterminate
00614                         );
00615 
00616 /*
00617 ** Specialization: convert a unicode_char array to some character text.
00618 **
00619 ** This is the opposite of libmail_u_convert_tou_init(). Call this to
00620 ** initialize the conversion handle, then use libmail_u_convert_uc()
00621 ** instead of libmail_u_convert.
00622 */
00623 
00624 libmail_u_convert_handle_t
00625 libmail_u_convert_fromu_init(/* Destination character set: convert to this one */
00626                           const char *dst_chset,
00627 
00628                           /* Receives the malloc()ed buffer, on exit */
00629                           char **cbufptr_ret,
00630 
00631                           /* Receives the size of the malloc()ed buffer, on exit */
00632                           size_t *cbufsize_ret,
00633 
00634                           /* If true, 0-terminate the converted text, for convenience */
00635                           int nullterminate
00636                           );
00637 
00638 int libmail_u_convert_uc(/* The conversion handle, from libmail_u_convert_fromu_init() */
00639                       libmail_u_convert_handle_t handle,
00640 
00641                       /* Unicode characters to convert */
00642                       const unicode_char *text,
00643 
00644                       /* NOTE(review): documented as a byte count, but presumably a count of unicode_chars -- confirm */
00645                       size_t cnt);
00646 
00647 /*
00648 ** Initialize conversion to UTF-8.
00649 **
00650 ** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the
00651 ** destination charset as UTF-8.
00652 */
00653 
00654 libmail_u_convert_handle_t
00655 libmail_u_convert_tocbuf_toutf8_init(const char *src_chset, /* Source character set; the destination is UTF-8 */
00656                                  char **cbufptr_ret, /* As for libmail_u_convert_tocbuf_init() */
00657                                  size_t *cbufsize_ret, /* As for libmail_u_convert_tocbuf_init() */
00658                                  int nullterminate); /* As for libmail_u_convert_tocbuf_init() */
00659 
00660 /*
00661 ** Initialize conversion from UTF-8.
00662 **
00663 ** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the
00664 ** source charset as UTF-8.
00665 */
00666 
00667 libmail_u_convert_handle_t
00668 libmail_u_convert_tocbuf_fromutf8_init(const char *dst_chset, /* Destination character set; the source is UTF-8 */
00669                                    char **cbufptr_ret, /* As for libmail_u_convert_tocbuf_init() */
00670                                    size_t *cbufsize_ret, /* As for libmail_u_convert_tocbuf_init() */
00671                                    int nullterminate); /* As for libmail_u_convert_tocbuf_init() */
00672 
00673 /*
00674 ** Convert a character string to UTF-8.
00675 **
00676 ** Returns a malloc-ed buffer holding the UTF-8 string, or NULL if an
00677 ** error occurred.
00678 */
00679 char *libmail_u_convert_toutf8(/* The text that gets converted to UTF-8 */
00680                             const char *text,
00681 
00682                             /* The character set that text is in */
00683                             const char *charset,
00684 
00685                             /*
00686                             ** May be NULL. Otherwise, when a non-NULL
00687                             ** pointer is returned, *error is set to non-zero
00688                             ** if a character conversion error has occurred.
00689                             */
00690                             int *error);
00691 
00692 /*
00693 ** Convert UTF-8 text to another character set.
00694 **
00695 ** Returns a malloc-ed buffer holding the string converted to the specified
00696 ** character set, or NULL if an error occurred.
00697 */
00698 
00699 char *libmail_u_convert_fromutf8(/* The UTF-8 string to convert */
00700                              const char *text,
00701 
00702                              /*
00703                              ** The character set that the UTF-8 string
00704                              ** gets converted to.
00705                              */
00706 
00707                              const char *charset,
00708 
00709                              /*
00710                              ** May be NULL. Otherwise, when a non-NULL
00711                              ** pointer is returned, *error is set to non-zero
00712                              ** if a character conversion error has occurred.
00713                              */
00714                              int *error);
00715 
00716 /*
00717 ** Convert one charset to another charset, placing the result in a malloc-ed
00718 ** buffer.
00719 **
00720 ** Returns a malloc-ed buffer holding the string converted to the specified
00721 ** character set, or NULL if an error occurred.
00722 */
00723 
00724 char *libmail_u_convert_tobuf(/* The string to convert */
00725                            const char *text,
00726 
00727                            /*
00728                            ** The character set that the string is in.
00729                            */
00730 
00731                            const char *charset,
00732 
00733                            /*
00734                            ** The character set to convert the string to.
00735                            */
00736                            const char *dstcharset,
00737 
00738                            /*
00739                            ** May be NULL. Otherwise, when a non-NULL
00740                            ** pointer is returned, *error is set to non-zero
00741                            ** if a character conversion error has occurred.
00742                            */
00743                            int *error);
00744 
00745 /*
00746 ** Convenience function: call libmail_u_convert_tou_init(), feed the
00747 ** character string through libmail_u_convert(), then call
00748 ** libmail_u_convert_deinit().
00749 **
00750 ** If this function returns 0, *uc and *ucsize are set to a malloced
00751 ** buffer+size holding the unicode char array.
00752 */
00753 
00754 int libmail_u_convert_tou_tobuf(/* The character text to convert */
00755                             const char *text,
00756 
00757                             /* Number of characters in text */
00758                             size_t text_l,
00759 
00760                             /* The character set that text is in */
00761                             const char *charset,
00762 
00763                             /*
00764                             ** When this function returns 0, *uc has been
00765                             ** initialized to the malloced unicode char array
00766                             */
00767                             unicode_char **uc,
00768 
00769                             /*
00770                             ** Receives the size of the allocated buffer
00771                             */
00772                             size_t *ucsize,
00773 
00774                             /*
00775                             ** May be NULL. Otherwise, when this function
00776                             ** returns 0, *err is set to non-0 if there
00777                             ** was a conversion error (the output
00778                             ** buffer still gets allocated and
00779                             ** initialized in that case)
00780                             */
00781                             int *err);
00782 
00783 /*
00784 ** Convenience function: call libmail_u_convert_fromu_init(), feed the
00785 ** unicode_array through libmail_u_convert_uc(), then call
00786 ** libmail_u_convert_deinit().
00787 **
00788 ** If this function returns 0, *c and *csize are set to a malloced
00789 ** buffer+size holding the converted character string.
00790 */
00791 
00792 int libmail_u_convert_fromu_tobuf(/* The unicode array that gets converted to a char string */
00793                               const unicode_char *utext,
00794 
00795                               /*
00796                               ** Size of the unicode array. Pass
00797                               ** (size_t)-1 when utext is a
00798                               ** 0-terminated array.
00799                               */
00800                               size_t utext_l,
00801 
00802                               /*
00803                               ** The character set to convert the array to.
00804                               */
00805                               const char *charset,
00806 
00807                               /*
00808                               ** When libmail_u_convert_fromu_tobuf()
00809                               ** returns 0, *c is initialized to a
00810                               ** malloced buffer in which a 0-terminated
00811                               ** string is kept.
00812                               */
00813                               char **c,
00814 
00815                               /*
00816                               ** Receives the size of the initialized array,
00817                               ** including the 0-terminator.
00818                               */
00819                               size_t *csize,
00820 
00821                               /*
00822                               ** When libmail_u_convert_fromu_tobuf()
00823                               ** returns 0 and err is not NULL,
00824                               ** *err is set to non-0 if there was an
00825                               ** error converting to the requested
00826                               ** character set.
00827                               */
00828                               int *err);
00829 
00830 /*
00831 ** Convenience function: convert a string in a given character set
00832 ** to/from uppercase, lowercase, or something else.
00833 **
00834 ** This is done by calling libmail_u_convert_tou_tobuf() first,
00835 ** applying first_char_func and char_func, then using
00836 ** libmail_u_convert_fromu_tobuf().
00837 ** NOTE(review): the result is presumably malloced; confirm who frees it.
00838 ** A NULL return indicates that the requested conversion cannot be performed.
00839 */
00840 
00841 char *libmail_u_convert_tocase( /* String to convert */
00842                             const char *str,
00843 
00844                             /* String's character set */
00845 
00846                             const char *charset,
00847 
00848                             /*
00849                             ** Conversion of the first character in
00850                             ** str: unicode_uc, unicode_lc, or unicode_tc:
00851                             */
00852 
00853                             unicode_char (*first_char_func)(unicode_char),
00854 
00855                             /*
00856                             ** Conversion of the second and the remaining
00857                             ** characters in str. If NULL, same as
00858                             ** first_char_func.
00859                             */
00860                             unicode_char (*char_func)(unicode_char));
00861 
00862 
00863 
00864 /* Either UCS-4BE or UCS-4LE, whichever matches the native unicode_char endianness */
00865 
00866 extern const char libmail_u_ucs4_native[];
00867 
00868 /* Either UCS-2BE or UCS-2LE, chosen by the same native endianness */
00869 
00870 extern const char libmail_u_ucs2_native[];
00871 
00872 /*
00873 ** Name of the modified-UTF7 encoding used for IMAP folder names. Pass it
00874 ** as a charset parameter.
00875 **
00876 ** The name can be followed by a " " and up to 15 characters to be escaped
00877 ** in addition to unicode chars.
00878 */
00879 
00880 #define unicode_x_imap_modutf7 "x-imap-modutf7"
00881 
00882 #if 0
00883 {
00884 #endif
00885 
00886 #ifdef __cplusplus
00887 }
00888 /* C++ overload for a vector of unicode_chars; presumably its total width - confirm */
00889 extern size_t unicode_wcwidth(const std::vector<unicode_char> &uc);
00890 
00891 namespace mail {
00892 
00893        /*
00894        ** Interface to iconv.
00895        **
00896        ** Subclass iconvert and override converted(). Invoke begin(), then
00897        ** operator() repeatedly, then end().
00898        **
00899        ** converted() receives the converted text.
00900        */
00901 
00902        class iconvert {
00903               /* Conversion handle; presumably from the libmail_u_convert_* C API */
00904               libmail_u_convert_handle_t handle;
00905 
00906        public:
00907               iconvert();
00908               ~iconvert();
00909               /* NOTE(review): ~iconvert() is not virtual, although converted() is */
00910               /* Start conversion.
00911               ** Returns false if the requested conversion cannot be done.
00912               **/
00913 
00914               bool begin(/* Convert from */
00915                         const std::string &src_chset,
00916 
00917                         /* Convert to */
00918                         const std::string &dst_chset);
00919 
00920               /* Feed iconv(3). Returns false if the conversion was aborted.
00921                */
00922 
00923               bool operator()(const char *, size_t);
00924               /* Overload that is fed unicode_chars instead of chars */
00925               bool operator()(const unicode_char *, size_t);
00926 
00927               /*
00928               ** Get the results here. If the subclass returns a non-0
00929               ** value, the conversion is aborted.
00930               */
00931 
00932               virtual int converted(const char *, size_t);
00933 
00934               /*
00935               ** End of conversion.
00936               **
00937               ** Returns true if all calls to converted() returned 0,
00938               ** false if the conversion was aborted.
00939               **
00940               ** errflag is set to true if there was a character that could
00941               ** not be converted, and passed to converted().
00942               */
00943 
00944               bool end(bool &errflag)
00945               {
00946                      return end(&errflag);
00947               }
00948               /* Same as above, but no error flag is reported to the caller */
00949               bool end()
00950               {
00951                      return end(NULL);
00952               }
00953 
00954               /* Convert between two different charsets */
00955 
00956               static std::string convert(const std::string &text,
00957                                       const std::string &charset,
00958                                       const std::string &dstcharset,
00959                                       bool &errflag);
00960 
00961               /* Convert between two different charsets, discarding the error flag */
00962 
00963               static std::string convert(const std::string &text,
00964                                       const std::string &charset,
00965                                       const std::string &dstcharset)
00966               {
00967                      bool dummy;
00968 
00969                      return convert(text, charset, dstcharset, dummy);
00970               }
00971 
00972               /* Convert from unicode to a charset */
00973 
00974               static std::string convert(const std::vector<unicode_char> &uc,
00975                                       const std::string &dstcharset,
00976                                       bool &errflag);
00977 
00978               /* Convert from unicode to a charset, discarding the error flag */
00979 
00980               static std::string convert(const std::vector<unicode_char> &uc,
00981                                       const std::string &dstcharset)
00982               {
00983                      bool dummy;
00984 
00985                      return convert(uc, dstcharset, dummy);
00986               }
00987 
00988               /* Convert charset to unicode */
00989 
00990               static bool convert(const std::string &text,
00991                                 const std::string &charset,
00992                                 std::vector<unicode_char> &uc);
00993 
00994 
00995               /* Convert to upper/lower/title case, discarding the error flag */
00996 
00997               static std::string
00998                      convert_tocase(/* Text string */
00999                                    const std::string &text,
01000 
01001                                    /* Its charset */
01002                                    const std::string &charset,
01003 
01004                                    /* First character: unicode_uc, unicode_lc, or unicode_tc */
01005                                    unicode_char (*first_char_func)(unicode_char),
01006 
01007                                    /* If not NULL, second and subsequent chars */
01008                                    unicode_char (*char_func)(unicode_char)
01009                                    =NULL)
01010               {
01011                      bool dummy;
01012 
01013                      return convert_tocase(text, charset, dummy,
01014                                          first_char_func,
01015                                          char_func);
01016               }
01017 
01018               /* Convert to upper/lower/title case */
01019 
01020               static std::string
01021                      convert_tocase(/* Text string */
01022                                    const std::string &text,
01023 
01024                                    /* Its charset */
01025                                    const std::string &charset,
01026 
01027                                    /* Set if there's a conversion error */
01028                                    bool &err,
01029 
01030                                    /* First character: unicode_uc, unicode_lc, or unicode_tc */
01031                                    unicode_char (*first_char_func)(unicode_char),
01032 
01033                                    /* If not NULL, second and subsequent chars */
01034                                    unicode_char (*char_func)(unicode_char)
01035                                    =NULL);
01036        private:
01037               bool end(bool *); /* called by both public end() overloads */
01038 
01039        public:
01040               class tou;
01041               class fromu;
01042        };
01043 
01044        /* iconvert subclass that delivers its converted output as unicode_chars. */
01045 
01046        class iconvert::tou : public iconvert {
01047 
01048        public:
01049               bool begin(const std::string &chset);
01050               /* Receives converted text as unicode_chars; meant to be overridden */
01051               virtual int converted(const unicode_char *, size_t);
01052 
01053               using iconvert::operator();
01054        private:
01055               int converted(const char *ptr, size_t cnt);
01056               /* NOTE(review): presumably repackages bytes for the overload above - confirm */
01057        public:
01058               template<typename iter_t> class to_iter_class;
01059               /* Convert [from_iter, to_iter) from chset, writing to out_iter */
01060               template<typename input_iter_t,
01061                      typename output_iter_t>
01062                      static output_iter_t convert(input_iter_t from_iter,
01063                                                input_iter_t to_iter,
01064                                                const std::string &chset,
01065                                                output_iter_t out_iter);
01066               /* Same, but out_buf is cleared and then receives the result */
01067               template<typename input_iter_t>
01068                      static void convert(input_iter_t from_iter,
01069                                        input_iter_t to_iter,
01070                                        const std::string &chset,
01071                                        std::vector<unicode_char> &out_buf)
01072               {
01073                      out_buf.clear();
01074                      std::back_insert_iterator<std::vector<unicode_char> >
01075                             insert_iter(out_buf);
01076 
01077                      convert(from_iter, to_iter, chset, insert_iter);
01078               }
01079               /* Non-template overload taking the input as a std::string */
01080               static void convert(const std::string &str,
01081                                 const std::string &chset,
01082                                 std::vector<unicode_char> &out_buf);
01083        };
01084 
01085        /* iconvert::tou that writes each converted unicode_char to an output iterator */
01086 
01087        template<typename iter_t>
01088               class iconvert::tou::to_iter_class : public iconvert::tou {
01089               /* Current output position */
01090               iter_t dest;
01091        public:
01092 
01093        to_iter_class(iter_t iterValue)
01094               : dest(iterValue) {}
01095 
01096               using tou::operator();
01097               /* Hand back the output iterator, advanced past everything written */
01098               operator iter_t() const { return dest; }
01099 
01100        private:
01101               int converted(const unicode_char *ptr, size_t cnt)
01102               {
01103                      /* Copy the cnt unicode_chars at ptr to the output, in order */
01104                      for (size_t n=0; n < cnt; ++n)
01105                      {
01106                             *dest=ptr[n];
01107                             ++dest;
01108                      }
01109 
01110                      /* 0: never ask for the conversion to be aborted */
01111                      return 0;
01112               }
01113        };
01114               
01115        template<typename input_iter_t,
01116               typename output_iter_t>
01117               output_iter_t iconvert::tou::convert(input_iter_t from_iter,
01118                                                input_iter_t to_iter,
01119                                                const std::string &chset,
01120                                                output_iter_t out_iter)
01121               {
01122                      /* Converter that forwards its unicode_char output to out_iter */
01123                      class to_iter_class<output_iter_t> sink(out_iter);
01124                      if (sink.begin(chset))
01125                      {
01126                             /* Feed the input in chunks of 32 chars */
01127                             std::vector<char> chunk;
01128 
01129                             while (from_iter != to_iter)
01130                             {
01131                                    chunk.push_back(*from_iter++);
01132 
01133                                    if (chunk.size() >= 32)
01134                                    {
01135                                           sink(&chunk[0], chunk.size());
01136                                           chunk.clear();
01137                                    }
01138                             }
01139 
01140                             if (!chunk.empty())
01141                                    sink(&chunk[0], chunk.size());
01142                             sink.end();
01143                      }
01144                      return sink;
01145               }
01146               
01147        /* iconvert subclass that is fed unicode_chars and converts them to a charset. */
01148 
01149        class iconvert::fromu : public iconvert {
01150 
01151        public:
01152               bool begin(const std::string &chset);
01153 
01154               using iconvert::operator();
01155 
01156               template<typename iter_t> class to_iter_class;
01157               /* Convert unicode_chars in [from_iter, to_iter) to chset, writing to out_iter */
01158               template<typename input_iter_t,
01159                      typename output_iter_t>
01160                      static output_iter_t convert(input_iter_t from_iter,
01161                                                input_iter_t to_iter,
01162                                                const std::string &chset,
01163                                                output_iter_t out_iter);
01164               /* Same, but out_buf is emptied and then receives the result */
01165               template<typename input_iter_t>
01166                      static void convert(input_iter_t from_iter,
01167                                        input_iter_t to_iter,
01168                                        const std::string &chset,
01169                                        std::string &out_buf)
01170               {
01171                      out_buf="";
01172                      std::back_insert_iterator<std::string>
01173                             insert_iter(out_buf);
01174 
01175                      convert(from_iter, to_iter, chset, insert_iter);
01176               }
01177               /* Non-template overloads taking the input as a vector */
01178               static void convert(const std::vector<unicode_char> &ubuf,
01179                                 const std::string &chset,
01180                                 std::string &out_buf);
01181 
01182               static std::string convert(const std::vector<unicode_char>
01183                                       &ubuf,
01184                                       const std::string &chset);
01185        };
01186 
01187        /* iconvert::fromu that writes each converted char to an output iterator */
01188 
01189        template<typename iter_t>
01190               class iconvert::fromu::to_iter_class : public iconvert::fromu {
01191               /* Current output position */
01192               iter_t dest;
01193        public:
01194 
01195        to_iter_class(iter_t iterValue)
01196               : dest(iterValue) {}
01197 
01198               using fromu::operator();
01199               /* Hand back the output iterator, advanced past everything written */
01200               operator iter_t() const { return dest; }
01201 
01202        private:
01203               int converted(const char *ptr, size_t cnt)
01204               {
01205                      /* Copy the cnt chars at ptr to the output, in order */
01206                      for (size_t n=0; n < cnt; ++n)
01207                      {
01208                             *dest=ptr[n];
01209                             ++dest;
01210                      }
01211 
01212                      /* 0: never ask for the conversion to be aborted */
01213                      return 0;
01214               }
01215        };
01216               
01217        template<typename input_iter_t,
01218               typename output_iter_t>
01219               output_iter_t iconvert::fromu::convert(input_iter_t from_iter,
01220                                                  input_iter_t to_iter,
01221                                                  const std::string &chset,
01222                                                  output_iter_t out_iter)
01223               {
01224                      /* Converter that forwards its char output to out_iter */
01225                      class to_iter_class<output_iter_t> sink(out_iter);
01226                      if (sink.begin(chset))
01227                      {
01228                             /* Feed the input in chunks of 32 unicode_chars */
01229                             std::vector<unicode_char> chunk;
01230 
01231                             while (from_iter != to_iter)
01232                             {
01233                                    chunk.push_back(*from_iter++);
01234 
01235                                    if (chunk.size() >= 32)
01236                                    {
01237                                           sink(&chunk[0], chunk.size());
01238                                           chunk.clear();
01239                                    }
01240                             }
01241 
01242                             if (!chunk.empty())
01243                                    sink(&chunk[0], chunk.size());
01244                             sink.end();
01245                      }
01246                      return sink;
01247               }
01248 
01249        /*
01250        ** Unicode linebreaking algorithm, tr14.
01251        */
01252        /* C-linkage callbacks; presumably forward to the callback classes' operator() - confirm */
01253        extern "C" int linebreak_trampoline(int value, void *ptr);
01254        extern "C" int linebreakc_trampoline(int value, unicode_char ch,
01255                                         void *ptr);
01256 
01257        /*
01258        ** Subclass linebreak_callback_base, implement operator()(int).
01259        **
01260        ** Use operator<< or operator()(iterator, iterator) to feed
01261        ** unicode_chars into the linebreaking algorithm. The subclass receives
01262        ** UNICODE_LB values, as they become available.
01263        */
01264 
01265        class linebreak_callback_base {
01266 
01267               unicode_lb_info_t handle;
01268 
01269               int opts;
01270               /* Not copyable: the two members below are private and undefined */
01271               linebreak_callback_base(const linebreak_callback_base &);
01272               /* NOT IMPLEMENTED */
01273 
01274               linebreak_callback_base &operator=(const
01275                                              linebreak_callback_base &);
01276               /* NOT IMPLEMENTED */
01277 
01278        public:
01279               linebreak_callback_base();
01280               ~linebreak_callback_base();
01281               /* Call after the last unicode_char has been fed */
01282               void finish();
01283 
01284               void set_opts(int opts);
01285 
01286               friend int linebreak_trampoline(int, void *);
01287               /* Feed one unicode_char */
01288               linebreak_callback_base &operator<<(unicode_char uc);
01289               /* Feed every unicode_char in [beg_iter, end_iter) */
01290               template<typename iter_type>
01291                      linebreak_callback_base &operator()(iter_type beg_iter,
01292                                                      iter_type end_iter)
01293               {
01294                      while (beg_iter != end_iter)
01295                             operator<<(*beg_iter++);
01296                      return *this;
01297               }
01298               /* Feed the whole contents of a vector */
01299               linebreak_callback_base &operator<<(const
01300                                               std::vector<unicode_char>
01301                                               &vec)
01302               {
01303                      return operator()(vec.begin(), vec.end());
01304               }
01305        private:
01306               virtual int operator()(int);
01307        };
01308 
01309        class linebreak_callback_save_buf : public linebreak_callback_base {
01310               /* Presumably queues each received value in lb_buf - confirm */
01311        public:
01312               std::list<int> lb_buf;
01313 
01314               linebreak_callback_save_buf();
01315               ~linebreak_callback_save_buf();
01316 
01317        private:
01318               int operator()(int value);
01319        };
01320 
01321        /*
01322        ** Convert an input iterator sequence over unicode_chars into
01323        ** an input iterator sequence over linebreak values.
01324        */
01325        /* Copying transfers the state: the copied-from iterator becomes an end iterator. */
01326        template<typename input_t> class linebreak_iter
01327               : public std::iterator<std::input_iterator_tag, int, void>
01328        {
01329               mutable input_t iter_value, end_iter_value;
01330               /* Owned callback object; NULL marks the end-of-sequence iterator */
01331               mutable linebreak_callback_save_buf *buf;
01332               /* Feed input until a value is queued, or input and queue run dry */
01333               void fill() const
01334               {
01335                      if (buf == NULL)
01336                             return;
01337 
01338                      while (buf->lb_buf.empty())
01339                      {
01340                             if (iter_value == end_iter_value)
01341                             {
01342                                    buf->finish();
01343                                    if (buf->lb_buf.empty())
01344                                    {
01345                                           delete buf;
01346                                           buf=NULL;
01347                                    }
01348                                    break;
01349                             }
01350 
01351                             buf->operator<<(*iter_value++);
01352                      }
01353               }
01354               /* Value whose address the postfix operator++ returns */
01355               mutable value_type bufvalue;
01356 
01357        public:
01358               linebreak_iter(const input_t &iter_valueArg,
01359                             const input_t &iter_endvalueArg)
01360                      : iter_value(iter_valueArg),
01361                      end_iter_value(iter_endvalueArg),
01362                      buf(new linebreak_callback_save_buf)
01363                      {
01364                      }
01365               /* A default-constructed iterator is the end-of-sequence iterator */
01366               linebreak_iter() : buf(NULL)
01367               {
01368               }
01369               /* Forwarded to the callback object; no effect on an end iterator */
01370               void set_opts(int opts)
01371               {
01372                      if (buf)
01373                             buf->set_opts(opts);
01374               }
01375 
01376               ~linebreak_iter()
01377               {
01378                      if (buf)
01379                             delete buf;
01380               }
01381 
01382               linebreak_iter(const linebreak_iter<input_t> &v)
01383                      : buf(NULL)
01384               {
01385                      operator=(v);
01386               }
01387               /* Takes over v's state; v.buf is mutable, so v is reset to "end" */
01388               linebreak_iter<input_t> &operator=(const
01389                                              linebreak_iter<input_t> &v)
01390               {
01391                      if (this == &v) return *this; /* keep state on self-assignment */
01392                      delete buf; /* deleting NULL is a no-op */
01393                      buf=v.buf;
01394                      iter_value=v.iter_value;
01395                      end_iter_value=v.end_iter_value;
01396                      v.buf=NULL;
01397                      return *this;
01398               }
01399               /* True only when both iterators have reached end-of-sequence */
01400               bool operator==(const linebreak_iter<input_t> &v) const
01401               {
01402                      fill();
01403                      v.fill();
01404 
01405                      return buf == NULL && v.buf == NULL;
01406               }
01407 
01408               bool operator!=(const linebreak_iter<input_t> &v) const
01409               {
01410                      return !operator==(v);
01411               }
01412               /* Next queued value; UNICODE_LB_MANDATORY once at end-of-sequence */
01413               value_type operator*() const
01414               {
01415                      fill();
01416                      return buf == NULL ? UNICODE_LB_MANDATORY:
01417                             buf->lb_buf.front();
01418               }
01419 
01420               linebreak_iter<input_t> &operator++()
01421               {
01422                      bufvalue=operator*();
01423 
01424                      if (buf)
01425                             buf->lb_buf.pop_front();
01426                      return *this;
01427               }
01428               /* Returns a pointer to the pre-increment value, so *it++ works */
01429               const value_type *operator++(int)
01430               {
01431                      operator++();
01432                      return &bufvalue;
01433               }
01434        };
01435 
01436        /*
01437        ** Like linebreak_callback_base, except the subclass receives both
01438        ** the linebreaking value, and the unicode character.
01439        */
01440 
01441        class linebreakc_callback_base {
01442 
01443               unicode_lbc_info_t handle;
01444 
01445               int opts;
01446               /* Not copyable: the two members below are private and undefined */
01447               linebreakc_callback_base(const linebreakc_callback_base &);
01448               /* NOT IMPLEMENTED */
01449 
01450               linebreakc_callback_base &operator=(const
01451                                                linebreakc_callback_base
01452                                                &);
01453               /* NOT IMPLEMENTED */
01454 
01455 
01456        public:
01457               linebreakc_callback_base();
01458               ~linebreakc_callback_base();
01459               /* Call after the last unicode_char has been fed */
01460               void finish();
01461 
01462               void set_opts(int opts);
01463 
01464               friend int linebreakc_trampoline(int, unicode_char, void *);
01465               /* Feed one unicode_char */
01466               linebreakc_callback_base &operator<<(unicode_char uc);
01467               /* Feed every unicode_char in [beg_iter, end_iter) */
01468               template<typename iter_type>
01469                      linebreakc_callback_base &operator()(iter_type beg_iter,
01470                                                      iter_type end_iter)
01471               {
01472                      while (beg_iter != end_iter)
01473                             operator<<(*beg_iter++);
01474                      return *this;
01475               }
01476               /* Feed the whole contents of a vector */
01477               linebreakc_callback_base &operator<<(const
01478                                               std::vector<unicode_char>
01479                                               &vec)
01480               {
01481                      return operator()(vec.begin(), vec.end());
01482               }
01483        private:
01484               virtual int operator()(int, unicode_char);
01485        };
01486 
01487        class linebreakc_callback_save_buf : public linebreakc_callback_base {
01488               /* Presumably queues each received (value, char) pair in lb_buf - confirm */
01489        public:
01490               std::list<std::pair<int, unicode_char> > lb_buf;
01491 
01492               linebreakc_callback_save_buf();
01493               ~linebreakc_callback_save_buf();
01494 
01495        private:
01496               int operator()(int, unicode_char);
01497        };
01498 
01499 
01500        /*
01501        ** Convert an input iterator sequence over unicode_chars into
01502        ** an input iterator sequence over std::pair<int, unicode_char>,
01503        ** the original unicode character, and the linebreaking value before
01504        ** the character.
01505        */
01506        /* Copying transfers the state: the copied-from iterator becomes an end iterator. */
01507        template<typename input_t> class linebreakc_iter
01508               : public std::iterator<std::input_iterator_tag,
01509               std::pair<int, unicode_char>, void>
01510        {
01511               mutable input_t iter_value, end_iter_value;
01512               /* Owned callback object; NULL marks the end-of-sequence iterator */
01513               mutable linebreakc_callback_save_buf *buf;
01514               /* Feed input until a pair is queued, or input and queue run dry */
01515               void fill() const
01516               {
01517                      if (buf == NULL)
01518                             return;
01519 
01520                      while (buf->lb_buf.empty())
01521                      {
01522                             if (iter_value == end_iter_value)
01523                             {
01524                                    buf->finish();
01525                                    if (buf->lb_buf.empty())
01526                                    {
01527                                           delete buf;
01528                                           buf=NULL;
01529                                    }
01530                                    break;
01531                             }
01532 
01533                             buf->operator<<(*iter_value);
01534                             ++iter_value;
01535                      }
01536               }
01537               /* Value whose address the postfix operator++ returns */
01538               mutable value_type bufvalue;
01539 
01540        public:
01541               linebreakc_iter(const input_t &iter_valueArg,
01542                             const input_t &iter_endvalueArg)
01543                      : iter_value(iter_valueArg),
01544                      end_iter_value(iter_endvalueArg),
01545                      buf(new linebreakc_callback_save_buf)
01546                      {
01547                      }
01548               /* A default-constructed iterator is the end-of-sequence iterator */
01549               linebreakc_iter() : buf(NULL)
01550               {
01551               }
01552 
01553               ~linebreakc_iter()
01554               {
01555                      if (buf)
01556                             delete buf;
01557               }
01558 
01559               linebreakc_iter(const linebreakc_iter<input_t> &v)
01560                      : buf(NULL)
01561               {
01562                      operator=(v);
01563               }
01564               /* Takes over v's state; v.buf is mutable, so v is reset to "end" */
01565               linebreakc_iter<input_t> &operator=(const
01566                                              linebreakc_iter<input_t> &v)
01567               {
01568                      if (this == &v) return *this; /* keep state on self-assignment */
01569                      delete buf; /* deleting NULL is a no-op */
01570                      buf=v.buf;
01571                      iter_value=v.iter_value;
01572                      end_iter_value=v.end_iter_value;
01573                      v.buf=NULL;
01574                      return *this;
01575               }
01576               /* True only when both iterators have reached end-of-sequence */
01577               bool operator==(const linebreakc_iter<input_t> &v) const
01578               {
01579                      fill();
01580                      v.fill();
01581 
01582                      return buf == NULL && v.buf == NULL;
01583               }
01584 
01585               bool operator!=(const linebreakc_iter<input_t> &v) const
01586               {
01587                      return !operator==(v);
01588               }
01589               /* Next queued pair; (UNICODE_LB_MANDATORY, 0) once at end-of-sequence */
01590               value_type operator*() const
01591               {
01592                      fill();
01593                      return buf == NULL ?
01594                             std::make_pair(UNICODE_LB_MANDATORY,
01595                                           (unicode_char)0):
01596                             buf->lb_buf.front();
01597               }
01598 
01599               linebreakc_iter<input_t> &operator++()
01600               {
01601                      bufvalue=operator*();
01602 
01603                      if (buf)
01604                             buf->lb_buf.pop_front();
01605                      return *this;
01606               }
01607               /* Returns a pointer to the pre-increment value, so *it++ works */
01608               const value_type *operator++(int)
01609               {
01610                      operator++();
01611                      return &bufvalue;
01612               }
01613        };
01614 
01615 
01616        /*
01617        ** Subclass wordbreak_callback_base, implement operator()(int).
01618        **
01619        ** Use operator<< or operator()(iterator, iterator) to feed
01620        ** unicode_chars into the wordbreaking algorithm. The subclass receives
01621        ** word flags, as they become available.
01622        */
01623 
01624        extern "C" int wordbreak_trampoline(int value, void *ptr);
01625 
01626        class wordbreak_callback_base {
01627 
01628               unicode_wb_info_t handle;
01629 
01630               wordbreak_callback_base(const wordbreak_callback_base &);
01631               /* NOT IMPLEMENTED */
01632 
01633               wordbreak_callback_base &operator==(const
01634                                               wordbreak_callback_base &);
01635               /* NOT IMPLEMENTED */
01636 
01637        public:
01638               wordbreak_callback_base();
01639               ~wordbreak_callback_base();
01640 
01641               void finish();
01642 
01643               friend int wordbreak_trampoline(int, void *);
01644 
01645               wordbreak_callback_base &operator<<(unicode_char uc);
01646 
01647               template<typename iter_type>
01648                      wordbreak_callback_base &operator()(iter_type beg_iter,
01649                                                      iter_type end_iter)
01650               {
01651                      while (beg_iter != end_iter)
01652                             operator<<(*beg_iter++);
01653                      return *this;
01654               }
01655 
01656               wordbreak_callback_base &operator<<(const
01657                                               std::vector<unicode_char>
01658                                               &vec)
01659               {
01660                      return operator()(vec.begin(), vec.end());
01661               }
01662        private:
01663               virtual int operator()(bool);
01664        };
01665 
01666        /*
01667        ** A C++ wrapper for unicode_wbscan.
01668        */
01669 
01670        class wordbreakscan {
01671 
01672               unicode_wbscan_info_t handle;
01673               /* Not copyable: the two members below are private and undefined */
01674               wordbreakscan(const wordbreakscan &);
01675               /* NOT IMPLEMENTED */
01676 
01677               wordbreakscan &operator=(const wordbreakscan &);
01678               /* NOT IMPLEMENTED */
01679        public:
01680 
01681               wordbreakscan();
01682               ~wordbreakscan();
01683               /* Feed one unicode_char; meaning of the result is not shown here */
01684               bool operator<<(unicode_char uc);
01685 
01686               size_t finish();
01687        };
01688               
01689 }
01690 #endif
01691 
01692 #endif