Back to index

plt-scheme  4.2.1
string.c
Go to the documentation of this file.
00001 /*
00002   MzScheme
00003   Copyright (c) 2004-2009 PLT Scheme Inc.
00004   Copyright (c) 1995-2001 Matthew Flatt
00005 
00006     This library is free software; you can redistribute it and/or
00007     modify it under the terms of the GNU Library General Public
00008     License as published by the Free Software Foundation; either
00009     version 2 of the License, or (at your option) any later version.
00010 
00011     This library is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014     Library General Public License for more details.
00015 
00016     You should have received a copy of the GNU Library General Public
00017     License along with this library; if not, write to the Free
00018     Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00019     Boston, MA 02110-1301 USA.
00020 
00021   libscheme
00022   Copyright (c) 1994 Brent Benson
00023   All rights reserved.
00024 */
00025 
00026 #include "schpriv.h"
00027 #include "schvers.h"
00028 #include <string.h>
00029 #include <ctype.h>
00030 #ifndef DONT_USE_LOCALE
00031 # include <locale.h>
00032 # ifdef MZ_NO_ICONV
00033 #  define USE_ICONV_DLL
00034 # endif
00035 # ifndef USE_ICONV_DLL
00036 #  include <iconv.h>
00037 #  include <langinfo.h>
00038 # endif
00039 # include <wchar.h>
00040 # include <wctype.h>
00041 # include <errno.h>
00042 # ifdef MACOS_UNICODE_SUPPORT
00043 #  include <CoreFoundation/CFString.h>
00044 #  include <CoreFoundation/CFLocale.h>
00045 # endif
00046 # ifdef WINDOWS_UNICODE_SUPPORT
00047 #  include <windows.h>
00048 # endif
00049 #endif
00050 
00051 #ifndef SCHEME_PLATFORM_LIBRARY_SUBPATH
00052 # include "schsys.h"
00053 #endif
00054 
00055 #include "schustr.inc"
00056 
00057 #ifdef USE_ICONV_DLL
00058 typedef long iconv_t;
00059 typedef int *(*errno_proc_t)();
00060 typedef size_t (*iconv_proc_t)(iconv_t cd,
00061                             char **inbuf, size_t *inbytesleft,
00062                             char **outbuf, size_t *outbytesleft);
00063 typedef iconv_t (*iconv_open_proc_t)(const char *tocode, const char *fromcode);
00064 typedef void (*iconv_close_proc_t)(iconv_t cd);
00065 typedef char *(*locale_charset_proc_t)();
00066 static errno_proc_t iconv_errno;
00067 static iconv_proc_t iconv;
00068 static iconv_open_proc_t iconv_open;
00069 static iconv_close_proc_t iconv_close;
00070 static locale_charset_proc_t locale_charset; /* Not used, currently */
00071 #define mzCHK_PROC(x) x
00072 static int get_iconv_errno(void)
00073 {
00074   int *a;
00075   a = iconv_errno();
00076   return *a;
00077 }
00078 # undef HAVE_CODESET
00079 # define HAVE_CODESET 1
00080 # define CODESET 0
00081 # define ICONV_errno get_iconv_errno()
00082 extern wchar_t *scheme_get_dll_path(wchar_t *s);
00083 static int iconv_ready = 0;
00084 static void init_iconv()
00085 {
00086 # ifdef MZ_NO_ICONV
00087 # else
00088   HMODULE m;
00089   m = LoadLibraryW(scheme_get_dll_path(L"iconv.dll"));
00090   if (!m)
00091     m = LoadLibraryW(scheme_get_dll_path(L"libiconv.dll"));
00092   if (!m)
00093     m = LoadLibrary("iconv.dll");
00094   if (!m)
00095     m = LoadLibrary("libiconv.dll");
00096   if (m) {
00097     iconv = (iconv_proc_t)GetProcAddress(m, "libiconv");
00098     iconv_open = (iconv_open_proc_t)GetProcAddress(m, "libiconv_open");
00099     iconv_close = (iconv_close_proc_t)GetProcAddress(m, "libiconv_close");
00100     locale_charset = (locale_charset_proc_t)GetProcAddress(m, "locale_charset");
00101     /* Make sure we have all of them or none: */
00102     if (!iconv || !iconv_open || !iconv_close) {
00103       iconv = NULL;
00104       iconv_open = NULL;
00105       iconv_close = NULL;
00106     }
00107   }
00108   if (iconv) {
00109     iconv_errno = (errno_proc_t)GetProcAddress(m, "_errno");
00110     if (!iconv_errno) {
00111       /* The iconv.dll distributed with PLT Scheme links to msvcrt.dll.
00112         It's a slighly dangerous assumption that whaetever iconv we
00113         found also uses msvcrt.dll. */
00114       m = LoadLibrary("msvcrt.dll");
00115       if (m) {
00116        iconv_errno = (errno_proc_t)GetProcAddress(m, "_errno");
00117        if (!iconv_errno) {
00118          iconv = NULL;
00119          iconv_open = NULL;
00120          iconv_close = NULL;
00121        }
00122       }
00123     }
00124   }
00125 # endif
00126   iconv_ready = 1;
00127 }
00128 #else
00129 # define ICONV_errno errno
00130 # define iconv_ready 1
00131 # define mzCHK_PROC(x) 1
00132 static void init_iconv() { }
00133 #endif
00134 
00135 #ifdef MACOS_UNICODE_SUPPORT
00136 # define mzLOCALE_IS_UTF_8(s) (!s || !(*s))
00137 #endif
00138 #ifdef WINDOWS_UNICODE_SUPPORT
00139 # define mzLOCALE_IS_UTF_8(s) (!s || !(*s))
00140 #endif
00141 #ifndef mzLOCALE_IS_UTF_8
00142 # define mzLOCALE_IS_UTF_8(s) !mzCHK_PROC(iconv_open)
00143 #endif
00144 
00145 #define mzICONV_KIND 0
00146 #define mzUTF8_KIND 1
00147 #define mzUTF8_TO_UTF16_KIND 2
00148 #define mzUTF16_TO_UTF8_KIND 3
00149 
00150 typedef struct Scheme_Converter {
00151   Scheme_Object so;
00152   short closed;
00153   short kind;
00154   iconv_t cd;
00155   int permissive;
00156   Scheme_Custodian_Reference *mref;
00157 } Scheme_Converter;
00158 
00159 /* locals */
00160 
00161 /* These two locale variables are only valid when reset_locale()
00162    is called after continuation marks (and hence parameterization)
00163    may have changed. Similarly, setlocale() is only up-to-date
00164    when reset_locale() has been called. */
00165 static int locale_on;
00166 static const mzchar *current_locale_name = (mzchar *)"xxxx\0\0\0\0";
00167 static void reset_locale(void);
00168 
00169 #ifdef USE_ICONV_DLL
00170 static char *nl_langinfo(int which)
00171 {
00172   int i;
00173 
00174   reset_locale();
00175   if (!current_locale_name)
00176     current_locale_name = (mzchar *)"\0\0\0\0";
00177 
00178   if ((current_locale_name[0] == 'C')
00179       && !current_locale_name[1])
00180     return "US-ASCII";
00181 
00182   for (i = 0; current_locale_name[i]; i++) {
00183     if (current_locale_name[i] == '.') {
00184       if (current_locale_name[i + 1]) {
00185        int len, j = 0;
00186        char *enc;
00187        i++;
00188        len = scheme_char_strlen(current_locale_name) - i;
00189        enc = (char *)scheme_malloc_atomic(len + 1);
00190        while (current_locale_name[i]) {
00191          if (current_locale_name[i] > 127)
00192            return "UTF-8";
00193          enc[j++] = current_locale_name[i++];
00194        }
00195        enc[j] = 0;
00196        return enc;
00197       }
00198     }
00199   }
00200 
00201   return "UTF-8";
00202 }
00203 #endif
00204 
00205 #ifdef DONT_USE_LOCALE
00206 # define mz_iconv_nl_langinfo() ""
00207 #else
00208 static char *mz_iconv_nl_langinfo(){
00209   char *s;
00210 # if HAVE_CODESET
00211   s = nl_langinfo(CODESET);
00212 # else
00213   s = NULL;
00214 # endif
00215   if (!s)
00216     return "";
00217   else
00218     return s;
00219 }
00220 #endif
00221 
00222 static const char * const STRING_IS_NOT_UTF_8 = "string is not a well-formed UTF-8 encoding: ";
00223 
00224 static Scheme_Object *make_string (int argc, Scheme_Object *argv[]);
00225 static Scheme_Object *string (int argc, Scheme_Object *argv[]);
00226 static Scheme_Object *string_p (int argc, Scheme_Object *argv[]);
00227 static Scheme_Object *string_length (int argc, Scheme_Object *argv[]);
00228 static Scheme_Object *string_eq (int argc, Scheme_Object *argv[]);
00229 static Scheme_Object *string_locale_eq (int argc, Scheme_Object *argv[]);
00230 static Scheme_Object *string_ci_eq (int argc, Scheme_Object *argv[]);
00231 static Scheme_Object *string_locale_ci_eq (int argc, Scheme_Object *argv[]);
00232 static Scheme_Object *string_lt (int argc, Scheme_Object *argv[]);
00233 static Scheme_Object *string_locale_lt (int argc, Scheme_Object *argv[]);
00234 static Scheme_Object *string_gt (int argc, Scheme_Object *argv[]);
00235 static Scheme_Object *string_locale_gt (int argc, Scheme_Object *argv[]);
00236 static Scheme_Object *string_lt_eq (int argc, Scheme_Object *argv[]);
00237 static Scheme_Object *string_gt_eq (int argc, Scheme_Object *argv[]);
00238 static Scheme_Object *string_ci_lt (int argc, Scheme_Object *argv[]);
00239 static Scheme_Object *string_locale_ci_lt (int argc, Scheme_Object *argv[]);
00240 static Scheme_Object *string_ci_gt (int argc, Scheme_Object *argv[]);
00241 static Scheme_Object *string_locale_ci_gt (int argc, Scheme_Object *argv[]);
00242 static Scheme_Object *string_ci_lt_eq (int argc, Scheme_Object *argv[]);
00243 static Scheme_Object *string_ci_gt_eq (int argc, Scheme_Object *argv[]);
00244 static Scheme_Object *string_upcase (int argc, Scheme_Object *argv[]);
00245 static Scheme_Object *string_downcase (int argc, Scheme_Object *argv[]);
00246 static Scheme_Object *string_titlecase (int argc, Scheme_Object *argv[]);
00247 static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[]);
00248 static Scheme_Object *string_locale_upcase (int argc, Scheme_Object *argv[]);
00249 static Scheme_Object *string_locale_downcase (int argc, Scheme_Object *argv[]);
00250 static Scheme_Object *substring (int argc, Scheme_Object *argv[]);
00251 static Scheme_Object *string_append (int argc, Scheme_Object *argv[]);
00252 static Scheme_Object *string_to_list (int argc, Scheme_Object *argv[]);
00253 static Scheme_Object *list_to_string (int argc, Scheme_Object *argv[]);
00254 static Scheme_Object *string_copy (int argc, Scheme_Object *argv[]);
00255 static Scheme_Object *string_copy_bang (int argc, Scheme_Object *argv[]);
00256 static Scheme_Object *string_fill (int argc, Scheme_Object *argv[]);
00257 static Scheme_Object *string_to_immutable (int argc, Scheme_Object *argv[]);
00258 static Scheme_Object *string_normalize_c (int argc, Scheme_Object *argv[]);
00259 static Scheme_Object *string_normalize_kc (int argc, Scheme_Object *argv[]);
00260 static Scheme_Object *string_normalize_d (int argc, Scheme_Object *argv[]);
00261 static Scheme_Object *string_normalize_kd (int argc, Scheme_Object *argv[]);
00262 
00263 static Scheme_Object *make_byte_string (int argc, Scheme_Object *argv[]);
00264 static Scheme_Object *byte_string (int argc, Scheme_Object *argv[]);
00265 static Scheme_Object *byte_p (int argc, Scheme_Object *argv[]);
00266 static Scheme_Object *byte_string_p (int argc, Scheme_Object *argv[]);
00267 static Scheme_Object *byte_string_length (int argc, Scheme_Object *argv[]);
00268 static Scheme_Object *byte_string_eq (int argc, Scheme_Object *argv[]);
00269 static Scheme_Object *byte_string_lt (int argc, Scheme_Object *argv[]);
00270 static Scheme_Object *byte_string_gt (int argc, Scheme_Object *argv[]);
00271 static Scheme_Object *byte_substring (int argc, Scheme_Object *argv[]);
00272 static Scheme_Object *byte_string_append (int argc, Scheme_Object *argv[]);
00273 static Scheme_Object *byte_string_to_list (int argc, Scheme_Object *argv[]);
00274 static Scheme_Object *list_to_byte_string (int argc, Scheme_Object *argv[]);
00275 static Scheme_Object *byte_string_copy (int argc, Scheme_Object *argv[]);
00276 static Scheme_Object *byte_string_copy_bang (int argc, Scheme_Object *argv[]);
00277 static Scheme_Object *byte_string_fill (int argc, Scheme_Object *argv[]);
00278 static Scheme_Object *byte_string_to_immutable (int argc, Scheme_Object *argv[]);
00279 
00280 static Scheme_Object *byte_string_utf8_index (int argc, Scheme_Object *argv[]);
00281 static Scheme_Object *byte_string_utf8_ref (int argc, Scheme_Object *argv[]);
00282 static Scheme_Object *byte_string_utf8_length (int argc, Scheme_Object *argv[]);
00283 
00284 static Scheme_Object *byte_string_to_char_string (int argc, Scheme_Object *argv[]);
00285 static Scheme_Object *byte_string_to_char_string_locale (int argc, Scheme_Object *argv[]);
00286 static Scheme_Object *byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[]);
00287 static Scheme_Object *char_string_to_byte_string (int argc, Scheme_Object *argv[]);
00288 static Scheme_Object *char_string_to_byte_string_locale (int argc, Scheme_Object *argv[]);
00289 static Scheme_Object *char_string_to_byte_string_latin1 (int argc, Scheme_Object *argv[]);
00290 static Scheme_Object *char_string_utf8_length (int argc, Scheme_Object *argv[]);
00291 
00292 static Scheme_Object *version(int argc, Scheme_Object *argv[]);
00293 static Scheme_Object *format(int argc, Scheme_Object *argv[]);
00294 static Scheme_Object *sch_printf(int argc, Scheme_Object *argv[]);
00295 static Scheme_Object *sch_fprintf(int argc, Scheme_Object *argv[]);
00296 static Scheme_Object *banner(int argc, Scheme_Object *argv[]);
00297 static Scheme_Object *sch_getenv(int argc, Scheme_Object *argv[]);
00298 static Scheme_Object *sch_putenv(int argc, Scheme_Object *argv[]);
00299 static Scheme_Object *system_type(int argc, Scheme_Object *argv[]);
00300 static Scheme_Object *system_library_subpath(int argc, Scheme_Object *argv[]);
00301 static Scheme_Object *cmdline_args(int argc, Scheme_Object *argv[]);
00302 static Scheme_Object *current_locale(int argc, Scheme_Object *argv[]);
00303 static Scheme_Object *locale_string_encoding(int argc, Scheme_Object *argv[]);
00304 static Scheme_Object *system_language_country(int argc, Scheme_Object *argv[]);
00305 
00306 static Scheme_Object *byte_string_open_converter(int argc, Scheme_Object *argv[]);
00307 static Scheme_Object *byte_string_close_converter(int argc, Scheme_Object *argv[]);
00308 static Scheme_Object *byte_string_convert(int argc, Scheme_Object *argv[]);
00309 static Scheme_Object *byte_string_convert_end(int argc, Scheme_Object *argv[]);
00310 static Scheme_Object *byte_converter_p(int argc, Scheme_Object *argv[]);
00311 
00312 #ifdef MZ_PRECISE_GC
00313 static void register_traversers(void);
00314 #endif
00315 
00316 static int mz_char_strcmp(const char *who, const mzchar *str1, int l1, const mzchar *str2, int l2, int locale, int size_shortcut);
00317 static int mz_char_strcmp_ci(const char *who, const mzchar *str1, int l1, const mzchar *str2, int l2, int locale, int size_shortcut);
00318 static int mz_strcmp(const char *who, unsigned char *str1, int l1, unsigned char *str2, int l2);
00319 
00320 XFORM_NONGCING static int utf8_decode_x(const unsigned char *s, int start, int end,
00321                                    unsigned int *us, int dstart, int dend,
00322                                    long *ipos, long *jpos,
00323                                    char compact, char utf16,
00324                                    int *state, int might_continue, int permissive);
00325 XFORM_NONGCING static int utf8_encode_x(const unsigned int *us, int start, int end,
00326                                    unsigned char *s, int dstart, int dend,
00327                                    long *_ipos, long *_opos, char utf16);
00328 
00329 static char *string_to_from_locale(int to_bytes,
00330                                char *in, int delta, int len,
00331                                long *olen, int perm,
00332                                int *no_cvt);
00333 
00334 #define portable_isspace(x) (((x) < 128) && isspace(x))
00335 
00336 static Scheme_Object *sys_symbol;
00337 static Scheme_Object *platform_3m_path, *platform_cgc_path;
00338 static Scheme_Object *zero_length_char_string;
00339 static Scheme_Object *zero_length_byte_string;
00340 
00341 static Scheme_Hash_Table *putenv_str_table;
00342 
00343 static char *embedding_banner;
00344 static Scheme_Object *vers_str, *banner_str;
00345 
00346 static Scheme_Object *complete_symbol, *continues_symbol, *aborts_symbol, *error_symbol;
00347 
00348 void
00349 scheme_init_string (Scheme_Env *env)
00350 {
00351   Scheme_Object *p;
00352 
00353   REGISTER_SO(sys_symbol);
00354   sys_symbol = scheme_intern_symbol(SYSTEM_TYPE_NAME);
00355 
00356   REGISTER_SO(zero_length_char_string);
00357   REGISTER_SO(zero_length_byte_string);
00358   zero_length_char_string = scheme_alloc_char_string(0, 0);
00359   zero_length_byte_string = scheme_alloc_byte_string(0, 0);
00360 
00361   REGISTER_SO(complete_symbol);
00362   REGISTER_SO(continues_symbol);
00363   REGISTER_SO(aborts_symbol);
00364   REGISTER_SO(error_symbol);
00365   complete_symbol = scheme_intern_symbol("complete");
00366   continues_symbol = scheme_intern_symbol("continues");
00367   aborts_symbol = scheme_intern_symbol("aborts");
00368   error_symbol = scheme_intern_symbol("error");
00369 
00370   REGISTER_SO(platform_3m_path);
00371 #ifdef UNIX_FILE_SYSTEM
00372 # define MZ3M_SUBDIR "/3m"
00373 #else
00374 # ifdef DOS_FILE_SYSTEM
00375 #  define MZ3M_SUBDIR "\\3m"
00376 # else
00377 #  define MZ3M_SUBDIR ":3m"
00378 # endif
00379 #endif
00380   REGISTER_SO(platform_3m_path);
00381   REGISTER_SO(platform_cgc_path);
00382   platform_cgc_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH);
00383   platform_3m_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH MZ3M_SUBDIR);
00384 
00385   REGISTER_SO(putenv_str_table);
00386   REGISTER_SO(embedding_banner);
00387   REGISTER_SO(current_locale_name);
00388 
00389   p = scheme_make_folding_prim(string_p, "string?", 1, 1, 1);
00390   SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_UNARY_INLINED;
00391   scheme_add_global_constant("string?", p, env);
00392 
00393   scheme_add_global_constant("make-string",
00394                           scheme_make_immed_prim(make_string,
00395                                               "make-string",
00396                                               1, 2),
00397                           env);
00398   scheme_add_global_constant("string",
00399                           scheme_make_immed_prim(string,
00400                                               "string",
00401                                               0, -1),
00402                           env);
00403   scheme_add_global_constant("string-length",
00404                           scheme_make_folding_prim(string_length,
00405                                                 "string-length",
00406                                                 1, 1, 1),
00407                           env);
00408 
00409   p = scheme_make_immed_prim(scheme_checked_string_ref, "string-ref", 2, 2);
00410   SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_BINARY_INLINED;
00411   scheme_add_global_constant("string-ref", p, env);
00412 
00413 
00414   p = scheme_make_immed_prim(scheme_checked_string_set, "string-set!", 3, 3);
00415   SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_NARY_INLINED;
00416   scheme_add_global_constant("string-set!", p, env);
00417 
00418   scheme_add_global_constant("string=?",
00419                           scheme_make_immed_prim(string_eq,
00420                                               "string=?",
00421                                               2, -1),
00422                           env);
00423   scheme_add_global_constant("string-locale=?",
00424                           scheme_make_immed_prim(string_locale_eq,
00425                                               "string-locale=?",
00426                                               2, -1),
00427                           env);
00428   scheme_add_global_constant("string-ci=?",
00429                           scheme_make_immed_prim(string_ci_eq,
00430                                               "string-ci=?",
00431                                               2, -1),
00432                           env);
00433   scheme_add_global_constant("string-locale-ci=?",
00434                           scheme_make_immed_prim(string_locale_ci_eq,
00435                                               "string-locale-ci=?",
00436                                               2, -1),
00437                           env);
00438   scheme_add_global_constant("string<?",
00439                           scheme_make_immed_prim(string_lt,
00440                                               "string<?",
00441                                               2, -1),
00442                           env);
00443   scheme_add_global_constant("string-locale<?",
00444                           scheme_make_immed_prim(string_locale_lt,
00445                                               "string-locale<?",
00446                                               2, -1),
00447                           env);
00448   scheme_add_global_constant("string>?",
00449                           scheme_make_immed_prim(string_gt,
00450                                               "string>?",
00451                                               2, -1),
00452                           env);
00453   scheme_add_global_constant("string-locale>?",
00454                           scheme_make_immed_prim(string_locale_gt,
00455                                               "string-locale>?",
00456                                               2, -1),
00457                           env);
00458   scheme_add_global_constant("string<=?",
00459                           scheme_make_immed_prim(string_lt_eq,
00460                                               "string<=?",
00461                                               2, -1),
00462                           env);
00463   scheme_add_global_constant("string>=?",
00464                           scheme_make_immed_prim(string_gt_eq,
00465                                               "string>=?",
00466                                               2, -1),
00467                           env);
00468   scheme_add_global_constant("string-ci<?",
00469                           scheme_make_immed_prim(string_ci_lt,
00470                                               "string-ci<?",
00471                                               2, -1),
00472                           env);
00473   scheme_add_global_constant("string-locale-ci<?",
00474                           scheme_make_immed_prim(string_locale_ci_lt,
00475                                               "string-locale-ci<?",
00476                                               2, -1),
00477                           env);
00478   scheme_add_global_constant("string-ci>?",
00479                           scheme_make_immed_prim(string_ci_gt,
00480                                               "string-ci>?",
00481                                               2, -1),
00482                           env);
00483   scheme_add_global_constant("string-locale-ci>?",
00484                           scheme_make_immed_prim(string_locale_ci_gt,
00485                                               "string-locale-ci>?",
00486                                               2, -1),
00487                           env);
00488   scheme_add_global_constant("string-ci<=?",
00489                           scheme_make_immed_prim(string_ci_lt_eq,
00490                                               "string-ci<=?",
00491                                               2, -1),
00492                           env);
00493   scheme_add_global_constant("string-ci>=?",
00494                           scheme_make_immed_prim(string_ci_gt_eq,
00495                                               "string-ci>=?",
00496                                               2, -1),
00497                           env);
00498 
00499   scheme_add_global_constant("substring",
00500                           scheme_make_immed_prim(substring,
00501                                               "substring",
00502                                               2, 3),
00503                           env);
00504   scheme_add_global_constant("string-append",
00505                           scheme_make_immed_prim(string_append,
00506                                               "string-append",
00507                                               0, -1),
00508                           env);
00509   scheme_add_global_constant("string->list",
00510                           scheme_make_immed_prim(string_to_list,
00511                                               "string->list",
00512                                               1, 1),
00513                           env);
00514   scheme_add_global_constant("list->string",
00515                           scheme_make_immed_prim(list_to_string,
00516                                               "list->string",
00517                                               1, 1),
00518                           env);
00519   scheme_add_global_constant("string-copy",
00520                           scheme_make_immed_prim(string_copy,
00521                                               "string-copy",
00522                                               1, 1),
00523                           env);
00524   scheme_add_global_constant("string-copy!",
00525                           scheme_make_immed_prim(string_copy_bang,
00526                                               "string-copy!",
00527                                               3, 5),
00528                           env);
00529   scheme_add_global_constant("string-fill!",
00530                           scheme_make_immed_prim(string_fill,
00531                                               "string-fill!",
00532                                               2, 2),
00533                           env);
00534   scheme_add_global_constant("string->immutable-string",
00535                           scheme_make_immed_prim(string_to_immutable,
00536                                               "string->immutable-string",
00537                                               1, 1),
00538                           env);
00539   scheme_add_global_constant("string-normalize-nfc",
00540                           scheme_make_immed_prim(string_normalize_c,
00541                                               "string-normalize-nfc",
00542                                               1, 1),
00543                           env);
00544   scheme_add_global_constant("string-normalize-nfkc",
00545                           scheme_make_immed_prim(string_normalize_kc,
00546                                               "string-normalize-nfkc",
00547                                               1, 1),
00548                           env);
00549   scheme_add_global_constant("string-normalize-nfd",
00550                           scheme_make_immed_prim(string_normalize_d,
00551                                               "string-normalize-nfd",
00552                                               1, 1),
00553                           env);
00554   scheme_add_global_constant("string-normalize-nfkd",
00555                           scheme_make_immed_prim(string_normalize_kd,
00556                                               "string-normalize-nfkd",
00557                                               1, 1),
00558                           env);
00559 
00560   scheme_add_global_constant("string-upcase",
00561                           scheme_make_immed_prim(string_upcase,
00562                                               "string-upcase",
00563                                               1, 1),
00564                           env);
00565   scheme_add_global_constant("string-downcase",
00566                           scheme_make_immed_prim(string_downcase,
00567                                               "string-downcase",
00568                                               1, 1),
00569                           env);
00570   scheme_add_global_constant("string-titlecase",
00571                           scheme_make_immed_prim(string_titlecase,
00572                                               "string-titlecase",
00573                                               1, 1),
00574                           env);
00575   scheme_add_global_constant("string-foldcase",
00576                           scheme_make_immed_prim(string_foldcase,
00577                                               "string-foldcase",
00578                                               1, 1),
00579                           env);
00580 
00581   scheme_add_global_constant("string-locale-upcase",
00582                           scheme_make_immed_prim(string_locale_upcase,
00583                                               "string-locale-upcase",
00584                                               1, 1),
00585                           env);
00586   scheme_add_global_constant("string-locale-downcase",
00587                           scheme_make_immed_prim(string_locale_downcase,
00588                                               "string-locale-downcase",
00589                                               1, 1),
00590                           env);
00591 
00592   scheme_add_global_constant("current-locale",
00593                           scheme_register_parameter(current_locale,
00594                                                  "current-locale",
00595                                                  MZCONFIG_LOCALE),
00596                           env);
00597   scheme_add_global_constant("locale-string-encoding",
00598                           scheme_make_immed_prim(locale_string_encoding,
00599                                               "locale-string-encoding",
00600                                               0, 0),
00601                           env);
00602   scheme_add_global_constant("system-language+country",
00603                           scheme_make_immed_prim(system_language_country,
00604                                               "system-language+country",
00605                                               0, 0),
00606                           env);
00607 
00608   scheme_add_global_constant("bytes-converter?",
00609                           scheme_make_immed_prim(byte_converter_p,
00610                                               "bytes-converter?",
00611                                               1, 1),
00612                           env);
00613   scheme_add_global_constant("bytes-convert",
00614                           scheme_make_prim_w_arity2(byte_string_convert,
00615                                                  "bytes-convert",
00616                                                  1, 7,
00617                                                  3, 3),
00618                           env);
00619   scheme_add_global_constant("bytes-convert-end",
00620                           scheme_make_prim_w_arity2(byte_string_convert_end,
00621                                                  "bytes-convert-end",
00622                                                  0, 3,
00623                                                  2, 2),
00624                           env);
00625   scheme_add_global_constant("bytes-open-converter",
00626                           scheme_make_immed_prim(byte_string_open_converter,
00627                                               "bytes-open-converter",
00628                                               2, 2),
00629                           env);
00630   scheme_add_global_constant("bytes-close-converter",
00631                           scheme_make_immed_prim(byte_string_close_converter,
00632                                               "bytes-close-converter",
00633                                               1, 1),
00634                           env);
00635 
00636   scheme_add_global_constant("format",
00637                           scheme_make_noncm_prim(format,
00638                                                     "format",
00639                                                     1, -1),
00640                           env);
00641   scheme_add_global_constant("printf",
00642                           scheme_make_noncm_prim(sch_printf,
00643                                                     "printf",
00644                                                     1, -1),
00645                           env);
00646   scheme_add_global_constant("fprintf",
00647                           scheme_make_noncm_prim(sch_fprintf,
00648                                                     "fprintf",
00649                                                     2, -1),
00650                           env);
00651 
00652   scheme_add_global_constant("byte?",
00653                           scheme_make_folding_prim(byte_p,
00654                                                 "byte?",
00655                                                 1, 1, 1),
00656                           env);
00657 
00658   p = scheme_make_folding_prim(byte_string_p, "bytes?", 1, 1, 1);
00659   SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_NARY_INLINED;
00660   scheme_add_global_constant("bytes?", p, env);
00661 
00662   scheme_add_global_constant("make-bytes",
00663                           scheme_make_immed_prim(make_byte_string,
00664                                               "make-bytes",
00665                                               1, 2),
00666                           env);
00667   scheme_add_global_constant("bytes",
00668                           scheme_make_immed_prim(byte_string,
00669                                               "bytes",
00670                                               0, -1),
00671                           env);
00672   scheme_add_global_constant("bytes-length",
00673                           scheme_make_folding_prim(byte_string_length,
00674                                                 "bytes-length",
00675                                                 1, 1, 1),
00676                           env);
00677 
00678   p = scheme_make_immed_prim(scheme_checked_byte_string_ref, "bytes-ref", 2, 2);
00679   SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_BINARY_INLINED;
00680   scheme_add_global_constant("bytes-ref", p, env);
00681 
00682   p = scheme_make_immed_prim(scheme_checked_byte_string_set, "bytes-set!", 3, 3);
00683   SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_NARY_INLINED;
00684   scheme_add_global_constant("bytes-set!", p, env);
00685 
00686   scheme_add_global_constant("bytes=?",
00687                           scheme_make_immed_prim(byte_string_eq,
00688                                               "bytes=?",
00689                                               2, -1),
00690                           env);
00691   scheme_add_global_constant("bytes<?",
00692                           scheme_make_immed_prim(byte_string_lt,
00693                                               "bytes<?",
00694                                               2, -1),
00695                           env);
00696   scheme_add_global_constant("bytes>?",
00697                           scheme_make_immed_prim(byte_string_gt,
00698                                               "bytes>?",
00699                                               2, -1),
00700                           env);
00701 
00702   scheme_add_global_constant("subbytes",
00703                           scheme_make_immed_prim(byte_substring,
00704                                               "subbytes",
00705                                               2, 3),
00706                           env);
00707   scheme_add_global_constant("bytes-append",
00708                           scheme_make_immed_prim(byte_string_append,
00709                                               "bytes-append",
00710                                               0, -1),
00711                           env);
00712   scheme_add_global_constant("bytes->list",
00713                           scheme_make_immed_prim(byte_string_to_list,
00714                                               "bytes->list",
00715                                               1, 1),
00716                           env);
00717   scheme_add_global_constant("list->bytes",
00718                           scheme_make_immed_prim(list_to_byte_string,
00719                                               "list->bytes",
00720                                               1, 1),
00721                           env);
00722   scheme_add_global_constant("bytes-copy",
00723                           scheme_make_immed_prim(byte_string_copy,
00724                                               "bytes-copy",
00725                                               1, 1),
00726                           env);
00727   scheme_add_global_constant("bytes-copy!",
00728                           scheme_make_immed_prim(byte_string_copy_bang,
00729                                               "bytes-copy!",
00730                                               3, 5),
00731                           env);
00732   scheme_add_global_constant("bytes-fill!",
00733                           scheme_make_immed_prim(byte_string_fill,
00734                                               "bytes-fill!",
00735                                               2, 2),
00736                           env);
00737   scheme_add_global_constant("bytes->immutable-bytes",
00738                           scheme_make_immed_prim(byte_string_to_immutable,
00739                                               "bytes->immutable-bytes",
00740                                               1, 1),
00741                           env);
00742 
00743 
00744   scheme_add_global_constant("bytes-utf-8-index",
00745                           scheme_make_immed_prim(byte_string_utf8_index,
00746                                               "bytes-utf-8-index",
00747                                               2, 4),
00748                           env);
00749   scheme_add_global_constant("bytes-utf-8-length",
00750                           scheme_make_immed_prim(byte_string_utf8_length,
00751                                               "bytes-utf-8-length",
00752                                               1, 4),
00753                           env);
00754   scheme_add_global_constant("bytes-utf-8-ref",
00755                           scheme_make_immed_prim(byte_string_utf8_ref,
00756                                               "bytes-utf-8-ref",
00757                                               2, 4),
00758                           env);
00759 
00760   scheme_add_global_constant("bytes->string/utf-8",
00761                           scheme_make_immed_prim(byte_string_to_char_string,
00762                                               "bytes->string/utf-8",
00763                                               1, 4),
00764                           env);
00765   scheme_add_global_constant("bytes->string/locale",
00766                           scheme_make_immed_prim(byte_string_to_char_string_locale,
00767                                               "bytes->string/locale",
00768                                               1, 4),
00769                           env);
00770   scheme_add_global_constant("bytes->string/latin-1",
00771                           scheme_make_immed_prim(byte_string_to_char_string_latin1,
00772                                               "bytes->string/latin-1",
00773                                               1, 4),
00774                           env);
00775   scheme_add_global_constant("string->bytes/utf-8",
00776                           scheme_make_immed_prim(char_string_to_byte_string,
00777                                               "string->bytes/utf-8",
00778                                               1, 4),
00779                           env);
00780   scheme_add_global_constant("string->bytes/locale",
00781                           scheme_make_immed_prim(char_string_to_byte_string_locale,
00782                                               "string->bytes/locale",
00783                                               1, 4),
00784                           env);
00785   scheme_add_global_constant("string->bytes/latin-1",
00786                           scheme_make_immed_prim(char_string_to_byte_string_latin1,
00787                                               "string->bytes/latin-1",
00788                                               1, 4),
00789                           env);
00790 
00791   scheme_add_global_constant("string-utf-8-length",
00792                           scheme_make_immed_prim(char_string_utf8_length,
00793                                               "string-utf-8-length",
00794                                               1, 3),
00795                           env);
00796 
00797 
00798   /* In principle, `version' could be foldable, but it invites
00799      more problems than it solves... */
00800 
00801   scheme_add_global_constant("version",
00802                           scheme_make_immed_prim(version,
00803                                               "version",
00804                                               0, 0),
00805                           env);
00806   scheme_add_global_constant("banner",
00807                           scheme_make_immed_prim(banner,
00808                                               "banner",
00809                                               0, 0),
00810                           env);
00811 
00812   scheme_add_global_constant("getenv",
00813                           scheme_make_immed_prim(sch_getenv,
00814                                               "getenv",
00815                                               1, 1),
00816                           env);
00817   scheme_add_global_constant("putenv",
00818                           scheme_make_immed_prim(sch_putenv,
00819                                               "putenv",
00820                                               2, 2),
00821                           env);
00822 
00823   /* Don't make these folding, since they're platform-specific: */
00824 
00825   scheme_add_global_constant("system-type",
00826                           scheme_make_immed_prim(system_type,
00827                                               "system-type",
00828                                               0, 1),
00829                           env);
00830   scheme_add_global_constant("system-library-subpath",
00831                           scheme_make_immed_prim(system_library_subpath,
00832                                               "system-library-subpath",
00833                                               0, 1),
00834                           env);
00835 
00836   scheme_add_global_constant("current-command-line-arguments",
00837                           scheme_register_parameter(cmdline_args,
00838                                                  "current-command-line-arguments",
00839                                                  MZCONFIG_CMDLINE_ARGS),
00840                           env);
00841 
00842 #ifdef MZ_PRECISE_GC
00843   register_traversers();
00844 #endif
00845 }
00846 
00847 /**********************************************************************/
00848 /*                     UTF-8 char constructors                        */
00849 /**********************************************************************/
00850 
00851 Scheme_Object *scheme_make_sized_offset_utf8_string(char *chars, long d, long len)
00852 {
00853   long ulen;
00854   mzchar *us;
00855 
00856   if (len) {
00857     ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len,
00858                            NULL, 0, -1,
00859                            NULL, 0 /* not UTF-16 */, 0xFFFD);
00860     us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1));
00861     scheme_utf8_decode((unsigned char *)chars, d, d + len,
00862                      us, 0, -1,
00863                      NULL, 0 /* not UTF-16 */, 0xFFFD);
00864 
00865     us[ulen] = 0;
00866   } else {
00867     us = (mzchar *)"\0\0\0";
00868     ulen = 0;
00869   }
00870   return scheme_make_sized_offset_char_string(us, 0, ulen, 0);
00871 }
00872 
00873 Scheme_Object *
00874 scheme_make_sized_utf8_string(char *chars, long len)
00875 {
00876   return scheme_make_sized_offset_utf8_string(chars, 0, len);
00877 }
00878 
00879 Scheme_Object *
00880 scheme_make_immutable_sized_utf8_string(char *chars, long len)
00881 {
00882   Scheme_Object *s;
00883 
00884   s = scheme_make_sized_offset_utf8_string(chars, 0, len);
00885   if (len)
00886     SCHEME_SET_CHAR_STRING_IMMUTABLE(s);
00887 
00888   return s;
00889 }
00890 
00891 Scheme_Object *
00892 scheme_make_utf8_string(const char *chars)
00893 {
00894   return scheme_make_sized_offset_utf8_string((char *)chars, 0, -1);
00895 }
00896 
00897 Scheme_Object *
00898 scheme_make_locale_string(const char *chars)
00899 {
00900   return scheme_byte_string_to_char_string_locale(scheme_make_byte_string((char *)chars));
00901 }
00902 
00903 /**********************************************************************/
00904 /*                         index helpers                              */
00905 /**********************************************************************/
00906 
00907 void scheme_out_of_string_range(const char *name, const char *which,
00908                             Scheme_Object *i, Scheme_Object *s,
00909                             long start, long len)
00910 {
00911   int is_byte;
00912 
00913   is_byte = SCHEME_BYTE_STRINGP(s);
00914 
00915   if (len) {
00916     char *sstr;
00917     int slen;
00918 
00919     sstr = scheme_make_provided_string(s, 2, &slen);
00920     scheme_raise_exn(MZEXN_FAIL_CONTRACT,
00921                    "%s: %sindex %s out of range [%d, %d] for %s%s: %t",
00922                    name, which,
00923                    scheme_make_provided_string(i, 2, NULL),
00924                    start, len,
00925                    is_byte ? "byte-" : "",
00926                      SCHEME_VECTORP(s) ? "vector" : "string",
00927                    sstr, slen);
00928   } else {
00929     scheme_raise_exn(MZEXN_FAIL_CONTRACT,
00930                    "%s: %sindex %s out of range for empty %s%s",
00931                    name, which,
00932                    scheme_make_provided_string(i, 0, NULL),
00933                    is_byte ? "byte-" : "",
00934                      SCHEME_VECTORP(s) ? "vector" : "string");
00935   }
00936 }
00937 
00938 long scheme_extract_index(const char *name, int pos, int argc, Scheme_Object **argv, long top, int false_ok)
00939 {
00940   long i;
00941   int is_top = 0;
00942 
00943   if (SCHEME_INTP(argv[pos])) {
00944     i = SCHEME_INT_VAL(argv[pos]);
00945   } else if (SCHEME_BIGNUMP(argv[pos])) {
00946     if (SCHEME_BIGPOS(argv[pos])) {
00947       i = top; /* out-of-bounds */
00948       is_top = 1;
00949     } else
00950       i = -1; /* negative */
00951   } else
00952     i = -1;
00953 
00954   if (!is_top && (i < 0))
00955     scheme_wrong_type(name,
00956                     (false_ok ? "non-negative exact integer or #f" : "non-negative exact integer"),
00957                     pos, argc, argv);
00958 
00959   return i;
00960 }
00961 
00962 void scheme_get_substring_indices(const char *name, Scheme_Object *str,
00963                                   int argc, Scheme_Object **argv,
00964                                   int spos, int fpos, long *_start, long *_finish)
00965 {
00966   long len;
00967   long start, finish;
00968 
00969   if (SCHEME_VECTORP(str))
00970     len = SCHEME_VEC_SIZE(str);
00971   else if (SCHEME_CHAR_STRINGP(str))
00972     len = SCHEME_CHAR_STRTAG_VAL(str);
00973   else
00974     len = SCHEME_BYTE_STRTAG_VAL(str);
00975 
00976   if (argc > spos)
00977     start = scheme_extract_index(name, spos, argc, argv, len + 1, 0);
00978   else
00979     start = 0;
00980   if (argc > fpos)
00981     finish = scheme_extract_index(name, fpos, argc, argv, len + 1, 0);
00982   else
00983     finish = len;
00984 
00985   if (!(start <= len)) {
00986     scheme_out_of_string_range(name, (fpos < 100) ? "starting " : "", argv[spos], str, 0, len);
00987   }
00988   if (!(finish >= start && finish <= len)) {
00989     scheme_out_of_string_range(name, "ending ", argv[fpos], str, start, len);
00990   }
00991 
00992   *_start = start;
00993   *_finish = finish;
00994 }
00995 
00996 void scheme_do_get_substring_indices(const char *name, Scheme_Object *str,
00997                                      int argc, Scheme_Object **argv,
00998                                      int spos, int fpos, long *_start, long *_finish, long len)
00999 {
01000   if (argc > spos) {
01001     if (SCHEME_INTP(argv[spos])) {
01002       long start = SCHEME_INT_VAL(argv[spos]);
01003       if ((start >= 0) && (start < len)) {
01004         *_start = start;
01005         if (argc > fpos) {
01006           long finish = SCHEME_INT_VAL(argv[fpos]);
01007           if ((finish >= start) && (finish <= len)) {
01008             *_finish = finish;
01009             return;
01010           }
01011         } else {
01012           *_finish = len;
01013           return;
01014         }
01015       }
01016     }
01017   } else {
01018     *_start = 0;
01019     *_finish = len;
01020     return;
01021   }
01022 
01023   scheme_get_substring_indices(name, str, argc, argv, spos, fpos, _start, _finish);
01024 }
01025 
01026 /**********************************************************************/
01027 /*                          char strings                              */
01028 /**********************************************************************/
01029 
01030 #define SCHEME_X_STR_VAL(x) SCHEME_CHAR_STR_VAL(x)
01031 #define SCHEME_X_STRTAG_VAL(x) SCHEME_CHAR_STRTAG_VAL(x)
01032 #define SCHEME_X_STRINGP(x) SCHEME_CHAR_STRINGP(x)
01033 #define SCHEME_MUTABLE_X_STRINGP(x) SCHEME_MUTABLE_CHAR_STRINGP(x)
01034 #define SCHEME_SET_X_STRING_IMMUTABLE(x) SCHEME_SET_CHAR_STRING_IMMUTABLE(x)
01035 #define scheme_x_string_type scheme_char_string_type
01036 #define X(a, b) a##_char##b
01037 #define X_(a, b) a##_##b
01038 #define X__(a) a
01039 #define EMPTY (mzchar *)"\0\0\0"
01040 #define Xchar mzchar
01041 #define uXchar mzchar
01042 #define XSTR ""
01043 #define XSTRINGSTR "string"
01044 #define SUBXSTR "substring"
01045 #define CHARP(x) SCHEME_CHARP(x)
01046 #define CHAR_VAL(x) SCHEME_CHAR_VAL(x)
01047 #define CHAR_STR "character"
01048 #define MAKE_CHAR(x) _scheme_make_char(x)
01049 #define xstrlen scheme_char_strlen
01050 #include "strops.inc"
01051 
01052 #define GEN_STRING_COMP(name, scheme_name, comp, op, ul, size_shortcut)     \
01053 static Scheme_Object * name (int argc, Scheme_Object *argv[]) \
01054 {  mzchar *s, *prev; int i, sl, pl; int falz = 0;\
01055    if (!SCHEME_CHAR_STRINGP(argv[0])) \
01056     scheme_wrong_type(scheme_name, "string", 0, argc, argv); \
01057    prev = SCHEME_CHAR_STR_VAL(argv[0]); pl = SCHEME_CHAR_STRTAG_VAL(argv[0]); \
01058    for (i = 1; i < argc; i++) { \
01059      if (!SCHEME_CHAR_STRINGP(argv[i])) \
01060       scheme_wrong_type(scheme_name, "string", i, argc, argv); \
01061      s = SCHEME_CHAR_STR_VAL(argv[i]); sl = SCHEME_CHAR_STRTAG_VAL(argv[i]); \
01062      if (!falz) if (!(comp(scheme_name, \
01063                            prev, pl, \
01064                            s, sl, ul, size_shortcut) op 0)) falz = 1; \
01065      prev = s; pl = sl; \
01066   } \
01067   return falz ? scheme_false : scheme_true; \
01068 }
01069 
01070 GEN_STRING_COMP(string_eq, "string=?", mz_char_strcmp, ==, 0, 1)
01071 GEN_STRING_COMP(string_lt, "string<?", mz_char_strcmp, <, 0, 0)
01072 GEN_STRING_COMP(string_gt, "string>?", mz_char_strcmp, >, 0, 0)
01073 GEN_STRING_COMP(string_lt_eq, "string<=?", mz_char_strcmp, <=, 0, 0)
01074 GEN_STRING_COMP(string_gt_eq, "string>=?", mz_char_strcmp, >=, 0, 0)
01075 
01076 GEN_STRING_COMP(string_ci_eq, "string-ci=?", mz_char_strcmp_ci, ==, 0, 0)
01077 GEN_STRING_COMP(string_ci_lt, "string-ci<?", mz_char_strcmp_ci, <, 0, 0)
01078 GEN_STRING_COMP(string_ci_gt, "string-ci>?", mz_char_strcmp_ci, >, 0, 0)
01079 GEN_STRING_COMP(string_ci_lt_eq, "string-ci<=?", mz_char_strcmp_ci, <=, 0, 0)
01080 GEN_STRING_COMP(string_ci_gt_eq, "string-ci>=?", mz_char_strcmp_ci, >=, 0, 0)
01081 
01082 GEN_STRING_COMP(string_locale_eq, "string-locale=?", mz_char_strcmp, ==, 1, 0)
01083 GEN_STRING_COMP(string_locale_lt, "string-locale<?", mz_char_strcmp, <, 1, 0)
01084 GEN_STRING_COMP(string_locale_gt, "string-locale>?", mz_char_strcmp, >, 1, 0)
01085 GEN_STRING_COMP(string_locale_ci_eq, "string-locale-ci=?", mz_char_strcmp_ci, ==, 1, 0)
01086 GEN_STRING_COMP(string_locale_ci_lt, "string-locale-ci<?", mz_char_strcmp_ci, <, 1, 0)
01087 GEN_STRING_COMP(string_locale_ci_gt, "string-locale-ci>?", mz_char_strcmp_ci, >, 1, 0)
01088 
01089 /**********************************************************************/
01090 /*                         byte strings                               */
01091 /**********************************************************************/
01092 
01093 #define SCHEME_BYTEP(x) ((SCHEME_INTP(x)) && (SCHEME_INT_VAL(x) >= 0) && (SCHEME_INT_VAL(x) <= 255))
01094 #define BYTE_STR "exact integer in [0,255]"
01095 
01096 static Scheme_Object *
01097 byte_p(int argc, Scheme_Object *argv[])
01098 {
01099   return (SCHEME_BYTEP(argv[0]) ? scheme_true : scheme_false);
01100 }
01101 
01102 #define SCHEME_X_STR_VAL(x) SCHEME_BYTE_STR_VAL(x)
01103 #define SCHEME_X_STRTAG_VAL(x) SCHEME_BYTE_STRTAG_VAL(x)
01104 #define SCHEME_X_STRINGP(x) SCHEME_BYTE_STRINGP(x)
01105 #define SCHEME_MUTABLE_X_STRINGP(x) SCHEME_MUTABLE_BYTE_STRINGP(x)
01106 #define SCHEME_SET_X_STRING_IMMUTABLE(x) SCHEME_SET_BYTE_STRING_IMMUTABLE(x)
01107 #define scheme_x_string_type scheme_byte_string_type
01108 #define X(a, b) a##_byte##b
01109 #define X_(a, b) a##_byte_##b
01110 #define X__(a) byte_##a
01111 #define EMPTY ""
01112 #define Xchar char
01113 #define uXchar unsigned char
01114 #define XSTR "byte "
01115 #define XSTRINGSTR "bytes"
01116 #define SUBXSTR "subbytes"
01117 #define CHARP(x) SCHEME_BYTEP(x)
01118 #define CHAR_VAL(x) SCHEME_INT_VAL(x)
01119 #define CHAR_STR BYTE_STR
01120 #define MAKE_CHAR(x) scheme_make_integer_value(x)
01121 #define xstrlen strlen
01122 #include "strops.inc"
01123 
01124 /* comparisons */
01125 
01126 #define GEN_BYTE_STRING_COMP(name, scheme_name, comp, op) \
01127 static Scheme_Object * name (int argc, Scheme_Object *argv[]) \
01128 {  char *s, *prev; int i, sl, pl; int falz = 0;\
01129    if (!SCHEME_BYTE_STRINGP(argv[0])) \
01130     scheme_wrong_type(scheme_name, "byte string", 0, argc, argv); \
01131    prev = SCHEME_BYTE_STR_VAL(argv[0]); pl = SCHEME_BYTE_STRTAG_VAL(argv[0]); \
01132    for (i = 1; i < argc; i++) { \
01133      if (!SCHEME_BYTE_STRINGP(argv[i])) \
01134       scheme_wrong_type(scheme_name, "byte string", i, argc, argv); \
01135      s = SCHEME_BYTE_STR_VAL(argv[i]); sl = SCHEME_BYTE_STRTAG_VAL(argv[i]); \
01136      if (!falz) if (!(comp(scheme_name, \
01137                            (unsigned char *)prev, pl, \
01138                            (unsigned char *)s, sl) op 0)) falz = 1; \
01139      prev = s; pl = sl; \
01140   } \
01141   return falz ? scheme_false : scheme_true; \
01142 }
01143 
01144 GEN_BYTE_STRING_COMP(byte_string_eq, "bytes=?", mz_strcmp, ==)
01145 GEN_BYTE_STRING_COMP(byte_string_lt, "bytes<?", mz_strcmp, <)
01146 GEN_BYTE_STRING_COMP(byte_string_gt, "bytes>?", mz_strcmp, >)
01147 
01148 /**********************************************************************/
01149 /*                   byte string <-> char string                      */
01150 /**********************************************************************/
01151 
01152 /************************* bytes->string *************************/
01153 
01154 static Scheme_Object *
01155 do_byte_string_to_char_string(const char *who,
01156                            Scheme_Object *bstr,
01157                            long istart, long ifinish,
01158                            int perm, int as_locale)
01159 {
01160   int i, ulen;
01161   char *chars;
01162   unsigned int *v;
01163 
01164   chars = SCHEME_BYTE_STR_VAL(bstr);
01165 
01166   ulen = utf8_decode_x((unsigned char *)chars, istart, ifinish,
01167                      NULL, 0, -1,
01168                      NULL, NULL, 0, 0,
01169                      NULL, 0, 
01170                      (perm > -1) ? 0xD800 : 0);
01171   if (ulen < 0) {
01172     scheme_arg_mismatch(who,
01173                      STRING_IS_NOT_UTF_8,
01174                      bstr);
01175   }
01176 
01177   v = (unsigned int *)scheme_malloc_atomic((ulen + 1) * sizeof(unsigned int));
01178   utf8_decode_x((unsigned char *)chars, istart, ifinish,
01179               v, 0, -1,
01180               NULL, NULL, 0, 0,
01181               NULL, 0, 
01182               (perm > -1) ? 0xD800 : 0);
01183   
01184   if (perm > -1) {
01185     for (i = 0; i < ulen; i++) {
01186       if (v[i] == 0xD800)
01187        v[i] = perm;
01188     }
01189   }
01190   v[ulen] = 0;
01191 
01192   return scheme_make_sized_char_string(v, ulen, 0);
01193 }
01194 
01195 static Scheme_Object *
01196 do_byte_string_to_char_string_locale(const char *who,
01197                                  Scheme_Object *bstr,
01198                                  long istart, long ifinish,
01199                                  int perm)
01200 {
01201   char *us;
01202   long olen;
01203 
01204   reset_locale();
01205   if (!iconv_ready) init_iconv();
01206 
01207   if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on || !mzCHK_PROC(iconv_open))
01208     return do_byte_string_to_char_string(who, bstr, istart, ifinish, perm, 1);
01209 
01210   if (istart < ifinish) {
01211     int no_cvt;
01212 
01213     us = string_to_from_locale(0, SCHEME_BYTE_STR_VAL(bstr),
01214                             istart, ifinish - istart,
01215                             &olen, perm, &no_cvt);
01216 
01217     if (!us) {
01218       if (no_cvt) {
01219        return do_byte_string_to_char_string(who, bstr, istart, ifinish, perm, 1);
01220       } else {
01221        scheme_arg_mismatch(who,
01222                          "byte string is not a valid encoding for the current locale: ",
01223                          bstr);
01224       }
01225     }
01226     ((mzchar *)us)[olen] = 0;
01227   } else {
01228     us = "\0\0\0";
01229     olen = 0;
01230   }
01231 
01232   return scheme_make_sized_char_string((mzchar *)us, olen, 0);
01233 }
01234 
01235 static Scheme_Object *
01236 do_string_to_vector(const char *who, int mode, int argc, Scheme_Object *argv[])
01237 {
01238   int permc;
01239   long istart, ifinish;
01240 
01241   if (!SCHEME_BYTE_STRINGP(argv[0]))
01242     scheme_wrong_type(who, "byte string", 0, argc, argv);
01243 
01244   if ((argc < 2) || SCHEME_FALSEP(argv[1]))
01245     permc = -1;
01246   else {
01247     if (!SCHEME_CHARP(argv[1]))
01248       scheme_wrong_type(who, "character or #f", 1, argc, argv);
01249     permc = SCHEME_CHAR_VAL(argv[1]);
01250   }
01251 
01252   scheme_get_substring_indices(who, argv[0], argc, argv,
01253                             2, 3,
01254                             &istart, &ifinish);
01255 
01256   if (mode == 0)
01257     return do_byte_string_to_char_string(who, argv[0], istart, ifinish, permc, 0);
01258   else if (mode == 1)
01259     return do_byte_string_to_char_string_locale(who, argv[0], istart, ifinish, permc);
01260   else {
01261     /* Latin-1 */
01262     mzchar *us;
01263     unsigned char *s;
01264     long i, len;
01265     len = ifinish - istart;
01266     s = (unsigned char *)SCHEME_BYTE_STR_VAL(argv[0]);
01267     us = (mzchar *)scheme_malloc_atomic((len + 1) * sizeof(mzchar));
01268     for (i = istart; i < ifinish; i++) {
01269       us[i - istart] = s[i];
01270     }
01271     us[len] = 0;
01272 
01273     return scheme_make_sized_char_string(us, len, 0);
01274   }
01275 }
01276 
01277 
01278 static Scheme_Object *
01279 byte_string_to_char_string (int argc, Scheme_Object *argv[])
01280 {
01281   return do_string_to_vector("bytes->string/utf-8", 0, argc, argv);
01282 }
01283 
01284 static Scheme_Object *
01285 byte_string_to_char_string_locale (int argc, Scheme_Object *argv[])
01286 {
01287   return do_string_to_vector("bytes->string/locale", 1, argc, argv);
01288 }
01289 
01290 static Scheme_Object *
01291 byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[])
01292 {
01293   return do_string_to_vector("bytes->string/latin-1", 2, argc, argv);
01294 }
01295 
01296 Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o)
01297 {
01298   return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD, 0);
01299 }
01300 
01301 Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o)
01302 {
01303   return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD);
01304 }
01305 
01306 /************************* string->bytes *************************/
01307 
01308 static Scheme_Object *do_char_string_to_byte_string(Scheme_Object *s, long istart, long ifinish, 
01309                                               int as_locale)
01310 {
01311   char *bs;
01312   int slen;
01313 
01314   slen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(s), istart, ifinish,
01315                          NULL, 0,
01316                          0 /* UTF-16 */);
01317   bs = (char *)scheme_malloc_atomic(slen + 1);
01318   scheme_utf8_encode(SCHEME_CHAR_STR_VAL(s), istart, ifinish,
01319                    (unsigned char *)bs, 0,
01320                    0 /* UTF-16 */);
01321   bs[slen] = 0;
01322 
01323   return scheme_make_sized_byte_string(bs, slen, 0);
01324 }
01325 
01326 static Scheme_Object *
01327 do_char_string_to_byte_string_locale(const char *who,
01328                                  Scheme_Object *cstr,
01329                                  long istart, long ifinish,
01330                                  int perm)
01331 {
01332   char *s;
01333   long olen;
01334 
01335   reset_locale();
01336   if (!iconv_ready) init_iconv();
01337 
01338   if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on || !mzCHK_PROC(iconv_open))
01339     return do_char_string_to_byte_string(cstr, istart, ifinish, 1);
01340 
01341   if (istart < ifinish) {
01342     int no_cvt;
01343 
01344     s = string_to_from_locale(1, (char *)SCHEME_CHAR_STR_VAL(cstr),
01345                            istart, ifinish - istart,
01346                            &olen, perm, &no_cvt);
01347 
01348     if (!s) {
01349       if (no_cvt) {
01350        return do_char_string_to_byte_string(cstr, istart, ifinish, 1);
01351       } else {
01352        scheme_arg_mismatch(who,
01353                          "string cannot be encoded for the current locale: ",
01354                          cstr);
01355       }
01356     }
01357     s[olen] = 0;
01358   } else {
01359     s = "";
01360     olen = 0;
01361   }
01362 
01363   return scheme_make_sized_byte_string(s, olen, 0);
01364 }
01365 
01366 
01367 Scheme_Object *scheme_char_string_to_byte_string(Scheme_Object *s)
01368 {
01369   return do_char_string_to_byte_string(s, 0, SCHEME_CHAR_STRLEN_VAL(s), 0);
01370 }
01371 
01372 Scheme_Object *scheme_char_string_to_byte_string_locale(Scheme_Object *s)
01373 {
01374   return do_char_string_to_byte_string_locale("s->s", s, 0, SCHEME_CHAR_STRLEN_VAL(s), '?');
01375 }
01376 
01377 static Scheme_Object *do_chars_to_bytes(const char *who, int mode,
01378                                    int argc, Scheme_Object *argv[])
01379 {
01380   long istart, ifinish;
01381   int permc;
01382 
01383   if (!SCHEME_CHAR_STRINGP(argv[0]))
01384     scheme_wrong_type(who, "string", 0, argc, argv);
01385 
01386   if ((argc < 2) || SCHEME_FALSEP(argv[1]))
01387     permc = -1;
01388   else {
01389     if (!SCHEME_BYTEP(argv[1]))
01390       scheme_wrong_type(who, "byte or #f", 1, argc, argv);
01391     permc = SCHEME_INT_VAL(argv[1]);
01392   }
01393 
01394   scheme_get_substring_indices(who, argv[0], argc, argv,
01395                             2, 3, &istart, &ifinish);
01396 
01397   if (mode == 1)
01398     return do_char_string_to_byte_string_locale(who, argv[0], istart, ifinish, permc);
01399   else if (mode == 0)
01400     return do_char_string_to_byte_string(argv[0], istart, ifinish, 0);
01401   else {
01402     /* Latin-1 */
01403     mzchar *us;
01404     unsigned char *s;
01405     long i, len;
01406     len = ifinish - istart;
01407     us = SCHEME_CHAR_STR_VAL(argv[0]);
01408     s = (unsigned char *)scheme_malloc_atomic(len + 1);
01409     for (i = istart; i < ifinish; i++) {
01410       if (us[i] < 256)
01411        s[i - istart] = us[i];
01412       else if (permc >= 0) {
01413        s[i - istart] = permc;
01414       } else {
01415        scheme_arg_mismatch(who,
01416                          "string cannot be encoded in Latin-1: ",
01417                          argv[0]);
01418       }
01419     }
01420     s[len] = 0;
01421 
01422     return scheme_make_sized_byte_string((char *)s, len, 0);
01423   }
01424 }
01425 
01426 static Scheme_Object *char_string_to_byte_string(int argc, Scheme_Object *argv[])
01427 {
01428   return do_chars_to_bytes("string->bytes/utf-8", 0, argc, argv);
01429 }
01430 
01431 static Scheme_Object *char_string_to_byte_string_locale(int argc, Scheme_Object *argv[])
01432 {
01433   return do_chars_to_bytes("string->bytes/locale", 1, argc, argv);
01434 }
01435 
01436 static Scheme_Object *char_string_to_byte_string_latin1(int argc, Scheme_Object *argv[])
01437 {
01438   return do_chars_to_bytes("string->bytes/latin-1", 2, argc, argv);
01439 }
01440 
01441 /************************* Other *************************/
01442 
01443 static Scheme_Object *char_string_utf8_length (int argc, Scheme_Object *argv[])
01444 {
01445   long istart, ifinish, len;
01446 
01447   if (!SCHEME_CHAR_STRINGP(argv[0]))
01448     scheme_wrong_type("string-utf-8-length", "string", 0, argc, argv);
01449 
01450   scheme_get_substring_indices("string-utf-8-length", argv[0], argc, argv,
01451                             1, 2, &istart, &ifinish);
01452 
01453   len = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[0]), istart, ifinish,
01454                         NULL, 0, 0);
01455 
01456   return scheme_make_integer(len);
01457 }
01458 
01459 static Scheme_Object *
01460 byte_string_utf8_length (int argc, Scheme_Object *argv[])
01461 {
01462   int len, perm;
01463   long istart, ifinish;
01464   char *chars;
01465 
01466   if (!SCHEME_BYTE_STRINGP(argv[0]))
01467     scheme_wrong_type("bytes-utf-8-length", "string", 0, argc, argv);
01468 
01469   chars = SCHEME_BYTE_STR_VAL(argv[0]);
01470 
01471   if ((argc > 1) && !SCHEME_FALSEP(argv[1])) {
01472     if (!SCHEME_CHARP(argv[1]))
01473       scheme_wrong_type("bytes-utf-8-length", "character or #f", 1, argc, argv);
01474     perm = 1;
01475   } else
01476     perm = 0;
01477 
01478   scheme_get_substring_indices("bytes-utf-8-length", argv[0], argc, argv,
01479                             2, 3,
01480                             &istart, &ifinish);
01481 
01482   len = scheme_utf8_decode((unsigned char *)chars, istart, ifinish,
01483                         NULL, 0, -1,
01484                         NULL, 0, perm);
01485 
01486   if (len < 0)
01487     return scheme_false;
01488   else
01489     return scheme_make_integer(len);
01490 }
01491 
01492 static Scheme_Object *
01493 byte_string_utf8_index(int argc, Scheme_Object *argv[])
01494 {
01495   long istart, ifinish, pos = -1, opos, ipos;
01496   int result, perm;
01497   char *chars;
01498 
01499   if (!SCHEME_BYTE_STRINGP(argv[0]))
01500     scheme_wrong_type("bytes-utf-8-index", "byte string", 0, argc, argv);
01501 
01502   chars = SCHEME_BYTE_STR_VAL(argv[0]);
01503 
01504   if (SCHEME_INTP(argv[1])) {
01505     pos = SCHEME_INT_VAL(argv[1]);
01506   } else if (SCHEME_BIGNUMP(argv[1])) {
01507     if (SCHEME_BIGPOS(argv[1]))
01508       pos = 0x7FFFFFFF;
01509   }
01510 
01511   if (pos < 0) {
01512     scheme_wrong_type("bytes-utf-8-index", "non-negative exact integer", 1, argc, argv);
01513   }
01514 
01515   if ((argc > 2) && !SCHEME_FALSEP(argv[2])) {
01516     if (!SCHEME_CHARP(argv[2]))
01517       scheme_wrong_type("bytes-utf-8-index", "character or #f", 1, argc, argv);
01518     perm = 1;
01519   } else
01520     perm = 0;
01521 
01522   scheme_get_substring_indices("bytes-utf-8-index", argv[0], argc, argv,
01523                             3, 4,
01524                             &istart, &ifinish);
01525 
01526   result = utf8_decode_x((unsigned char *)chars, istart, ifinish,
01527                       NULL, 0, pos,
01528                       &ipos, &opos,
01529                       0, 0, NULL, 0, perm ? 1 : 0);
01530 
01531   if (((result < 0) && (result != -3))
01532       || ((ipos == ifinish) && (opos <= pos)))
01533     return scheme_false;
01534   else
01535     return scheme_make_integer(ipos);
01536 }
01537 
01538 static Scheme_Object *
01539 byte_string_utf8_ref(int argc, Scheme_Object *argv[])
01540 {
01541   long istart, ifinish, pos = -1, opos, ipos;
01542   char *chars;
01543   unsigned int us[1];
01544   Scheme_Object *perm;
01545 
01546   if (!SCHEME_BYTE_STRINGP(argv[0]))
01547     scheme_wrong_type("bytes-utf-8-ref", "byte string", 0, argc, argv);
01548 
01549   chars = SCHEME_BYTE_STR_VAL(argv[0]);
01550 
01551   if (SCHEME_INTP(argv[1])) {
01552     pos = SCHEME_INT_VAL(argv[1]);
01553   } else if (SCHEME_BIGNUMP(argv[1])) {
01554     if (SCHEME_BIGPOS(argv[1]))
01555       pos = 0x7FFFFFFF;
01556   }
01557 
01558   if (pos < 0) {
01559     scheme_wrong_type("bytes-utf-8-ref", "non-negative exact integer", 1, argc, argv);
01560   }
01561 
01562   if ((argc > 2) && !SCHEME_FALSEP(argv[2])) {
01563     if (!SCHEME_CHARP(argv[2]))
01564       scheme_wrong_type("bytes-utf-8-ref", "character or #f", 1, argc, argv);
01565     perm = argv[2];
01566   } else
01567     perm = 0;
01568 
01569   scheme_get_substring_indices("bytes-utf-8-ref", argv[0], argc, argv,
01570                             3, 4,
01571                             &istart, &ifinish);
01572 
01573   if (pos > 0) {
01574     utf8_decode_x((unsigned char *)chars, istart, ifinish,
01575                 NULL, 0, pos,
01576                 &ipos, &opos,
01577                 0, 0, NULL, 0, perm ? 1 : 0);
01578     if (opos < pos)
01579       return scheme_false;
01580     istart = ipos;
01581   }
01582 
01583   utf8_decode_x((unsigned char *)chars, istart, ifinish,
01584               us, 0, 1,
01585               &ipos, &opos,
01586               0, 0, NULL, 0, perm ? 0xFFFF : 0);
01587 
01588   if (opos < 1)
01589     return scheme_false;
01590   else if (us[0] == 0xFFFF)
01591     return perm;
01592   else
01593     return scheme_make_character(us[0]);
01594 }
01595 
01596 /********************************************************************/
01597 /*                            format                                */
01598 /********************************************************************/
01599 
01600 void scheme_do_format(const char *procname, Scheme_Object *port,
01601                     const mzchar *format, int flen,
01602                     int fpos, int offset, int argc, Scheme_Object **argv)
01603 {
01604   int i, start, end;
01605   int used = offset;
01606   int num_err = 0, char_err = 0, end_ok = 0;
01607   Scheme_Object *a[2];
01608 
01609   if (!format) {
01610     if (!SCHEME_CHAR_STRINGP(argv[fpos])) {
01611       scheme_wrong_type(procname, "format-string", fpos, argc, argv);
01612       return;
01613     }
01614     format = SCHEME_CHAR_STR_VAL(argv[fpos]);
01615     flen = SCHEME_CHAR_STRTAG_VAL(argv[fpos]);
01616   } else if (flen == -1)
01617     flen = strlen((char *)format);
01618 
01619   /* Check string first: */
01620   end = flen - 1;
01621   for (i = 0; i < end; i++) {
01622     if (format[i] == '~') {
01623       i++;
01624       if (scheme_isspace(format[i])) {
01625        /* skip spaces... */
01626       } else switch (format[i]) {
01627       case '~':
01628        if (i == end)
01629          end_ok = 1;
01630        break;
01631       case '%':
01632       case 'n':
01633       case 'N':
01634        break;
01635       case 'a':
01636       case 'A':
01637       case 's':
01638       case 'S':
01639       case 'v':
01640       case 'V':
01641       case 'e':
01642       case 'E':
01643        used++;
01644        break;
01645       case 'x':
01646       case 'X':
01647       case 'o':
01648       case 'O':
01649       case 'b':
01650       case 'B':
01651        if (!num_err && !char_err && (used < argc)) {
01652          Scheme_Object *o = argv[used];
01653          if (!SCHEME_EXACT_REALP(o)
01654              && (!SCHEME_COMPLEXP(o)
01655                 || !SCHEME_EXACT_REALP(scheme_complex_real_part(o))))
01656            num_err = used + 1;
01657        }
01658        used++;
01659        break;
01660       case 'c':
01661       case 'C':
01662        if (!num_err && !char_err && (used < argc)) {
01663          if (!SCHEME_CHARP(argv[used]))
01664            char_err = used + 1;
01665        }
01666        used++;
01667        break;
01668       default:
01669        {
01670          char buffer[64];
01671          sprintf(buffer, "pattern-string (tag ~%c not allowed)", format[i]);
01672          scheme_wrong_type(procname, buffer, fpos, argc, argv);
01673          return;
01674        }
01675       }
01676     }
01677   }
01678   if ((format[end] == '~') && !end_ok) {
01679     scheme_wrong_type(procname, "pattern-string (cannot end in ~)", fpos, argc, argv);
01680     return;
01681   }
01682   if (used != argc) {
01683     char *args;
01684     long alen;
01685 
01686     args = scheme_make_args_string("", -1, argc, argv, &alen);
01687 
01688     if (used > argc) {
01689       scheme_raise_exn(MZEXN_FAIL_CONTRACT,
01690                      "%s: format string requires %d arguments, given %d%t",
01691                      procname, used - offset, argc - offset, args, alen);
01692     } else {
01693       scheme_raise_exn(MZEXN_FAIL_CONTRACT,
01694                      "%s: format string requires %d arguments, given %d%t",
01695                      procname, used - offset, argc - offset, args, alen);
01696     }
01697     return;
01698   }
01699   if (num_err || char_err) {
01700     int pos = (num_err ? num_err : char_err) - 1;
01701     char *args, *bstr;
01702     long alen;
01703     int blen;
01704     char *type = (num_err ? "exact-number" : "character");
01705     Scheme_Object *bad = argv[pos];
01706 
01707     args = scheme_make_args_string("other ", pos, argc, argv, &alen);
01708     bstr = scheme_make_provided_string(bad, 1, &blen);
01709     scheme_raise_exn(MZEXN_FAIL_CONTRACT,
01710                    "%s: format string requires argument of type <%s>, given %t%t",
01711                    procname, type,
01712                    bstr, blen,
01713                    args, alen);
01714     return;
01715   }
01716 
01717   for (used = offset, i = start = 0; i < flen; i++) {
01718     if (format[i] == '~') {
01719       if (start < i) {
01720        (void)scheme_put_char_string(procname, port, format, start, i - start);
01721       }
01722       i++;
01723       if (scheme_isspace(format[i])) {
01724        /* skip spaces (at most one newline) */
01725        do {
01726          if ((format[i] == '\n') || (format[i] == '\r')) {
01727            /* got one */
01728            if ((format[i] == '\r') && (format[i + 1] == '\n'))
01729              i++; /* Windows-style CR-NL */
01730            i++;
01731            while (portable_isspace(format[i])
01732                  && !((format[i] == '\n') || (format[i] == '\r'))) {
01733              i++;
01734            }
01735            break;
01736          } else
01737            i++;
01738        } while (scheme_isspace(format[i]));
01739        --i; /* back up over something */
01740       } else switch (format[i]) {
01741       case '~':
01742        scheme_write_byte_string("~", 1, port);
01743        break;
01744       case '%':
01745       case 'n':
01746       case 'N':
01747        scheme_write_byte_string("\n", 1, port);
01748        break;
01749       case 'c':
01750       case 'C':
01751       case 'a':
01752       case 'A':
01753        a[0] = argv[used++];
01754        a[1] = port;
01755        _scheme_apply(scheme_display_proc, 2, a);
01756        break;
01757       case 's':
01758       case 'S':
01759        a[0] = argv[used++];
01760        a[1] = port;
01761        _scheme_apply(scheme_write_proc, 2, a);
01762        break;
01763       case 'v':
01764       case 'V':
01765        a[0] = argv[used++];
01766        a[1] = port;
01767        _scheme_apply(scheme_print_proc, 2, a);
01768        break;
01769       case 'e':
01770       case 'E':
01771        {
01772          int len;
01773          char *s;
01774          s = scheme_make_provided_string(argv[used++], 0, &len);
01775          scheme_write_byte_string(s, len, port);
01776        }
01777        break;
01778       case 'x':
01779       case 'X':
01780       case 'o':
01781       case 'O':
01782       case 'b':
01783       case 'B':
01784        {
01785          char *s;
01786          int radix;
01787 
01788          switch(format[i]) {
01789          case 'x':
01790          case 'X':
01791            radix = 16;
01792            break;
01793          case 'o':
01794          case 'O':
01795            radix = 8;
01796            break;
01797          default:
01798          case 'b':
01799          case 'B':
01800            radix = 2;
01801            break;
01802          }
01803          s = scheme_number_to_string(radix, argv[used++]);
01804 
01805          scheme_write_byte_string(s, strlen(s), port);
01806        }
01807        break;
01808       }
01809       SCHEME_USE_FUEL(1);
01810       start = i + 1;
01811     }
01812   }
01813 
01814   SCHEME_USE_FUEL(flen);
01815 
01816   if (start < i) {
01817     (void)scheme_put_char_string(procname, port, format, start, i - start);
01818   }
01819 }
01820 
01821 char *scheme_format(mzchar *format, int flen, int argc, Scheme_Object **argv, long *rlen)
01822 {
01823   Scheme_Object *port;
01824   port = scheme_make_byte_string_output_port();
01825   scheme_do_format("format", port, format, flen, 0, 0, argc, argv);
01826   return scheme_get_sized_byte_string_output(port, rlen);
01827 }
01828 
01829 void scheme_printf(mzchar *format, int flen, int argc, Scheme_Object **argv)
01830 {
01831   scheme_do_format("printf", scheme_get_param(scheme_current_config(), MZCONFIG_OUTPUT_PORT),
01832                  format, flen, 0, 0, argc, argv);
01833 }
01834 
01835 char *scheme_format_utf8(char *format, int flen, int argc, Scheme_Object **argv, long *rlen)
01836 {
01837   mzchar *s;
01838   long srlen;
01839   if (flen == -1)
01840     flen = strlen(format);
01841   s = scheme_utf8_decode_to_buffer_len((unsigned char *)format, flen, NULL, 0, &srlen);
01842   if (s)
01843     return scheme_format(s, srlen, argc, argv, rlen);
01844   else
01845     return "";
01846 }
01847 
01848 void scheme_printf_utf8(char *format, int flen, int argc, Scheme_Object **argv)
01849 {
01850   mzchar *s;
01851   long srlen;
01852   if (flen == -1)
01853     flen = strlen(format);
01854   s = scheme_utf8_decode_to_buffer_len((unsigned char *)format, flen, NULL, 0, &srlen);
01855   if (s)
01856     scheme_printf(s, srlen, argc, argv);
01857 }
01858 
01859 
01860 static Scheme_Object *
01861 format(int argc, Scheme_Object *argv[])
01862 {
01863   Scheme_Object *port;
01864   char *s;
01865   long len;
01866 
01867   port = scheme_make_byte_string_output_port();
01868 
01869   scheme_do_format("format", port, NULL, 0, 0, 1, argc, argv);
01870 
01871   s = scheme_get_sized_byte_string_output(port, &len);
01872   return scheme_make_sized_utf8_string(s, len);
01873 }
01874 
01875 static Scheme_Object *
01876 sch_printf(int argc, Scheme_Object *argv[])
01877 {
01878   scheme_do_format("printf", scheme_get_param(scheme_current_config(), MZCONFIG_OUTPUT_PORT),
01879                  NULL, 0, 0, 1, argc, argv);
01880   return scheme_void;
01881 }
01882 
01883 static Scheme_Object *
01884 sch_fprintf(int argc, Scheme_Object *argv[])
01885 {
01886   if (!SCHEME_OUTPUT_PORTP(argv[0]))
01887     scheme_wrong_type("fprintf", "output-port", 0, argc, argv);
01888 
01889   scheme_do_format("fprintf", argv[0], NULL, 0, 1, 2, argc, argv);
01890   return scheme_void;
01891 }
01892 
01893 /********************************************************************/
01894 /*                              misc                                */
01895 /********************************************************************/
01896 
01897 static Scheme_Object *
01898 version(int argc, Scheme_Object *argv[])
01899 {
01900   if (!vers_str) {
01901     REGISTER_SO(vers_str);
01902     vers_str = scheme_make_utf8_string(scheme_version());
01903     SCHEME_SET_CHAR_STRING_IMMUTABLE(vers_str);
01904   }
01905 
01906   return vers_str;
01907 }
01908 
01909 static Scheme_Object *
01910 banner(int argc, Scheme_Object *argv[])
01911 {
01912   if (!banner_str) {
01913     REGISTER_SO(banner_str);
01914     banner_str = scheme_make_utf8_string(scheme_banner());
01915     SCHEME_SET_CHAR_STRING_IMMUTABLE(banner_str);
01916   }
01917 
01918   return banner_str;
01919 }
01920 
01921 char *scheme_version(void)
01922 {
01923   return MZSCHEME_VERSION;
01924 }
01925 
01926 #ifdef MZ_PRECISE_GC
01927 # define VERSION_SUFFIX " [3m]"
01928 #else
01929 # ifdef USE_SENORA_GC
01930 #  define VERSION_SUFFIX " [cgc~]"
01931 # else
01932 #  define VERSION_SUFFIX " [cgc]"
01933 # endif
01934 #endif
01935 
01936 char *scheme_banner(void)
01937 {
01938   if (embedding_banner)
01939     return embedding_banner;
01940   else
01941     return "Welcome to MzScheme"
01942       " v" MZSCHEME_VERSION VERSION_SUFFIX
01943       ", Copyright (c) 2004-2009 PLT Scheme Inc.\n";
01944 }
01945 
01946 void scheme_set_banner(char *s)
01947 {
01948   embedding_banner = s;
01949 }
01950 
01951 int scheme_byte_string_has_null(Scheme_Object *o)
01952 {
01953   const char *s = SCHEME_BYTE_STR_VAL(o);
01954   int i = SCHEME_BYTE_STRTAG_VAL(o);
01955   while (i--) {
01956     if (!s[i])
01957       return 1;
01958   }
01959   return 0;
01960 }
01961 
01962 int scheme_any_string_has_null(Scheme_Object *o)
01963 {
01964   if (SCHEME_BYTE_STRINGP(o))
01965     return scheme_byte_string_has_null(o);
01966   else {
01967     const mzchar *s = SCHEME_CHAR_STR_VAL(o);
01968     int i = SCHEME_CHAR_STRTAG_VAL(o);
01969     while (i--) {
01970       if (!s[i])
01971        return 1;
01972     }
01973     return 0;
01974   }
01975 }
01976 
01977 #ifdef DOS_FILE_SYSTEM
01978 # include <windows.h>
01979 static char *mzGETENV(char *s)
01980 {
01981   int sz, got;
01982   char *res;
01983 
01984   sz = GetEnvironmentVariable(s, NULL, 0);
01985   if (!sz)
01986     return NULL;
01987   res = scheme_malloc_atomic(sz);
01988   got = GetEnvironmentVariable(s, res, sz);
01989   if (got < sz)
01990     res[got] = 0;
01991   return res;
01992 }
01993 
01994 static int mzPUTENV(char *var, char *val, char *together)
01995 {
01996   return !SetEnvironmentVariable(var, val);
01997 }
01998 
01999 #else
02000 # define mzGETENV getenv
02001 # define mzPUTENV(var, val, s) MSC_IZE(putenv)(s)
02002 #endif
02003 
02004 void
02005 scheme_init_getenv(void)
02006 {
02007 #ifndef GETENV_FUNCTION
02008   FILE *f = fopen("Environment", "r");
02009   if (f) {
02010     Scheme_Object *p = scheme_make_file_input_port(f);
02011     mz_jmp_buf *savebuf, newbuf;
02012     savebuf = scheme_current_thread->error_buf;
02013     scheme_current_thread->error_buf = &newbuf;
02014     if (!scheme_setjmp(newbuf)) {
02015       while (1) {
02016        Scheme_Object *v = scheme_read(p);
02017        if (SCHEME_EOFP(v))
02018          break;
02019 
02020        if (SCHEME_PAIRP(v) && SCHEME_PAIRP(SCHEME_CDR(v))
02021            && SCHEME_NULLP(SCHEME_CDR(SCHEME_CDR(v)))) {
02022          Scheme_Object *key = SCHEME_CAR(v);
02023          Scheme_Object *val = SCHEME_CADR(v);
02024          if (SCHEME_STRINGP(key) && SCHEME_STRINGP(val)) {
02025            Scheme_Object *a[2];
02026            a[0] = key;
02027            a[1] = val;
02028            sch_putenv(2, a);
02029            v = NULL;
02030          }
02031        }
02032 
02033        if (v)
02034          scheme_signal_error("bad environment specification: %V", v);
02035       }
02036     }
02037     scheme_current_thread->error_buf = savebuf;
02038     scheme_close_input_port(p);
02039 
02040     if (scheme_hash_get(putenv_str_table, (Scheme_Object *)"PLTNOMZJIT")) {
02041       scheme_set_startup_use_jit(0);
02042     }
02043   }
02044 #else
02045   if (mzGETENV("PLTNOMZJIT")) {
02046     scheme_set_startup_use_jit(0);
02047   }
02048 #endif
02049 }
02050 
02051 static Scheme_Object *sch_getenv(int argc, Scheme_Object *argv[])
02052 {
02053   char *s;
02054   Scheme_Object *bs;
02055 
02056   if (!SCHEME_CHAR_STRINGP(argv[0])
02057       || scheme_any_string_has_null(argv[0]))
02058     scheme_wrong_type("getenv", CHAR_STRING_W_NO_NULLS, 0, argc, argv);
02059 
02060   bs = scheme_char_string_to_byte_string_locale(argv[0]);
02061 
02062 #ifdef GETENV_FUNCTION
02063   s = mzGETENV(SCHEME_BYTE_STR_VAL(bs));
02064 #else
02065   if (putenv_str_table) {
02066     s = (char *)scheme_hash_get(putenv_str_table, (Scheme_Object *)SCHEME_BYTE_STR_VAL(argv[0]));
02067     /* If found, skip over the `=' in the table: */
02068     if (s)
02069       s += SCHEME_BYTE_STRTAG_VAL(bs) + 1;
02070   } else
02071     s = NULL;
02072 #endif
02073 
02074   if (s)
02075     return scheme_make_locale_string(s);
02076 
02077   return scheme_false;
02078 }
02079 
02080 static Scheme_Object *sch_putenv(int argc, Scheme_Object *argv[])
02081 {
02082   char *s, *var, *val;
02083   long varlen, vallen;
02084   Scheme_Object *bs;
02085 
02086   if (!SCHEME_CHAR_STRINGP(argv[0])
02087       || scheme_any_string_has_null(argv[0]))
02088     scheme_wrong_type("putenv", CHAR_STRING_W_NO_NULLS, 0, argc, argv);
02089   if (!SCHEME_CHAR_STRINGP(argv[1])
02090       || scheme_any_string_has_null(argv[1]))
02091     scheme_wrong_type("putenv", CHAR_STRING_W_NO_NULLS, 1, argc, argv);
02092 
02093   bs = scheme_char_string_to_byte_string_locale(argv[0]);
02094   var = SCHEME_BYTE_STR_VAL(bs);
02095 
02096   bs = scheme_char_string_to_byte_string_locale(argv[1]);
02097   val = SCHEME_BYTE_STR_VAL(bs);
02098 
02099   varlen = strlen(var);
02100   vallen = strlen(val);
02101 
02102   s = (char *)scheme_malloc_atomic(varlen + vallen + 2);
02103   memcpy(s, var, varlen);
02104   memcpy(s + varlen + 1, val, vallen + 1);
02105   s[varlen] = '=';
02106 
02107 #ifdef MZ_PRECISE_GC
02108   {
02109     /* Can't put moveable string into array. */
02110     char *ss;
02111     ss = s;
02112     s = malloc(varlen + vallen + 2);
02113     memcpy(s, ss, varlen + vallen + 2);
02114 
02115     /* Free old, if in table: */
02116     if (putenv_str_table) {
02117       ss = (char *)scheme_hash_get(putenv_str_table, (Scheme_Object *)var);
02118       if (ss)
02119        free(ss);
02120     }
02121   }
02122 #endif
02123 
02124   if (!putenv_str_table)
02125     putenv_str_table = scheme_make_hash_table(SCHEME_hash_string);
02126 
02127   scheme_hash_set(putenv_str_table, (Scheme_Object *)var, (Scheme_Object *)s);
02128 
02129 #ifdef GETENV_FUNCTION
02130   return mzPUTENV(var, val, s) ? scheme_false : scheme_true;
02131 #else
02132   return scheme_true;
02133 #endif
02134 }
02135 
02136 static void machine_details(char *s);
02137 
02138 static Scheme_Object *system_type(int argc, Scheme_Object *argv[])
02139 {
02140   if (argc) {
02141     Scheme_Object *sym;
02142     sym = scheme_intern_symbol("link");
02143     if (SAME_OBJ(argv[0], sym)) {
02144 #if defined(OS_X) && !defined(XONX)
02145       return scheme_intern_symbol("framework");
02146 #else
02147 # ifdef DOS_FILE_SYSTEM
02148       return scheme_intern_symbol("dll");
02149 # else
02150 #  ifdef MZ_USES_SHARED_LIB
02151       return scheme_intern_symbol("shared");
02152 #  else
02153       return scheme_intern_symbol("static");
02154 #  endif
02155 # endif
02156 #endif
02157     }
02158 
02159     sym = scheme_intern_symbol("machine");
02160     if (SAME_OBJ(argv[0], sym)) {
02161       char buff[1024];
02162       
02163       machine_details(buff);
02164     
02165       return scheme_make_utf8_string(buff);
02166     }
02167 
02168     sym = scheme_intern_symbol("gc");
02169     if (SAME_OBJ(argv[0], sym)) {
02170 #ifdef MZ_PRECISE_GC
02171       return scheme_intern_symbol("3m");
02172 #else
02173       return scheme_intern_symbol("cgc");
02174 #endif
02175     }
02176 
02177     sym = scheme_intern_symbol("so-suffix");
02178     if (SAME_OBJ(argv[0], sym)) {
02179 #ifdef DOS_FILE_SYSTEM
02180       return scheme_make_byte_string(".dll");
02181 #else
02182 # ifdef OS_X
02183       return scheme_make_byte_string(".dylib");
02184 # else
02185 #  ifdef USE_CYGWIN_SO_SUFFIX
02186       return scheme_make_byte_string(".dll");
02187 #  else
02188       return scheme_make_byte_string(".so");
02189 #  endif
02190 # endif
02191 #endif
02192     }
02193 
02194     sym = scheme_intern_symbol("os");
02195     if (!SAME_OBJ(argv[0], sym)) {
02196       scheme_wrong_type("system-type", "'os, 'link, 'machine, 'gc, or 'so-suffix", 0, argc, argv);
02197       return NULL;
02198     }
02199   }
02200 
02201   return sys_symbol;
02202 }
02203 
02204 static Scheme_Object *system_library_subpath(int argc, Scheme_Object *argv[])
02205 {
02206   if (argc > 0) {
02207     Scheme_Object *sym;
02208 
02209     if (SCHEME_FALSEP(argv[0]))
02210       return platform_cgc_path;
02211     
02212     sym = scheme_intern_symbol("cgc");
02213     if (SAME_OBJ(sym, argv[0]))
02214       return platform_cgc_path;
02215 
02216     sym = scheme_intern_symbol("3m");
02217     if (SAME_OBJ(sym, argv[0]))
02218       return platform_3m_path;
02219 
02220     scheme_wrong_type("system-library-subpath", "'cgc, '3m, or #f", 0, argc, argv);
02221     return NULL;
02222   } else {
02223 #ifdef MZ_PRECISE_GC
02224     return platform_3m_path;
02225 #else
02226     return platform_cgc_path;
02227 #endif
02228   }
02229 }
02230 
02231 const char *scheme_system_library_subpath()
02232 {
02233   return SCHEME_PLATFORM_LIBRARY_SUBPATH;
02234 }
02235 
02236 /* Our own strncpy - which would be really stupid, except the one for
02237    the implementation in Solaris 2.6 is broken (it doesn't always stop
02238    at the null terminator). */
02239 int scheme_strncmp(const char *a, const char *b, int len)
02240 {
02241   while (len-- && (*a == *b) && *a) {
02242     a++;
02243     b++;
02244   }
02245 
02246   if (len < 0)
02247     return 0;
02248   else
02249     return *a - *b;
02250 }
02251 
02252 static Scheme_Object *ok_cmdline(int argc, Scheme_Object **argv)
02253 {
02254   if (SCHEME_VECTORP(argv[0])) {
02255     Scheme_Object *vec = argv[0], *vec2, *str;
02256     int i, size = SCHEME_VEC_SIZE(vec);
02257 
02258 
02259     if (!size)
02260       return vec;
02261 
02262     for (i = 0; i < size; i++) {
02263       if (!SCHEME_CHAR_STRINGP(SCHEME_VEC_ELS(vec)[i]))
02264        return NULL;
02265     }
02266 
02267     /* Make sure vector and strings are immutable: */
02268     vec2 = scheme_make_vector(size, NULL);
02269     if (size)
02270       SCHEME_SET_VECTOR_IMMUTABLE(vec2);
02271     for (i = 0; i < size; i++) {
02272       str = SCHEME_VEC_ELS(vec)[i];
02273       if (!SCHEME_IMMUTABLE_CHAR_STRINGP(str)) {
02274        str = scheme_make_sized_char_string(SCHEME_CHAR_STR_VAL(str), SCHEME_CHAR_STRLEN_VAL(str), 0);
02275        SCHEME_SET_CHAR_STRING_IMMUTABLE(str);
02276       }
02277       SCHEME_VEC_ELS(vec2)[i] = str;
02278     }
02279 
02280     return vec2;
02281   }
02282 
02283   return NULL;
02284 }
02285 
02286 static Scheme_Object *cmdline_args(int argc, Scheme_Object *argv[])
02287 {
02288   return scheme_param_config("current-command-line-arguments",
02289                           scheme_make_integer(MZCONFIG_CMDLINE_ARGS),
02290                           argc, argv,
02291                           -1, ok_cmdline, "vector of strings", 1);
02292 }
02293 
02294 /**********************************************************************/
02295 /*                           locale ops                               */
02296 /**********************************************************************/
02297 
02298 static Scheme_Object *ok_locale(int argc, Scheme_Object **argv)
02299 {
02300   if (SCHEME_FALSEP(argv[0]))
02301     return argv[0];
02302   else if (SCHEME_CHAR_STRINGP(argv[0])) {
02303     if (SCHEME_IMMUTABLEP(argv[0]))
02304       return argv[0];
02305     else {
02306       Scheme_Object *str = argv[0];
02307       str = scheme_make_immutable_sized_char_string(SCHEME_CHAR_STR_VAL(str), SCHEME_CHAR_STRLEN_VAL(str), 1);
02308       return str;
02309     }
02310   }
02311 
02312   return NULL;
02313 }
02314 
02315 static Scheme_Object *current_locale(int argc, Scheme_Object *argv[])
02316 {
02317   Scheme_Object *v;
02318 
02319   v = scheme_param_config("current-locale",
02320                        scheme_make_integer(MZCONFIG_LOCALE),
02321                        argc, argv,
02322                        -1, ok_locale, "#f or string", 1);
02323 
02324   return v;
02325 }
02326 
02327 static Scheme_Object *locale_string_encoding(int argc, Scheme_Object *argv[])
02328 {
02329   reset_locale();
02330   if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on)
02331     return scheme_make_utf8_string("UTF-8");
02332   
02333 #if HAVE_CODESET
02334   return scheme_make_utf8_string(nl_langinfo(CODESET));
02335 #else
02336   /* nl_langinfo doesn't work, so just make up something */
02337   return scheme_make_utf8_string("UTF-8");
02338 #endif
02339 }
02340 
02341 static Scheme_Object *system_language_country(int argc, Scheme_Object *argv[])
02342 {
02343 #ifdef MACOS_UNICODE_SUPPORT
02344   /* Mac OS X */
02345   CFLocaleRef l;
02346   CFStringRef s;
02347   int len;
02348   char *r;
02349 
02350   l = CFLocaleCopyCurrent();
02351   s = CFLocaleGetIdentifier(l);
02352 
02353   len = CFStringGetLength(s);
02354   r = (char *)scheme_malloc_atomic(len * 6 + 1);
02355   CFStringGetCString(s, r, len * 6 + 1, kCFStringEncodingUTF8);
02356 
02357   CFRelease(l);
02358 
02359   return scheme_make_sized_utf8_string(r, 5);
02360 #else
02361 # ifdef WINDOWS_UNICODE_SUPPORT
02362   /* Windows */
02363   LCID l;
02364   int llen, clen;
02365   char *lang, *country, *s;
02366   l = GetUserDefaultLCID();
02367 
02368   llen = GetLocaleInfo(l, LOCALE_SENGLANGUAGE, NULL, 0);
02369   lang = (char *)scheme_malloc_atomic(llen);
02370   GetLocaleInfo(l, LOCALE_SENGLANGUAGE, lang, llen);
02371   if (llen)
02372     llen -= 1; /* drop nul terminator */
02373 
02374   clen = GetLocaleInfo(l, LOCALE_SENGCOUNTRY, NULL, 0);
02375   country = (char *)scheme_malloc_atomic(clen);
02376   GetLocaleInfo(l, LOCALE_SENGCOUNTRY, country, clen);
02377   if (clen)
02378     clen -= 1; /* drop nul terminator */
02379 
02380   s = (char *)scheme_malloc_atomic(clen + llen + 1);
02381   memcpy(s, lang, llen);
02382   memcpy(s + 1 + llen, country, clen);
02383   s[llen] = '_';
02384   
02385   return scheme_make_sized_utf8_string(s, llen + 1 + clen);
02386 # else
02387   /* Unix */
02388   char *s;
02389   
02390   s = getenv("LC_ALL");
02391   if (!s)
02392     s = getenv("LC_CTYPE");
02393   if (!s)
02394     s = getenv("LANG");
02395   
02396   if (s) {
02397     /* Check that the environment variable has the form
02398        xx_XX[.ENC] */
02399     if ((s[0] >= 'a') && (s[0] <= 'z')
02400        && (s[1] >= 'a') && (s[1] <= 'z')
02401        && (s[2] == '_')
02402        && (s[3] >= 'A') && (s[3] <= 'Z')
02403        && (s[4] >= 'A') && (s[4] <= 'Z')
02404        && (!s[5] || s[5] == '.')) {
02405       /* Good */
02406     } else
02407       s = NULL;
02408   }
02409   
02410   if (!s)
02411     s = "en_US";
02412   
02413   return scheme_make_sized_utf8_string(s, 5);
02414 # endif
02415 #endif
02416 }
02417 
02418 #ifndef DONT_USE_LOCALE
02419 
02420 #define ICONV_ARG_CAST /* empty */
02421 
02422 static char *do_convert(iconv_t cd,
02423                      /* if cd == -1 and either from_e or to_e can be NULL, then
02424                         reset_locale() must have been called */
02425                      const char *from_e, const char *to_e,
02426                      /* 1 => UCS-4 -> UTF-8; 2 => UTF-8 -> UCS-4; 0 => other */
02427                      int to_from_utf8,
02428                      /* in can be NULL to output just a shift; in that case,
02429                         id should be 0, too */
02430                      char *in, int id, int iilen,
02431                      char *out, int od, int iolen,
02432                      /* if grow, then reallocate when out isn't big enough */
02433                      int grow,
02434                      /* if add_end_shift, add a shift sequence to the end;
02435                         not useful if in is already NULL to indicate a shift */
02436                      int add_end_shift,
02437                      /* extra specifies the length of a terminator,
02438                         not included in iolen or *oolen */
02439                      int extra,
02440                      /* these two report actual read/wrote sizes: */
02441                      long *oilen, long *oolen,
02442                      /* status is set to
02443                         0 for complete,
02444                         -1 for partial input,
02445                         -2 for error,
02446                         1 for more avail */
02447                      int *status)
02448 {
02449   int dip, dop, close_it = 0, mz_utf8 = 0;
02450   size_t il, ol, r;
02451   GC_CAN_IGNORE char *ip, *op;
02452 
02453   /* Defaults: */
02454   *status = -1;
02455   if (oilen)
02456     *oilen = 0;
02457   *oolen = 0;
02458 
02459   if (cd == (iconv_t)-1) {
02460     if (!iconv_ready) init_iconv();
02461     if (mzCHK_PROC(iconv_open)) {
02462       if (!from_e)
02463        from_e = mz_iconv_nl_langinfo();
02464       if (!to_e)
02465        to_e = mz_iconv_nl_langinfo();
02466       cd = iconv_open(to_e, from_e);
02467       close_it = 1;
02468     } else if (to_from_utf8) {
02469       /* Assume UTF-8 */
02470       mz_utf8 = 1;
02471     }
02472   }
02473 
02474   if ((cd == (iconv_t)-1) && !mz_utf8) {
02475     if (out) {
02476       while (extra--) {
02477        out[extra] = 0;
02478       }
02479     }
02480     return out;
02481   }
02482 
02483   /* The converter is ready. Allocate out space, if necessary */
02484 
02485   if (!out) {
02486     if (iolen <= 0)
02487       iolen = iilen;
02488     out = (char *)scheme_malloc_atomic(iolen + extra);
02489     od = 0;
02490   }
02491 
02492   /* il and ol are the number of available chars */
02493   il = iilen;
02494   ol = iolen;
02495   /* dip and dop are the number of characters read so far;
02496      we use these and NULL out the ip and op pointers
02497      for the sake of precise GC */
02498   dip = 0;
02499   dop = 0;
02500   if (!in)
02501     add_end_shift = 0;
02502 
02503   while (1) {
02504     int icerr;
02505 
02506     if (mz_utf8) {
02507       /* Use our UTF-8 routines as if they were iconv */
02508       if (to_from_utf8 == 1) {
02509        /* UCS-4 -> UTF-8 */
02510        /* We assume that in + id and iilen are mzchar-aligned */
02511        int opos, uid, uilen;
02512        uid = (id + dip) >> 2;
02513        uilen = (iilen - dip) >> 2;
02514        opos = scheme_utf8_encode((const unsigned int *)in, uid, uilen,
02515                               NULL, 0,
02516                               0);
02517        if (opos <= iolen) {
02518          opos = scheme_utf8_encode((const unsigned int *)in, uid, uilen,
02519                                 (unsigned char *)out, od + dop,
02520                                 0);
02521          dop += opos;
02522          dip += iilen;
02523          icerr = 0;
02524          r = (size_t)opos;
02525        } else {
02526          icerr = E2BIG;
02527          r = (size_t)-1;
02528        }
02529       } else {
02530        /* UTF-8 -> UCS-4 */
02531        /* We assume that out + od is mzchar-aligned */
02532        long ipos, opos;
02533 
02534        r = utf8_decode_x((unsigned char *)in, id + dip, iilen,
02535                        (unsigned int *)out, (od + dop) >> 2, iolen >> 2,
02536                        &ipos, &opos,
02537                        0, 0, NULL, 0, 0);
02538        
02539        opos <<= 2;
02540        dop = (opos - od);
02541        dip = (ipos - id);
02542 
02543        if ((r == -1) || (r == -2)) {
02544          r = (size_t)-1;
02545          icerr = EILSEQ;
02546        } else if (r == -3) {
02547          icerr = E2BIG;
02548          r = (size_t)-1;
02549        } else
02550          icerr = 0;
02551       }
02552     } else  {
02553       ip = in XFORM_OK_PLUS id + dip;
02554       op = out XFORM_OK_PLUS od + dop;
02555       r = iconv(cd, ICONV_ARG_CAST &ip, &il, &op, &ol);
02556       dip = ip - (in XFORM_OK_PLUS id);
02557       dop = op - (out XFORM_OK_PLUS od);
02558       ip = op = NULL;
02559       icerr = ICONV_errno;
02560     }
02561 
02562     /* Record how many chars processed, now */
02563     if (oilen)
02564       *oilen = dip;
02565     *oolen = dop;
02566 
02567     /* Got all the chars? */
02568     if (r == (size_t)-1) {
02569       if (icerr == E2BIG) {
02570        if (grow) {
02571          /* Double the string size and try again */
02572          char *naya;
02573          naya = (char *)scheme_malloc_atomic((iolen * 2) + extra);
02574          memcpy(naya, out + od, *oolen);
02575          ol += iolen;
02576          iolen += iolen;
02577          out = naya;
02578          od = 0;
02579        } else {
02580          *status = 1;
02581          if (close_it)
02582            iconv_close(cd);
02583          while (extra--) {
02584            out[od + dop + extra] = 0;
02585          }
02586          return out;
02587        }
02588       } else {
02589        /* Either EINVAL (premature end) or EILSEQ (bad sequence) */
02590        if (icerr == EILSEQ)
02591          *status = -2;
02592        if (close_it)
02593          iconv_close(cd);
02594        while (extra--) {
02595          out[od + dop + extra] = 0;
02596        }
02597        return out;
02598       }
02599     } else {
02600       /* All done... */
02601       if (add_end_shift) {
02602        add_end_shift = 0;
02603        in = NULL;
02604        dip = 0;
02605        id = 0;
02606        il = 0; /* should be redundant */
02607        oilen = NULL; /* so it doesn't get set to 0 */
02608       } else {
02609        *status = 0;
02610        if (close_it)
02611          iconv_close(cd);
02612        while (extra--) {
02613          out[od + dop + extra] = 0;
02614        }
02615        return out;
02616       }
02617     }
02618   }
02619 }
02620 
02621 #define MZ_SC_BUF_SIZE 32
02622 
02623 static char *string_to_from_locale(int to_bytes,
02624                                char *in, int delta, int len,
02625                                long *olen, int perm,
02626                                int *no_cvt)
02627      /* Call this function only when iconv is available, and only when
02628        reset_locale() has been called */
02629 {
02630   Scheme_Object *parts = scheme_null, *one;
02631   char *c;
02632   long clen, used;
02633   int status;
02634   iconv_t cd;
02635 
02636   if (!iconv_ready) init_iconv();
02637 
02638   if (to_bytes)
02639     cd = iconv_open(mz_iconv_nl_langinfo(), MZ_UCS4_NAME);
02640   else
02641     cd = iconv_open(MZ_UCS4_NAME, mz_iconv_nl_langinfo());
02642   if (cd == (iconv_t)-1) {
02643     *no_cvt = 1;
02644     return NULL;
02645   }
02646   *no_cvt = 0;
02647 
02648   while (len) {
02649     /* We might have conversion errors... */
02650     c = do_convert(cd, NULL, NULL, 0,
02651                  (char *)in, (to_bytes ? 4 : 1) * delta, (to_bytes ? 4 : 1) * len,
02652                  NULL, 0, (to_bytes ? 1 : 4) * (len + 1),
02653                  1 /* grow */, 1, (to_bytes ? 1 : 4) /* terminator size */,
02654                  &used, &clen,
02655                  &status);
02656 
02657     if (to_bytes)
02658       used >>= 2;
02659 
02660     if ((perm < 0) && (used < len)) {
02661       iconv_close(cd);
02662       return NULL;
02663     }
02664 
02665     delta += used;
02666     len -= used;
02667 
02668     if (!len && SCHEME_NULLP(parts)) {
02669       if (to_bytes) {
02670        *olen = clen;
02671        c[*olen] = 0;
02672       } else {
02673        *olen = (clen >> 2);
02674        ((mzchar *)c)[*olen] = 0;
02675       }
02676       iconv_close(cd);
02677       return c;
02678     }
02679 
02680     /* We can get here if there was some conversion error at some
02681        point. We're building up a list of parts. */
02682 
02683     if (to_bytes) {
02684       one = scheme_make_sized_byte_string(c, clen, 0);
02685     } else {
02686       one = scheme_make_sized_char_string((mzchar *)c, clen >> 2, 0);
02687     }
02688 
02689     parts = scheme_make_pair(one, parts);
02690 
02691     if (len) {
02692       /* Conversion error, so skip one char. */
02693       if (to_bytes) {
02694        char bc[1];
02695        bc[0] = perm;
02696        one = scheme_make_sized_byte_string(bc, 1, 1);
02697       } else {
02698        mzchar bc[1];
02699        bc[0] = perm;
02700        one = scheme_make_sized_char_string(bc, 1, 1);
02701       }
02702       parts = scheme_make_pair(one, parts);
02703       delta += 1;
02704       len -= 1;
02705     }
02706   }
02707 
02708   iconv_close(cd);
02709 
02710   if (to_bytes) {
02711     parts = append_all_byte_strings_backwards(parts);
02712     *olen = SCHEME_BYTE_STRTAG_VAL(parts);
02713 
02714     return SCHEME_BYTE_STR_VAL(parts);
02715   } else {
02716     parts = append_all_strings_backwards(parts);
02717     *olen = SCHEME_CHAR_STRTAG_VAL(parts);
02718 
02719     return (char *)SCHEME_CHAR_STR_VAL(parts);
02720   }
02721 }
02722 
02723 static char *locale_recase(int to_up,
02724                         /* in must be null-terminated, iilen doesn't include it */
02725                         char *in, int id, int iilen,
02726                         /* iolen, in contrast, includes the terminator */
02727                         char *out, int od, int iolen,
02728                         long *oolen)
02729      /* Assumes that reset_locale() has been called */
02730 {
02731 #ifdef NO_MBTOWC_FUNCTIONS
02732   /* No wide-char functions...
02733      The C library's toupper and tolower is supposed to be
02734      locale-sensitive. It can't be right for characters that are
02735      encoded in multiple bytes, but probably it will do the right
02736      thing in common cases. */
02737   int i;
02738 
02739   /* First, copy "in" to "out" */
02740   if (iilen + 1 >= iolen) {
02741     out = (char *)scheme_malloc_atomic(iilen + 1);
02742     od = 0;
02743   }
02744   memcpy(out + od, in + id, iilen);
02745   out[od + iilen] = 0;
02746   *oolen = iilen;
02747 
02748   /* Re-case chars in "out" */
02749   for (i = 0; i < iilen; i++) {
02750     if (to_up)
02751       out[od + i] = toupper(out[od + i]);
02752     else
02753       out[od + i] = tolower(out[od + i]);
02754   }
02755 
02756   return out;
02757 #else
02758   /* To change the case, convert the string to multibyte, re-case the
02759      multibyte, then convert back. */
02760 # define MZ_WC_BUF_SIZE 32
02761   GC_CAN_IGNORE mbstate_t state;
02762   size_t wl, wl2, ml, ml2;
02763   wchar_t *wc, *ws, wcbuf[MZ_WC_BUF_SIZE], cwc;
02764   const char *s;
02765   unsigned int j;
02766   /* The "n" versions are apparently not too standard: */
02767 # define mz_mbsnrtowcs(t, f, fl, tl, s) mbsrtowcs(t, f, tl, s)
02768 # define mz_wcsnrtombs(t, f, fl, tl, s) wcsrtombs(t, f, tl, s)
02769 
02770   /* ----- to wide char ---- */
02771 
02772   /* Get length */
02773   memset(&state, 0, sizeof(mbstate_t));
02774   s = in XFORM_OK_PLUS id;
02775   wl = mz_mbsnrtowcs(NULL, &s, iilen, 0, &state);
02776   s = NULL;
02777   if (wl < 0) return NULL;
02778 
02779   /* Allocate space */
02780   if (wl < MZ_WC_BUF_SIZE) {
02781     wc = wcbuf;
02782   } else {
02783     wc = (wchar_t *)scheme_malloc_atomic(sizeof(wchar_t) * (wl + 1));
02784   }
02785 
02786   /* Convert */
02787   memset(&state, 0, sizeof(mbstate_t));
02788   s = in XFORM_OK_PLUS id;
02789   wl2 = mz_mbsnrtowcs(wc, &s, iilen, wl + 1, &state);
02790   s = NULL;
02791   if (wl2 < 0) return NULL; /* Very strange! */
02792 
02793   wc[wl] = 0; /* just in case */
02794 
02795   /* ---- re-case ---- */
02796 
02797   if (to_up) {
02798     for (j = 0; j < wl; j++) {
02799       cwc = towupper(wc[j]);
02800       wc[j] = cwc;
02801     }
02802   } else {
02803     for (j = 0; j < wl; j++) {
02804       cwc = towlower(wc[j]);
02805       wc[j] = cwc;
02806     }
02807   }
02808 
02809   /* ---- back to multibyte ---- */
02810 
02811   /* Measure */
02812   memset(&state, 0, sizeof(mbstate_t));
02813   ws = wc;
02814   ml = mz_wcsnrtombs(NULL, (const wchar_t **)&ws, wl, 0, &state);
02815   ws = NULL;
02816   if (ml < 0) return NULL;
02817 
02818   /* Allocate space */
02819   *oolen = ml;
02820   if (ml + 1 >= (unsigned int)iolen) {
02821     out = (char *)scheme_malloc_atomic(ml + 1);
02822     od = 0;
02823   }
02824 
02825   /* Convert */
02826   memset(&state, 0, sizeof(mbstate_t));
02827   ws = wc;
02828   ml2 = mz_wcsnrtombs(out + od, (const wchar_t **)&ws, wl, ml + 1, &state);
02829   ws = NULL;
02830   if (ml2 < 0) return NULL; /* Very strange! */
02831 
02832   out[od + ml] = 0;
02833 
02834   return out;
02835 #endif
02836 }
02837 
02838 int mz_locale_strcoll(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case)
02839      /* The s1 and s2 arguments are actually UCS-4.
02840         Assumes that reset_locale() has been called. */
02841 {
02842   long clen1, clen2, used1, used2, origl1, origl2;
02843   char *c1, *c2, buf1[MZ_SC_BUF_SIZE], buf2[MZ_SC_BUF_SIZE];
02844   char case_buf1[MZ_SC_BUF_SIZE], case_buf2[MZ_SC_BUF_SIZE];
02845   int status, got_more;
02846 
02847   /* First, convert UCS-4 to locale-specific encoding. If some
02848      characters don't fit into the encoding, then we'll have leftover
02849      characters. Count unconvertable charc as greater than anything
02850      that can be converted */
02851 
02852   origl1 = l1;
02853   origl2 = l2;
02854 
02855   /* Loop to check both convertable and unconvertable parts */
02856   while (1) {
02857     if (!origl1 && !origl2)
02858       return 0;
02859     if (!origl1)
02860       return -1;
02861     if (!origl2)
02862       return 1;
02863 
02864     /* Loop to get consistent parts of the wto strings, in case
02865        a conversion fails. */
02866     got_more = 0;
02867     l1 = origl1;
02868     l2 = origl2;
02869     while (1) {
02870       c1 = do_convert((iconv_t)-1, MZ_UCS4_NAME, NULL, 1,
02871                     s1, d1 * 4, 4 * l1,
02872                     buf1, 0, MZ_SC_BUF_SIZE - 1,
02873                     1 /* grow */, 0, 1 /* terminator size */,
02874                     &used1, &clen1,
02875                     &status);
02876       c2 = do_convert((iconv_t)-1, MZ_UCS4_NAME, NULL, 1,
02877                     s2, d2 * 4, 4 * l2,
02878                     buf2, 0, MZ_SC_BUF_SIZE - 1,
02879                     1 /* grow */, 0, 1 /* terminator size */,
02880                     &used2, &clen2,
02881                     &status);
02882 
02883       if ((used1 < 4 * l1) || (used2 < 4 * l2)) {
02884        if (got_more) {
02885          /* Something went wrong. We've already tried to
02886             even out the parts that work. Let's give up
02887             on the first characters */
02888          clen1 = clen2 = 0;
02889          break;
02890        } else if (used1 == used2) {
02891          /* Not everything, but both ended at the same point */
02892          break;
02893        } else {
02894          /* Pick the smallest */
02895          if (used2 < used1) {
02896            used1 = used2;
02897            got_more = 1;
02898          } else
02899            got_more = 2;
02900          l2 = (used1 >> 2);
02901          l1 = (used1 >> 2);
02902 
02903          if (!l1) {
02904            /* Nothing to get this time. */
02905            clen1 = clen2 = 0;
02906            c1 = c2 = "";
02907            used1 = used2 = 0;
02908            break;
02909          }
02910        }
02911       } else
02912        /* Got all that we wanted */
02913        break;
02914     }
02915 
02916     if (cvt_case) {
02917       if (clen1)
02918        c1 = locale_recase(0, c1, 0, clen1,
02919                         case_buf1, 0, MZ_SC_BUF_SIZE - 1,
02920                         &clen1);
02921       else
02922        c1 = NULL;
02923       if (clen2)
02924        c2 = locale_recase(0, c2, 0, clen2,
02925                         case_buf2, 0, MZ_SC_BUF_SIZE - 1,
02926                         &clen2);
02927       else
02928        c2 = NULL;
02929       /* There shouldn't have been conversion errors, but just in
02930         case, care of NULL. */
02931       if (!c1) c1 = "";
02932       if (!c2) c2 = "";
02933     }
02934 
02935     /* Collate, finally. */
02936     status = strcoll(c1, c2);
02937 
02938     /* If one is bigger than the other, we're done. */
02939     if (status)
02940       return status;
02941 
02942     /* Otherwise, is there more to check? */
02943     origl1 -= (used1 >> 2);
02944     origl2 -= (used2 >> 2);
02945     d1 += (used1 >> 2);
02946     d2 += (used2 >> 2);
02947     if (!origl1 && !origl2)
02948       return 0;
02949 
02950     /* There's more. It must be that the next character wasn't
02951        convertable in one of the encodings. */
02952     if (got_more)
02953       return ((got_more == 2) ? 1 : -1);
02954 
02955     if (!origl1)
02956       return -1;
02957 
02958     /* Compare an unconverable character directly. No case conversions
02959        if it's outside the locale. */
02960     if (((unsigned int *)s1)[d1] > ((unsigned int *)s2)[d2])
02961       return 1;
02962     else if (((unsigned int *)s1)[d1] < ((unsigned int *)s2)[d2])
02963       return -1;
02964     else {
02965       /* We've skipped one unconvertable char, and they still look the
02966         same.  Now try again. */
02967       origl1 -= 1;
02968       origl2 -= 1;
02969       d1 += 1;
02970       d2 += 1;
02971     }
02972   }
02973 }
02974 
02975 #ifdef MACOS_UNICODE_SUPPORT
02976 int mz_native_strcoll(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case)
02977      /* The s1 and s2 arguments are actually UTF-16. */
02978 {
02979   CFStringRef str1, str2;
02980   CFComparisonResult r;
02981 
02982   str1 = CFStringCreateWithBytes(NULL, (unsigned char *)s1 XFORM_OK_PLUS (d1 * 2), (l1 * 2), 
02983                              kCFStringEncodingUnicode, FALSE);
02984   str2 = CFStringCreateWithBytes(NULL, (unsigned char *)s2 XFORM_OK_PLUS (d2 * 2), (l2 * 2), 
02985                              kCFStringEncodingUnicode, FALSE);
02986 
02987   r = CFStringCompare(str1, str2, (kCFCompareLocalized
02988                                | (cvt_case ? kCFCompareCaseInsensitive : 0)));
02989 
02990   CFRelease(str1);
02991   CFRelease(str2);
02992 
02993   return (int)r;
02994 }
02995 #endif
02996 
02997 #ifdef WINDOWS_UNICODE_SUPPORT
02998 int mz_native_strcoll(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case)
02999      /* The s1 and s2 arguments are actually UTF-16. */
03000 {
03001   int r;
03002 
03003   r = CompareStringW(LOCALE_USER_DEFAULT,
03004                    ((cvt_case ? NORM_IGNORECASE : 0)
03005                     | NORM_IGNOREKANATYPE
03006                     | NORM_IGNOREWIDTH),
03007                    (wchar_t *)s1 + d1, l1, (wchar_t *)s2 + d2, l2);
03008 
03009   return r - 2;
03010 }
03011 #endif
03012 
03013 typedef int (*strcoll_proc)(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case);
03014 
03015 int do_locale_comp(const char *who, const mzchar *us1, long ul1, const mzchar *us2, long ul2, int cvt_case)
03016 {
03017   int xl1;
03018   int v, endres, utf16 = 0;
03019   GC_CAN_IGNORE strcoll_proc mz_strcoll = mz_locale_strcoll;
03020 
03021 #if defined(MACOS_UNICODE_SUPPORT) || defined(WINDOWS_UNICODE_SUPPORT)
03022   if (current_locale_name && !*current_locale_name) {
03023     utf16 = 1;
03024     mz_strcoll = mz_native_strcoll;
03025   }
03026 #endif
03027 
03028   if (utf16) {
03029     us1 = (mzchar *)scheme_ucs4_to_utf16(us1, 0, ul1, NULL, 0, &ul1, 1);
03030     us2 = (mzchar *)scheme_ucs4_to_utf16(us2, 0, ul2, NULL, 0, &ul2, 1);
03031     ((short *)us1)[ul1] = 0;
03032     ((short *)us2)[ul2] = 0;
03033   }
03034 
03035   if (ul1 > ul2) {
03036     ul1 = ul2;
03037     endres = 1;
03038   } else {
03039     if (ul2 > ul1)
03040       endres = -1;
03041     else
03042       endres = 0;
03043   }
03044 
03045   /* Walk back through the strings looking for nul characters. If we
03046      find one, compare the part after the null character to update
03047      endres, then continue. Unfortunately, we do too much work if an
03048      earlier part of the string (tested later) determines the result,
03049      but hopefully nul characters are rare. */
03050 
03051   xl1 = 0;
03052   while (ul1--) {
03053     if ((utf16 && (!(((short *)us1)[ul1]) || !(((short *)us2)[ul1])))
03054        || (!utf16 && (!(us1[ul1]) || !(us2[ul1])))) {
03055       if (utf16) {
03056        if (((short *)us1)[ul1])
03057          endres = 1;
03058        else if (((short *)us2)[ul1])
03059          endres = -1;
03060       } else {
03061        if (us1[ul1])
03062          endres = 1;
03063        else if (us2[ul1])
03064          endres = -1;
03065       }
03066 
03067       if (xl1)
03068        v = mz_strcoll((char *)us1, ul1 + 1, xl1, (char *)us2, ul1 + 1, xl1, cvt_case);
03069       else
03070        v = 0;
03071 
03072       if (v)
03073        endres = v;
03074       xl1 = 0;
03075     } else {
03076       xl1++;
03077     }
03078   }
03079 
03080   v = mz_strcoll((char *)us1, 0, xl1, (char *)us2, 0, xl1, cvt_case);
03081   if (v)
03082     endres = v;
03083 
03084   return endres;
03085 }
03086 
03087 
03088 mzchar *do_locale_recase(int to_up, mzchar *in, int delta, int len, long *olen)
03089 {
03090   Scheme_Object *parts = scheme_null;
03091   char *c, buf[MZ_SC_BUF_SIZE], case_buf[MZ_SC_BUF_SIZE];
03092   long clen, used;
03093   int status;
03094 
03095   while (len) {
03096     /* We might have conversion errors... */
03097     c = do_convert((iconv_t)-1, MZ_UCS4_NAME, NULL, 1,
03098                  (char *)in, 4 * delta, 4 * len,
03099                  buf, 0, MZ_SC_BUF_SIZE - 1,
03100                  1 /* grow */, 0, 1 /* terminator size */,
03101                  &used, &clen,
03102                  &status);
03103 
03104     used >>= 2;
03105     delta += used;
03106     len -= used;
03107 
03108     c = locale_recase(to_up, c, 0, clen,
03109                     case_buf, 0, MZ_SC_BUF_SIZE - 1,
03110                     &clen);
03111     if (!c)
03112       clen = 0;
03113 
03114     c = do_convert((iconv_t)-1, NULL, MZ_UCS4_NAME, 2,
03115                  c, 0, clen,
03116                  NULL, 0, 0,
03117                  1 /* grow */, 0, sizeof(mzchar) /* terminator size */,
03118                  &used, &clen,
03119                  &status);
03120 
03121     if (!len && SCHEME_NULLP(parts)) {
03122       *olen = (clen >> 2);
03123       ((mzchar *)c)[*olen] = 0;
03124       return (mzchar *)c;
03125     }
03126 
03127     /* We can get here if there was some conversion error at some
03128        point. We're building up a list of parts. */
03129 
03130     parts = scheme_make_pair(scheme_make_sized_char_string((mzchar *)c, clen >> 2, 0),
03131                           parts);
03132 
03133     if (len) {
03134       /* Conversion error, so skip one char. */
03135       parts = scheme_make_pair(scheme_make_sized_offset_char_string(in, delta, 1, 1),
03136                             parts);
03137       delta += 1;
03138       len -= 1;
03139     }
03140   }
03141 
03142   parts = append_all_strings_backwards(parts);
03143   *olen = SCHEME_CHAR_STRTAG_VAL(parts);
03144 
03145   return SCHEME_CHAR_STR_VAL(parts);
03146 }
03147 
03148 #ifdef MACOS_UNICODE_SUPPORT
03149 mzchar *do_native_recase(int to_up, mzchar *in, int delta, int len, long *olen)
03150      /* The in argument is actually UTF-16. */
03151 {
03152   CFMutableStringRef mstr;
03153   CFStringRef str;
03154   GC_CAN_IGNORE CFRange rng;
03155   char *result;
03156 
03157   str = CFStringCreateWithBytes(NULL, ((unsigned char *)in) XFORM_OK_PLUS (delta * 2), (len * 2), 
03158                             kCFStringEncodingUnicode, FALSE);
03159   mstr = CFStringCreateMutableCopy(NULL, 0, str);
03160   CFRelease(str);
03161 
03162   if (to_up)
03163     CFStringUppercase(mstr, NULL);
03164   else
03165     CFStringLowercase(mstr, NULL);
03166 
03167   len = CFStringGetLength(mstr);
03168   *olen = len;
03169 
03170   result = (char *)scheme_malloc_atomic((len + 1) * 2);
03171 
03172   rng = CFRangeMake(0, len);
03173   CFStringGetCharacters(mstr, rng, (UniChar *)result);
03174   CFRelease(mstr);
03175 
03176   ((UniChar *)result)[len] = 0;
03177 
03178   return (mzchar *)result;
03179 }
03180 #endif
03181 
03182 #ifdef WINDOWS_UNICODE_SUPPORT
03183 mzchar *do_native_recase(int to_up, mzchar *in, int delta, int len, long *olen)
03184      /* The in argument is actually UTF-16. */
03185 {
03186   char *result;
03187 
03188   result = (char *)scheme_malloc_atomic((len + 1) * 2);
03189   memcpy(result, ((char *)in) + (2 * delta), len * 2);
03190   ((wchar_t*)result)[len] = 0;
03191 
03192   if (to_up)
03193     CharUpperBuffW((wchar_t *)result, len);
03194   else {
03195     int i;
03196     /* CharLowerBuff doesn't work with unicows.dll -- strange.
03197        So we use CharLower, instead. */
03198     for (i = 0; i < len; i++) {
03199       CharLowerW(((wchar_t *)result) + i);
03200     }
03201   }
03202 
03203   *olen = len;
03204   return (mzchar *)result;
03205 }
03206 #endif
03207 
03208 typedef mzchar *(*recase_proc)(int to_up, mzchar *in, int delta, int len, long *olen);
03209 
03210 static Scheme_Object *mz_recase(const char *who, int to_up, mzchar *us, long ulen)
03211 {
03212   long ulen1;
03213   int utf16 = 0, i, delta = 0;
03214   mzchar *us1;
03215   recase_proc mz_do_recase = do_locale_recase;
03216   Scheme_Object *s, *parts = scheme_null;
03217 
03218   reset_locale();
03219 
03220 #if defined(MACOS_UNICODE_SUPPORT) || defined(WINDOWS_UNICODE_SUPPORT)
03221   if (current_locale_name && !*current_locale_name) {
03222     utf16 = 1;
03223     mz_do_recase = do_native_recase;
03224   }
03225 #endif
03226 
03227   if (utf16) {
03228     us = (mzchar *)scheme_ucs4_to_utf16(us, 0, ulen, NULL, 0, &ulen, 1);
03229     ((short *)us)[ulen] = 0;
03230   }
03231 
03232   /* If there are nulls in the string, then we have to make multiple
03233      calls to mz_do_recase */
03234   i = 0;
03235   while (1) {
03236     for (; i < ulen; i++) {
03237       if (utf16) {
03238        if (!((short *)us)[i])
03239          break;
03240       } else if (!us[i])
03241        break;
03242     }
03243 
03244     us1 = mz_do_recase(to_up, us, delta, i - delta, &ulen1);
03245 
03246     if (utf16) {
03247       us1 = scheme_utf16_to_ucs4((unsigned short *)us1, 0, ulen1, NULL, 0, &ulen1, 1);
03248       us1[ulen1] = 0;
03249     }
03250 
03251     s = scheme_make_sized_char_string((mzchar *)us1, ulen1, 0);
03252 
03253     if (SCHEME_NULLP(parts) && (i == ulen))
03254       return s;
03255 
03256     parts = scheme_make_pair(s, parts);
03257 
03258     if (i == ulen)
03259       break;
03260 
03261     /* upcasing and encoding a nul char is easy: */
03262     s = scheme_make_sized_char_string((mzchar *)"\0\0\0\0", 1, 0);
03263     parts = scheme_make_pair(s, parts);
03264     i++;
03265     delta = i;
03266 
03267     if (i == ulen)
03268       break;
03269   }
03270 
03271   return append_all_strings_backwards(parts);
03272 }
03273 
03274 #endif
03275 
03276 static Scheme_Object *
03277 unicode_recase(const char *who, int to_up, int argc, Scheme_Object *argv[])
03278 {
03279   long len;
03280   mzchar *chars;
03281 
03282   if (!SCHEME_CHAR_STRINGP(argv[0]))
03283     scheme_wrong_type(who, "string", 0, argc, argv);
03284 
03285   chars = SCHEME_CHAR_STR_VAL(argv[0]);
03286   len = SCHEME_CHAR_STRTAG_VAL(argv[0]);
03287 
03288   return mz_recase(who, to_up, chars, len);
03289 }
03290 
03291 static Scheme_Object *
03292 string_locale_upcase(int argc, Scheme_Object *argv[])
03293 {
03294   return unicode_recase("string-locale-upcase", 1, argc, argv);
03295 }
03296 
03297 static Scheme_Object *
03298 string_locale_downcase(int argc, Scheme_Object *argv[])
03299 {
03300   return unicode_recase("string-locale-downcase", 0, argc, argv);
03301 }
03302 
03303 static void reset_locale(void)
03304 {
03305   Scheme_Object *v;
03306   const mzchar *name;
03307 
03308   v = scheme_get_param(scheme_current_config(), MZCONFIG_LOCALE);
03309   locale_on = SCHEME_TRUEP(v);
03310 
03311   if (locale_on) {
03312     name = SCHEME_CHAR_STR_VAL(v);
03313 #ifndef DONT_USE_LOCALE
03314     if ((current_locale_name != name)
03315        && mz_char_strcmp("result-locale",
03316                        current_locale_name, scheme_char_strlen(current_locale_name),
03317                        name, SCHEME_CHAR_STRLEN_VAL(v),
03318                        0, 1)) {
03319       /* We only need CTYPE and COLLATE; two calls seem to be much
03320         faster than one call with ALL */
03321       char *n, buf[32];
03322 
03323       n = scheme_utf8_encode_to_buffer(name, SCHEME_CHAR_STRLEN_VAL(v), buf, 32);
03324 
03325       if (!setlocale(LC_CTYPE, n))
03326        setlocale(LC_CTYPE, "C");
03327       if (!setlocale(LC_COLLATE, n))
03328        setlocale(LC_COLLATE, "C");
03329     }
03330 #endif
03331     current_locale_name = name;
03332   }
03333 }
03334 
03335 static int find_special_casing(int ch)
03336 {
03337   /* Binary search */
03338   int i, lo, hi, j;
03339 
03340   i = NUM_SPECIAL_CASINGS >> 1;
03341   lo = i;
03342   hi = NUM_SPECIAL_CASINGS - i - 1;
03343 
03344   while (1) {
03345     if (uchar_special_casings[i * 10] == ch)
03346       return i * 10;
03347     if (uchar_special_casings[i * 10] > ch) {
03348       j = i - lo;
03349       i = j + (lo >> 1);
03350       hi = lo - (i - j) - 1;
03351       lo = i - j;
03352     } else {
03353       j = i + 1;
03354       i = j + (hi >> 1);
03355       lo = i - j;
03356       hi = hi - (i - j) - 1;
03357     }
03358   }
03359 }
03360 
03361 static int is_final_sigma(int mode, mzchar *s, int d, int i, int len)
03362 {
03363   int j;
03364 
03365   if (mode == 3)
03366     return 1;
03367   
03368   /* find a cased char before, skipping case-ignorable: */
03369   for (j = i - 1; j >= d; j--) {
03370     if (!scheme_iscaseignorable(s[j])) {
03371       if (scheme_iscased(s[j]))
03372        break;
03373       else
03374        return 0;
03375     }
03376   }
03377   if (j < d)
03378     return 0;
03379 
03380   /* next non-case-ignorable must not be cased: */
03381   for (j = i + 1; j < d + len; j++) {
03382     if (!scheme_iscaseignorable(s[j])) {
03383       return !scheme_iscased(s[j]);
03384     }
03385   }
03386 
03387   return 1;
03388 }
03389 
03390 mzchar *scheme_string_recase(mzchar *s, int d, int len, int mode, int inplace, int *_len)
03391 {
03392   mzchar *t;
03393   int i, extra = 0, pos, special = 0, td, prev_was_cased = 0, xmode = mode;
03394 
03395   for (i = 0; i < len; i++) {
03396     if (scheme_isspecialcasing(s[d+i])) {
03397       pos = find_special_casing(s[d+i]);
03398       if (!uchar_special_casings[pos + 9] || is_final_sigma(xmode, s, d, i, len)) {
03399        special = 1;
03400        extra += (uchar_special_casings[pos + 1 + (xmode << 1)] - 1);
03401       }
03402     }
03403     if (mode == 2) {
03404       if (!scheme_iscaseignorable(s[d+i]))
03405        prev_was_cased = scheme_iscased(s[d+i]);
03406       xmode = (prev_was_cased ? 0 : 2);
03407     }
03408   }
03409 
03410   if (_len)
03411     *_len = len + extra;
03412 
03413   if (!extra && inplace) {
03414     t = s;
03415     td = d;
03416   } else {
03417     t = scheme_malloc_atomic(sizeof(mzchar) * (len + extra + 1));
03418     td = 0;
03419   }
03420 
03421   if (!special) {
03422     if (mode == 0) {
03423       for (i = 0; i < len; i++) {
03424        t[i+td] = scheme_tolower(s[i+d]);
03425       }
03426     } else if (mode == 1) {
03427       for (i = 0; i < len; i++) {
03428        t[i+td] = scheme_toupper(s[i+d]);
03429       }
03430     } else if (mode == 2) {
03431       prev_was_cased = 0;
03432       for (i = 0; i < len; i++) {
03433        if (!prev_was_cased)
03434          t[i+td] = scheme_totitle(s[i+d]);
03435        else
03436          t[i+td] = scheme_tolower(s[i+d]);
03437        if (!scheme_iscaseignorable(s[i+d]))
03438          prev_was_cased = scheme_iscased(s[i+d]);
03439       }
03440     } else /* if (mode == 3) */ {
03441       for (i = 0; i < len; i++) {
03442        t[i+td] = scheme_tofold(s[i+d]);
03443       }
03444     }
03445   } else {
03446     int j = 0, c;
03447     prev_was_cased = 0;
03448     for (i = 0; i < len; i++) {
03449       if (mode == 0) {
03450        t[j+td] = scheme_tolower(s[i+d]);
03451       } else if (mode == 1) {
03452        t[j+td] = scheme_toupper(s[i+d]);
03453       } else if (mode == 2) {
03454        if (!prev_was_cased) {
03455          xmode = 2;
03456          t[j+td] = scheme_totitle(s[i+d]);
03457        } else {
03458          xmode = 0;
03459          t[j+td] = scheme_tolower(s[i+d]);
03460        }
03461        if (!scheme_iscaseignorable(s[i+d]))
03462          prev_was_cased = scheme_iscased(s[i+d]);
03463       } else /* if (mode == 3) */ {
03464        t[j+td] = scheme_tofold(s[i+d]);
03465       }
03466 
03467       if (scheme_isspecialcasing(s[i+d])) {
03468        pos = find_special_casing(s[i+d]);
03469        if (!uchar_special_casings[pos + 9] || is_final_sigma(xmode, s, d, i, len)) {
03470          c = uchar_special_casings[pos + 1 + (xmode << 1)];
03471          pos = uchar_special_casings[pos + 2 + (xmode << 1)];
03472          while (c--) {
03473            t[(j++)+td] = uchar_special_casing_data[pos++];
03474          }
03475        } else
03476          j++;
03477       } else
03478        j++;
03479     }
03480   }
03481   t[len+extra+td] = 0;
03482 
03483   return t;
03484 }
03485 
03486 static Scheme_Object *string_recase (const char *name, int argc, Scheme_Object *argv[], int mode)
03487 {
03488   mzchar *s;
03489   int len;
03490 
03491   if (!SCHEME_CHAR_STRINGP(argv[0]))
03492     scheme_wrong_type(name, "string", 0, argc, argv);
03493   
03494   s = SCHEME_CHAR_STR_VAL(argv[0]);
03495   len = SCHEME_CHAR_STRLEN_VAL(argv[0]);
03496 
03497   s = scheme_string_recase(s, 0, len, mode, 0, &len);
03498 
03499   return scheme_make_sized_char_string(s, len, 0);
03500 }
03501 
03502 static Scheme_Object *string_upcase (int argc, Scheme_Object *argv[])
03503 {
03504   return string_recase("string-upcase", argc, argv, 1);
03505 }
03506 
03507 static Scheme_Object *string_downcase (int argc, Scheme_Object *argv[])
03508 {
03509   return string_recase("string-downcase", argc, argv, 0);
03510 }
03511 
03512 static Scheme_Object *string_titlecase (int argc, Scheme_Object *argv[])
03513 {
03514   return string_recase("string-titlecase", argc, argv, 2);
03515 }
03516 
03517 static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[])
03518 {
03519   return string_recase("string-foldcase", argc, argv, 3);
03520 }
03521 
03522 /**********************************************************************/
03523 /*                          normalization                             */
03524 /**********************************************************************/
03525 
03526 #define MZ_JAMO_INITIAL_CONSONANT_START  0x1100
03527 #define MZ_JAMO_INITIAL_CONSONANT_COUNT  19
03528 #define MZ_JAMO_INITIAL_CONSONANT_END    (MZ_JAMO_INITIAL_CONSONANT_START + MZ_JAMO_INITIAL_CONSONANT_COUNT - 1)
03529 
03530 #define MZ_JAMO_VOWEL_START              0x1161
03531 #define MZ_JAMO_VOWEL_COUNT              21
03532 #define MZ_JAMO_VOWEL_END                (MZ_JAMO_VOWEL_START + MZ_JAMO_VOWEL_COUNT - 1)
03533 
03534 /* First in this range is not actually a consonant, but a placeholder for "no consonant" */
03535 #define MZ_JAMO_TRAILING_CONSONANT_START 0x11A7
03536 #define MZ_JAMO_TRAILING_CONSONANT_COUNT 28
03537 #define MZ_JAMO_TRAILING_CONSONANT_END   (MZ_JAMO_TRAILING_CONSONANT_START + MZ_JAMO_TRAILING_CONSONANT_COUNT - 1)
03538 
03539 #define MZ_JAMO_SYLLABLE_START           0xAC00
03540 #define MZ_JAMO_SYLLABLE_END             (MZ_JAMO_SYLLABLE_START + 11171)
03541 
03542 static mzchar get_composition(mzchar a, mzchar b)
03543 {
03544   unsigned long key = (a << 16) | b;
03545   int pos = (COMPOSE_TABLE_SIZE >> 1), new_pos;
03546   int below_len = pos;
03547   int above_len = (COMPOSE_TABLE_SIZE - pos - 1);
03548   
03549   if (a > 0xFFFF) return 0;
03550 
03551   /* Binary search: */
03552   while (key != utable_compose_pairs[pos]) {
03553     if (key > utable_compose_pairs[pos]) {
03554       if (!above_len)
03555        return 0;
03556       new_pos = pos + (above_len >> 1) + 1;
03557       below_len = (new_pos - pos - 1);
03558       above_len = (above_len - below_len - 1);
03559       pos = new_pos;
03560     } else if (key < utable_compose_pairs[pos]) {
03561       if (!below_len)
03562        return 0;
03563       new_pos = pos - ((below_len >> 1) + 1);
03564       above_len = (pos - new_pos - 1);
03565       below_len = (below_len - above_len - 1);
03566       pos = new_pos;
03567     }
03568   }
03569 
03570   return utable_compose_result[pos];
03571 }
03572 
03573 mzchar get_canon_decomposition(mzchar key, mzchar *b)
03574 {
03575   int pos = (DECOMPOSE_TABLE_SIZE >> 1), new_pos;
03576   int below_len = pos;
03577   int above_len = (DECOMPOSE_TABLE_SIZE - pos - 1);
03578 
03579   /* Binary search: */
03580   while (key != utable_decomp_keys[pos]) {
03581     if (key > utable_decomp_keys[pos]) {
03582       if (!above_len)
03583        return 0;
03584       new_pos = pos + (above_len >> 1) + 1;
03585       below_len = (new_pos - pos - 1);
03586       above_len = (above_len - below_len - 1);
03587       pos = new_pos;
03588     } else if (key < utable_decomp_keys[pos]) {
03589       if (!below_len)
03590        return 0;
03591       new_pos = pos - ((below_len >> 1) + 1);
03592       above_len = (pos - new_pos - 1);
03593       below_len = (below_len - above_len - 1);
03594       pos = new_pos;
03595     }
03596   }
03597 
03598   pos = utable_decomp_indices[pos];
03599   if (pos < 0) {
03600     pos = -(pos + 1);
03601     pos <<= 1;
03602     *b = utable_compose_long_pairs[pos + 1];
03603     return utable_compose_long_pairs[pos];
03604   } else {
03605     key = utable_compose_pairs[pos];
03606     *b = (key & 0xFFFF);
03607     return (key >> 16);
03608   }
03609 }
03610 
03611 int get_kompat_decomposition(mzchar key, unsigned short **chars)
03612 {
03613   int pos = (KOMPAT_DECOMPOSE_TABLE_SIZE >> 1), new_pos;
03614   int below_len = pos;
03615   int above_len = (KOMPAT_DECOMPOSE_TABLE_SIZE - pos - 1);
03616 
03617   /* Binary search: */
03618   while (key != utable_kompat_decomp_keys[pos]) {
03619     if (key > utable_kompat_decomp_keys[pos]) {
03620       if (!above_len)
03621        return 0;
03622       new_pos = pos + (above_len >> 1) + 1;
03623       below_len = (new_pos - pos - 1);
03624       above_len = (above_len - below_len - 1);
03625       pos = new_pos;
03626     } else if (key < utable_kompat_decomp_keys[pos]) {
03627       if (!below_len)
03628        return 0;
03629       new_pos = pos - ((below_len >> 1) + 1);
03630       above_len = (pos - new_pos - 1);
03631       below_len = (below_len - above_len - 1);
03632       pos = new_pos;
03633     }
03634   }
03635 
03636   *chars = utable_kompat_decomp_strs XFORM_OK_PLUS utable_kompat_decomp_indices[pos];
03637   return utable_kompat_decomp_lens[pos];
03638 }
03639 
03640 static Scheme_Object *normalize_c(Scheme_Object *o)
03641 /* Assumes then given string is in normal form D */
03642 {
03643   mzchar *s, *s2, tmp, last_c0 = 0;
03644   int len, i, j = 0, last_c0_pos = 0, last_cc = 0;
03645 
03646   s = SCHEME_CHAR_STR_VAL(o);
03647   len = SCHEME_CHAR_STRLEN_VAL(o);
03648 
03649   s2 = (mzchar *)scheme_malloc_atomic((len + 1) * sizeof(mzchar));
03650   memcpy(s2, s, len * sizeof(mzchar));
03651   
03652   for (i = 0; i < len; i++) {
03653     if ((i + 1 < len)
03654        && (s2[i] >= MZ_JAMO_INITIAL_CONSONANT_START)
03655        && (s2[i] <= MZ_JAMO_INITIAL_CONSONANT_END)
03656        && (s2[i+1] >= MZ_JAMO_VOWEL_START)
03657        && (s2[i+1] <= MZ_JAMO_VOWEL_END)) {
03658       /* Need Hangul composition */
03659       if ((i + 2 < len)
03660          && (s2[i+2] > MZ_JAMO_TRAILING_CONSONANT_START)
03661          && (s2[i+2] <= MZ_JAMO_TRAILING_CONSONANT_END)) {
03662        /* 3-char composition */
03663        tmp = (MZ_JAMO_SYLLABLE_START
03664               + ((s2[i] - MZ_JAMO_INITIAL_CONSONANT_START) 
03665                 * MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)
03666               + ((s2[i+1] - MZ_JAMO_VOWEL_START)
03667                 * MZ_JAMO_TRAILING_CONSONANT_COUNT)
03668               + (s2[i+2] - MZ_JAMO_TRAILING_CONSONANT_START));
03669        i += 2;
03670       } else {
03671        /* 2-char composition */
03672        tmp = (MZ_JAMO_SYLLABLE_START
03673               + ((s2[i] - MZ_JAMO_INITIAL_CONSONANT_START) 
03674                 * MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)
03675               + ((s2[i+1] - MZ_JAMO_VOWEL_START)
03676                 * MZ_JAMO_TRAILING_CONSONANT_COUNT));
03677        i++;
03678       }
03679       last_c0 = tmp;
03680       last_c0_pos = j;
03681       last_cc = 0;
03682       s2[j++] = tmp;
03683     } else {
03684       int cc;
03685       
03686       cc = scheme_combining_class(s2[i]);
03687       if (last_c0 && (cc > last_cc))
03688        tmp = get_composition(last_c0, s2[i]);
03689       else
03690        tmp = 0;
03691 
03692       if (tmp) {
03693        /* Need to compose */
03694        s2[last_c0_pos] = tmp;
03695        last_c0 = tmp;
03696       } else if (!cc) {
03697        /* Reset last_c0... */
03698        tmp = s2[i];
03699        if (scheme_needs_maybe_compose(tmp)) {
03700          last_c0 = tmp;
03701          last_c0_pos = j;
03702        } else {
03703          last_c0 = 0;
03704        }
03705        last_cc = -1;
03706        s2[j++] = tmp;
03707       } else {
03708        s2[j++] = s2[i];
03709        last_cc = cc;
03710       }
03711     }
03712   }
03713 
03714   s2[j] = 0;
03715   if (len - j > 16) {
03716     s2 = (mzchar *)scheme_malloc_atomic((j + 1) * sizeof(mzchar));
03717     memcpy(s2, s, (j + 1) * sizeof(mzchar));
03718     s2 = s;
03719   }
03720 
03721   return scheme_make_sized_char_string(s2, j, 0);
03722 }
03723 
03724 static Scheme_Object *normalize_d(Scheme_Object *o, int kompat)
03725 {
03726   mzchar *s, tmp, *s2;
03727   int len, i, delta, j, swapped;
03728 
03729   s = SCHEME_CHAR_STR_VAL(o);
03730   len = SCHEME_CHAR_STRLEN_VAL(o);
03731 
03732   /* Run through string list to predict expansion: */
03733   delta = 0;
03734   for (i = 0; i < len; i++) {
03735     if (scheme_needs_decompose(s[i])) {
03736       int klen;
03737       mzchar snd;
03738       GC_CAN_IGNORE unsigned short *start;
03739 
03740       tmp = s[i];
03741       while (scheme_needs_decompose(tmp)) {
03742        if (kompat)
03743          klen = get_kompat_decomposition(tmp, &start);
03744        else
03745          klen = 0;
03746        if (klen) {
03747          delta += (klen - 1);
03748          break;
03749        } else {
03750          tmp = get_canon_decomposition(tmp, &snd);
03751          if (tmp) {
03752            if (snd) {
03753              delta++;
03754              if (kompat) {
03755               klen = get_kompat_decomposition(snd, &start);
03756               if (klen)
03757                 delta += (klen - 1);
03758              }
03759            }
03760          } else
03761            break;
03762        }
03763       }
03764     } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
03765               && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
03766       tmp = s[i];
03767       tmp -= MZ_JAMO_SYLLABLE_START;
03768       if (tmp % MZ_JAMO_TRAILING_CONSONANT_COUNT)
03769        delta += 2;
03770       else
03771        delta += 1;
03772     }
03773   }
03774 
03775   s2 = (mzchar *)scheme_malloc_atomic((len + delta + 1) * sizeof(mzchar));
03776 
03777   j = 0;
03778   for (i = 0; i < len; i++) {
03779     if (scheme_needs_decompose(s[i])) {
03780       mzchar snd, tmp2;
03781       int snds = 0, klen = 0, k;
03782       GC_CAN_IGNORE unsigned short*start;
03783 
03784       tmp = s[i];
03785       while (scheme_needs_decompose(tmp)) {
03786        if (kompat)
03787          klen = get_kompat_decomposition(tmp, &start);
03788        else
03789          klen = 0;
03790        if (klen) {
03791          for (k = 0; k < klen; k++) {
03792            s2[j++] = start[k];
03793          }
03794          break;
03795        } else {
03796          tmp2 = get_canon_decomposition(tmp, &snd);
03797          if (tmp2) {
03798            tmp = tmp2;
03799            if (snd) {
03800              if (kompat)
03801               klen = get_kompat_decomposition(snd, &start);
03802              else
03803               klen = 0;
03804              if (klen) {
03805               snds += klen;
03806               for (k = 0; k < klen; k++) {
03807                 s2[len + delta - snds + k] = start[k];
03808               }
03809               klen = 0;
03810              } else {
03811               snds++;
03812               s2[len + delta - snds] = snd;
03813              }
03814            }
03815          } else 
03816            break;
03817        }
03818       }
03819       if (!klen)
03820        s2[j++] = tmp;
03821       memcpy(s2 + j, s2 + len + delta - snds, snds * sizeof(mzchar));
03822       j += snds;
03823     } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
03824               && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
03825       int l, v, t;
03826       tmp = s[i];
03827       tmp -= MZ_JAMO_SYLLABLE_START;
03828       l = tmp / (MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT);
03829       v = (tmp % (MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)) / MZ_JAMO_TRAILING_CONSONANT_COUNT;
03830       t = tmp % MZ_JAMO_TRAILING_CONSONANT_COUNT;
03831       s2[j++] = MZ_JAMO_INITIAL_CONSONANT_START + l;
03832       s2[j++] = MZ_JAMO_VOWEL_START + v;
03833       if (t) {
03834        s2[j++] = MZ_JAMO_TRAILING_CONSONANT_START + t;
03835       }
03836     } else {
03837       s2[j++] = s[i];
03838     }
03839   }
03840   s2[j] = 0;
03841   len += delta;
03842 
03843   /* Reorder pass: */
03844   do {
03845     swapped = 0;
03846     for (i = 0; i < len; i++) {
03847       if ((i + 1 < len)
03848          && scheme_combining_class(s2[i])
03849          && scheme_combining_class(s2[i+1])
03850          && (scheme_combining_class(s2[i+1]) < scheme_combining_class(s2[i]))) {
03851        /* Reorder and try again: */
03852        tmp = s2[i + 1];
03853        s2[i + 1] = s2[i];
03854        s2[i] = tmp;
03855        i--;
03856        swapped = 1;
03857       }
03858     }
03859   } while (swapped);
03860 
03861   return scheme_make_sized_char_string(s2, len, 0);
03862 }
03863 
03864 static Scheme_Object *do_string_normalize_c (const char *who, int argc, Scheme_Object *argv[], int kompat)
03865 {
03866   Scheme_Object *o;
03867   mzchar *s, last_c0 = 0, snd;
03868   int len, i, last_cc = 0;
03869 
03870   o = argv[0];
03871   if (!SCHEME_CHAR_STRINGP(o))
03872     scheme_wrong_type(who, "string", 0, argc, argv);
03873 
03874   s = SCHEME_CHAR_STR_VAL(o);
03875   len = SCHEME_CHAR_STRLEN_VAL(o);
03876 
03877   for (i = 0; i < len; i++) {
03878     if (scheme_needs_decompose(s[i])
03879        && (kompat || get_canon_decomposition(s[i], &snd))) {
03880       /* Decomposition may expose a different composition */
03881       break;
03882     } else if ((i + 1 < len)
03883        && scheme_combining_class(s[i])
03884        && scheme_combining_class(s[i+1])
03885        && (scheme_combining_class(s[i+1]) < scheme_combining_class(s[i]))) {
03886       /* Need to reorder */
03887       break;
03888     } else if ((s[i] >= MZ_JAMO_INITIAL_CONSONANT_START)
03889               && (s[i] <= MZ_JAMO_INITIAL_CONSONANT_END)
03890               && (s[i+1] >= MZ_JAMO_VOWEL_START)
03891               && (s[i+1] <= MZ_JAMO_VOWEL_END)) {
03892       /* Need Hangul composition */
03893       break;
03894     } else if (last_c0 
03895               && get_composition(last_c0, s[i])
03896               && (scheme_combining_class(s[i]) > last_cc)) {
03897       /* Need to compose */
03898       break;
03899     } else {
03900       int cc;
03901 
03902       cc = scheme_combining_class(s[i]);
03903 
03904       if (!cc) {
03905        if (scheme_needs_maybe_compose(s[i]))
03906          last_c0 = s[i];
03907        else
03908          last_c0 = 0;
03909        last_cc = -1;
03910       } else
03911        last_cc = cc;
03912     }
03913   }
03914 
03915   if (i < len) {
03916     o = normalize_c(normalize_d(o, kompat));
03917   }
03918 
03919   return o;
03920 }
03921 
03922 static Scheme_Object *string_normalize_c (int argc, Scheme_Object *argv[])
03923 {
03924   return do_string_normalize_c("string-normalize-nfc", argc, argv, 0);
03925 }
03926 
03927 static Scheme_Object *string_normalize_kc (int argc, Scheme_Object *argv[])
03928 {
03929   return do_string_normalize_c("string-normalize-nfkc", argc, argv, 1);
03930 }
03931 
03932 static Scheme_Object *do_string_normalize_d (const char *who, int argc, Scheme_Object *argv[], int kompat)
03933 {
03934   Scheme_Object *o;
03935   mzchar *s;
03936   int len, i;
03937 
03938   o = argv[0];
03939   if (!SCHEME_CHAR_STRINGP(o))
03940     scheme_wrong_type(who, "string", 0, argc, argv);
03941 
03942   s = SCHEME_CHAR_STR_VAL(o);
03943   len = SCHEME_CHAR_STRLEN_VAL(o);
03944 
03945   for (i = len; i--; ) {
03946     if (scheme_needs_decompose(s[i])) {
03947       /* Need to decompose */
03948       mzchar snd;
03949       if (kompat || get_canon_decomposition(s[i], &snd))
03950        break;
03951     } else if ((i + 1 < len)
03952               && scheme_combining_class(s[i])
03953               && scheme_combining_class(s[i+1])
03954               && (scheme_combining_class(s[i+1]) < scheme_combining_class(s[i]))) {
03955       /* Need to reorder */
03956       break;
03957     } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
03958               && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
03959       /* Need Hangul decomposition */
03960       break;
03961     }
03962   }
03963 
03964   if (i >= 0) {
03965     o = normalize_d(o, kompat);
03966   }
03967 
03968   return o;
03969 }
03970 
03971 static Scheme_Object *string_normalize_d (int argc, Scheme_Object *argv[])
03972 {
03973   return do_string_normalize_d("string-normalize-nfd", argc, argv, 0);
03974 }
03975 
03976 static Scheme_Object *string_normalize_kd (int argc, Scheme_Object *argv[])
03977 {
03978   return do_string_normalize_d("string-normalize-nfkd", argc, argv, 1);
03979 }
03980 
03981 /**********************************************************************/
03982 /*                            strcmps                                 */
03983 /**********************************************************************/
03984 
03985 int scheme_char_strlen(const mzchar *s)
03986 {
03987   int i;
03988   for (i = 0; s[i]; i++) {
03989   }
03990   return i;
03991 }
03992 
03993 static int mz_char_strcmp(const char *who, const mzchar *str1, int l1, const mzchar *str2, int l2, 
03994                        int use_locale, int size_shortcut)
03995 {
03996   int endres;
03997 
03998   if (size_shortcut && (l1 != l2))
03999     return 1;
04000 
04001 #ifndef DONT_USE_LOCALE
04002   if (use_locale) {
04003     reset_locale();
04004     if (locale_on) {
04005       return do_locale_comp(who, str1, l1, str2, l2, 0);
04006     }
04007   }
04008 #endif
04009 
04010   if (l1 > l2) {
04011     l1 = l2;
04012     endres = 1;
04013   } else {
04014     if (l2 > l1)
04015       endres = -1;
04016     else
04017       endres = 0;
04018   }
04019 
04020   while (l1--) {
04021     unsigned int a, b;
04022 
04023     a = *(str1++);
04024     b = *(str2++);
04025 
04026     a = a - b;
04027     if (a)
04028       return a;
04029   }
04030 
04031   return endres;
04032 }
04033 
04034 static int mz_char_strcmp_ci(const char *who, const mzchar *str1, int l1, const mzchar *str2, int l2, 
04035                           int use_locale, int size_shortcut)
04036 {
04037   int p1, p2, sp1, sp2, a, b;
04038   mzchar spec1[SPECIAL_CASE_FOLD_MAX], spec2[SPECIAL_CASE_FOLD_MAX];
04039 
04040   if (size_shortcut && (l1 != l2))
04041     return 1;
04042 
04043 #ifndef DONT_USE_LOCALE
04044   if (use_locale) {
04045     reset_locale();
04046     if (locale_on) {
04047       return do_locale_comp(who, str1, l1, str2, l2, 1);
04048     }
04049   }
04050 #endif
04051 
04052   p1 = sp1 = 0;
04053   p2 = sp2 = 0;
04054 
04055   while (((p1 < l1) || sp1) && ((p2 < l2) || sp2)) {
04056     if (sp1) {
04057       a = spec1[--sp1];
04058     } else {
04059       a = str1[p1];
04060       if (scheme_isspecialcasing(a)) {
04061        int pos, i;
04062        pos = find_special_casing(a);
04063        sp1 = uchar_special_casings[pos + 7];
04064        pos = uchar_special_casings[pos + 8];
04065        for (i = sp1; i--; pos++) {
04066          spec1[i] = uchar_special_casing_data[pos];
04067        }
04068        a = spec1[--sp1];
04069       } else {
04070        a = scheme_tofold(a);
04071       }
04072       p1++;
04073     }
04074 
04075     if (sp2) {
04076       b = spec2[--sp2];
04077     } else {
04078       b = str2[p2];
04079       if (scheme_isspecialcasing(b)) {
04080        int pos, i;
04081        pos = find_special_casing(b);
04082        sp2 = uchar_special_casings[pos + 7];
04083        pos = uchar_special_casings[pos + 8];
04084        for (i = sp2; i--; pos++) {
04085          spec2[i] = uchar_special_casing_data[pos];
04086        }
04087        b = spec2[--sp2];
04088       } else {
04089        b = scheme_tofold(b);
04090       }
04091       p2++;
04092     }
04093 
04094     a = a - b;
04095     if (a)
04096       return a;
04097   }
04098 
04099   return ((p1 < l1) || sp1) - ((p2 < l2) || sp2);
04100 }
04101 
04102 static int mz_strcmp(const char *who, unsigned char *str1, int l1, unsigned char *str2, int l2)
04103 {
04104   int endres;
04105 
04106   if (l1 > l2) {
04107     l1 = l2;
04108     endres = 1;
04109   } else {
04110     if (l2 > l1)
04111       endres = -1;
04112     else
04113       endres = 0;
04114   }
04115 
04116   while (l1--) {
04117     unsigned int a, b;
04118 
04119     a = *(str1++);
04120     b = *(str2++);
04121 
04122     a = a - b;
04123     if (a)
04124       return a;
04125   }
04126 
04127   return endres;
04128 }
04129 
04130 /**********************************************************************/
04131 /*                  byte string conversion                            */
04132 /**********************************************************************/
04133 
04134 static void close_converter(Scheme_Object *o, void *data)
04135 {
04136   Scheme_Converter *c = (Scheme_Converter *)o;
04137 
04138   if (!c->closed) {
04139     c->closed = 1;
04140     if (c->kind == mzICONV_KIND) {
04141       iconv_close(c->cd);
04142       c->cd = (iconv_t)-1;
04143     }
04144     if (c->mref) {
04145       scheme_remove_managed(c->mref, (Scheme_Object *)c);
04146       c->mref = NULL;
04147     }
04148   }
04149 }
04150 
04151 Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
04152 {
04153   Scheme_Converter *c;
04154   iconv_t cd;
04155   int kind;
04156   int permissive;
04157   int need_regis = 1;
04158   Scheme_Custodian_Reference *mref;
04159 
04160   if (!iconv_ready) init_iconv();
04161 
04162   if (!*to_e || !*from_e)
04163     reset_locale();
04164 
04165   if ((!strcmp(from_e, "UTF-8")
04166        || !strcmp(from_e, "UTF-8-permissive")
04167        || (!*from_e && mzLOCALE_IS_UTF_8(current_locale_name)))
04168       && (!strcmp(to_e, "UTF-8")
04169          || (!*to_e && mzLOCALE_IS_UTF_8(current_locale_name)))) {
04170     /* Use the built-in UTF-8<->UTF-8 converter: */
04171     kind = mzUTF8_KIND;
04172     if (!strcmp(from_e, "UTF-8-permissive"))
04173       permissive = 0xFFFD;
04174     else
04175       permissive = 0;
04176     cd = (iconv_t)-1;
04177     need_regis = (*to_e && *from_e);
04178   } else if ((!strcmp(from_e, "platform-UTF-8")
04179              || !strcmp(from_e, "platform-UTF-8-permissive"))
04180             && !strcmp(to_e, "platform-UTF-16")) {
04181     kind = mzUTF8_TO_UTF16_KIND;
04182     if (!strcmp(from_e, "platform-UTF-8-permissive"))
04183       permissive = 0xFFFD;
04184     else
04185       permissive = 0;
04186     cd = (iconv_t)-1;
04187     need_regis = 0;
04188   } else if (!strcmp(from_e, "platform-UTF-16")
04189             && !strcmp(to_e, "platform-UTF-8")) {
04190     kind = mzUTF16_TO_UTF8_KIND;
04191     permissive = 0;
04192     cd = (iconv_t)-1;
04193     need_regis = 0;
04194   } else {
04195     if (!iconv_ready) init_iconv();
04196 
04197     if (!mzCHK_PROC(iconv_open))
04198       return scheme_false;
04199 
04200     if (!*from_e || !*to_e)
04201       reset_locale();
04202 
04203     if (!*from_e)
04204       from_e = mz_iconv_nl_langinfo();
04205     if (!*to_e)
04206       to_e = mz_iconv_nl_langinfo();
04207     cd = iconv_open(to_e, from_e);
04208 
04209     if (cd == (iconv_t)-1)
04210       return scheme_false;
04211 
04212     kind = mzICONV_KIND;
04213     permissive = 0;
04214   }
04215 
04216   c = MALLOC_ONE_TAGGED(Scheme_Converter);
04217   c->so.type = scheme_string_converter_type;
04218   c->closed = 0;
04219   c->kind = kind;
04220   c->permissive = permissive;
04221   c->cd = cd;
04222   if (!need_regis)
04223     mref = NULL;
04224   else
04225     mref = scheme_add_managed(NULL,
04226                            (Scheme_Object *)c,
04227                            close_converter,
04228                            NULL, 1);
04229   c->mref = mref;
04230 
04231   return (Scheme_Object *)c;
04232 }
04233 
04234 static Scheme_Object *byte_string_open_converter(int argc, Scheme_Object **argv)
04235 {
04236   Scheme_Object *s1, *s2;
04237   char *from_e, *to_e;
04238   
04239   if (!SCHEME_CHAR_STRINGP(argv[0]))
04240     scheme_wrong_type("bytes-open-converter", "byte string", 0, argc, argv);
04241   if (!SCHEME_CHAR_STRINGP(argv[1]))
04242     scheme_wrong_type("bytes-open-converter", "byte string", 1, argc, argv);
04243 
04244   scheme_custodian_check_available(NULL, "bytes-open-converter", "converter");
04245 
04246   s1 = scheme_char_string_to_byte_string(argv[0]);
04247   s2 = scheme_char_string_to_byte_string(argv[1]);
04248 
04249   if (scheme_byte_string_has_null(s1))
04250     return scheme_false;
04251   if (scheme_byte_string_has_null(s2))
04252     return scheme_false;
04253 
04254   from_e = SCHEME_BYTE_STR_VAL(s1);
04255   to_e = SCHEME_BYTE_STR_VAL(s2);
04256 
04257   return scheme_open_converter(from_e, to_e);
04258 }
04259 
04260 static Scheme_Object *convert_one(const char *who, int opos, int argc, Scheme_Object *argv[])
04261 {
04262   char *r, *instr;
04263   int status;
04264   long amt_read, amt_wrote;
04265   long istart, ifinish, ostart, ofinish;
04266   Scheme_Object *a[3], *status_sym;
04267   Scheme_Converter *c;
04268 
04269   if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type))
04270     scheme_wrong_type(who, "converter", 0, argc, argv);
04271 
04272   if (opos > 1) {
04273     if (!SCHEME_BYTE_STRINGP(argv[1]))
04274       scheme_wrong_type(who, "bytes", 1, argc, argv);
04275     scheme_get_substring_indices(who, argv[1], argc, argv, 2, 3, &istart, &ifinish);
04276   } else {
04277     istart = 0;
04278     ifinish = 4; /* This is really a guess about how much space we need for a shift terminator */
04279   }
04280 
04281   if (argc > opos) {
04282     if (SCHEME_TRUEP(argv[opos])) {
04283       if (!SCHEME_MUTABLE_BYTE_STRINGP(argv[opos]))
04284        scheme_wrong_type(who, "mutable byte string", opos, argc, argv);
04285       r = SCHEME_BYTE_STR_VAL(argv[opos]);
04286       scheme_get_substring_indices(who, argv[opos], argc, argv, opos + 1, opos + 2, &ostart, &ofinish);
04287     } else {
04288       int ip;
04289       r = NULL;
04290       for (ip = opos + 1; ip <= opos + 2; ip++) {
04291        if (argc > ip) {
04292          int ok = 0;
04293          if (SCHEME_INTP(argv[ip]))
04294            ok = SCHEME_INT_VAL(argv[ip]) >= 0;
04295          else if (SCHEME_BIGNUMP(argv[ip]))
04296            ok = SCHEME_BIGPOS(argv[ip]);
04297          else if ((ip == opos + 2) && SCHEME_FALSEP(argv[ip]))
04298            ok = 1;
04299          if (!ok)
04300            scheme_wrong_type(who,
04301                            ((ip == opos + 2)
04302                             ? "non-negative exact integer or #f"
04303                             : "non-negative exact integer"),
04304                            ip, argc, argv);
04305        }
04306       }
04307       if ((argc > opos + 2) && SCHEME_TRUEP(argv[opos + 2])) {
04308        Scheme_Object *delta;
04309        if (scheme_bin_lt(argv[opos + 2], argv[opos + 1])) {
04310          scheme_arg_mismatch(who,
04311                            "ending index is less than the starting index: ",
04312                            argv[opos + 2]);
04313        }
04314        delta = scheme_bin_minus(argv[opos + 2], argv[opos + 1]);
04315        if (SCHEME_BIGNUMP(delta))
04316          ofinish = -1;
04317        else
04318          ofinish = SCHEME_INT_VAL(delta);
04319        ostart = 0;
04320       } else {
04321        ostart = 0;
04322        ofinish = -1;
04323       }
04324     }
04325   } else {
04326     r = NULL;
04327     ostart = 0;
04328     ofinish = -1;
04329   }
04330 
04331   c = (Scheme_Converter *)argv[0];
04332   if (c->closed)
04333     scheme_arg_mismatch(who, "converter is closed: ", argv[0]);
04334 
04335   instr = ((opos > 1) ? SCHEME_BYTE_STR_VAL(argv[1]) : NULL);
04336 
04337   if (c->kind == mzUTF16_TO_UTF8_KIND) {
04338     if (istart & 0x1) {
04339       /* Copy to word-align */
04340       char *c2;
04341       c2 = (char *)scheme_malloc_atomic(ifinish - istart);
04342       memcpy(c2, instr XFORM_OK_PLUS istart, ifinish - istart);
04343       ifinish = ifinish - istart;
04344       istart = 0;
04345       instr = c2;
04346     }
04347 
04348     status = utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1,
04349                         (unsigned char *)r, ostart, ofinish,
04350                         &amt_read, &amt_wrote, 1);
04351     
04352     amt_read -= (istart >> 1);
04353 
04354     if (amt_read) {
04355       if (!r) {
04356        /* Need to allocate, then do it again: */
04357        r = (char *)scheme_malloc_atomic(amt_wrote + 1);
04358        utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1,
04359                     (unsigned char *)r, ostart, ofinish,
04360                     NULL, NULL, 1);
04361        r[amt_wrote] = 0;
04362       }
04363       amt_read <<= 1;
04364     }
04365 
04366     /* We might get a -1 result because the input has an odd number of
04367        bytes, and 2nd+next-to-last bytes form an unpaired
04368        surrogate. In that case, the transformer normally needs one
04369        more byte: Windows is little-endian, so we need the byte to
04370        tell whether the surrogate is paired, and for all other
04371        platforms (where we assume that surrogates are paired), we need
04372        the byte to generate output. Technically, on a big-endian
04373        non-Windows machine, we could generate the first byte of UTF-8
04374        output and keep the byte as state, but we don't. */
04375 
04376     if (status != -1) {
04377       if (amt_read < ((ifinish - istart) & ~0x1)) {
04378        /* Must have run out of output space */
04379        status = 1;
04380       } else {
04381        /* Read all of input --- but it wasn't really all if there
04382           was an odd number of bytes. */
04383        if ((ifinish - istart) & 0x1)
04384          status = -1;
04385        else
04386          status = 0;
04387       }
04388     }
04389   } else if (c->kind != mzICONV_KIND) {
04390     /* UTF-8 -> UTF-{8,16} "identity" converter, but maybe permissive */
04391     if (instr) {
04392       long _ostart, _ofinish;
04393       int utf16;
04394 
04395       if (c->kind == mzUTF8_TO_UTF16_KIND) {
04396        _ostart = ostart;
04397        _ofinish = ofinish;
04398        if (_ostart & 0x1)
04399          _ostart++;
04400        _ostart >>= 1;
04401        if (_ofinish > 0)
04402          _ofinish >>= 1;
04403        utf16 = 1;
04404       } else {
04405        _ostart = ostart;
04406        _ofinish = ofinish;
04407        utf16 = 0;
04408       }
04409 
04410       status = utf8_decode_x((unsigned char *)instr, istart, ifinish,
04411                           (unsigned int *)r, _ostart, _ofinish,
04412                           &amt_read, &amt_wrote,
04413                           1, utf16, NULL, 1, c->permissive);
04414       
04415       if (utf16) {
04416        _ostart <<= 1;
04417        amt_wrote <<= 1;
04418        if ((ostart & 0x1) && (amt_wrote > _ostart)) {
04419          /* Shift down one byte: */
04420          memmove(r XFORM_OK_PLUS ostart, r XFORM_OK_PLUS _ostart, amt_wrote - _ostart);
04421        }
04422       }
04423 
04424       amt_read -= istart;
04425       amt_wrote -= _ostart;
04426       if (status == -3) {
04427        /* r is not NULL; ran out of room */
04428        status = 1;
04429       } else {
04430        if (amt_wrote) {
04431          if (!r) {
04432            /* Need to allocate, then do it again: */
04433            r = (char *)scheme_malloc_atomic(amt_wrote + 1);
04434            utf8_decode_x((unsigned char *)instr, istart, ifinish,
04435                        (unsigned int *)r, ostart, _ofinish,
04436                        NULL, NULL,
04437                        1, utf16, NULL, 1, c->permissive);
04438            r[amt_wrote] = 0;
04439          }
04440        } else if (!r)
04441          r = "";
04442        if (status > 0)
04443          status = 0;
04444       }
04445     } else {
04446       r = "";
04447       status = 0;
04448       amt_read = 0;
04449       amt_wrote = 0;
04450     }
04451   } else {
04452     r = do_convert(c->cd, NULL, NULL, 0,
04453                  instr, istart, ifinish-istart,
04454                  r, ostart, ofinish-ostart,
04455                  !r, /* grow? */
04456                  0,
04457                  (r ? 0 : 1), /* terminator */
04458                  &amt_read, &amt_wrote,
04459                  &status);
04460   }
04461 
04462   if (status == 0) {
04463     /* Converted all input without error */
04464     status_sym = complete_symbol;
04465   } else if (status == 1) {
04466     /* Filled output, more input ready */
04467     status_sym = continues_symbol;
04468   } else if (status == -1) {
04469     /* Input ends in the middle of an encoding */
04470     status_sym = aborts_symbol;
04471   } else {
04472     /* Assert: status == -2 */
04473     /* Input has error (that won't be fixed by
04474        adding more characters */
04475     status_sym = error_symbol;
04476   }
04477 
04478   if (argc <= opos) {
04479     a[0] = scheme_make_sized_byte_string(r, amt_wrote, 0);
04480   } else {
04481     a[0] = scheme_make_integer(amt_wrote);
04482   }
04483   if (opos > 1) {
04484     a[1] = scheme_make_integer(amt_read);
04485     a[2] = status_sym;
04486     return scheme_values(3, a);
04487   } else {
04488     a[1] = status_sym;
04489     return scheme_values(2, a);
04490   }
04491 }
04492 
04493 static Scheme_Object *byte_string_convert(int argc, Scheme_Object *argv[])
04494 {
04495   return convert_one("bytes-convert", 4, argc, argv);
04496 }
04497 
04498 static Scheme_Object *byte_string_convert_end(int argc, Scheme_Object *argv[])
04499 {
04500   return convert_one("bytes-convert-end", 1, argc, argv);
04501 }
04502 
04503 void scheme_close_converter(Scheme_Object *conv)
04504 {
04505   close_converter(conv, NULL);
04506 }
04507 
04508 static Scheme_Object *byte_string_close_converter(int argc, Scheme_Object **argv)
04509 {
04510   if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type))
04511     scheme_wrong_type("bytes-close-converter", "converter", 0, argc, argv);
04512 
04513   scheme_close_converter(argv[0]);
04514 
04515   return scheme_void;
04516 }
04517 
04518 static Scheme_Object *
04519 byte_converter_p(int argc, Scheme_Object *argv[])
04520 {
04521   return (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type)
04522          ? scheme_true
04523          : scheme_false);
04524 }
04525 
04526 /**********************************************************************/
04527 /*                         utf8 converter                             */
04528 /**********************************************************************/
04529 
04530 static int utf8_decode_x(const unsigned char *s, int start, int end,
04531                       unsigned int *us, int dstart, int dend,
04532                       long *ipos, long *jpos,
04533                       char compact, char utf16, int *_state,
04534                       int might_continue, int permissive)
04535      /* Results:
04536        non-negative => translation complete, = number of produced chars
04537        -1 => input ended in middle of encoding (only if might_continue)
04538        -2 => encoding error (only if permissive is 0)
04539        -3 => not enough output room
04540 
04541        ipos & jpos are filled with ending positions (between [d]start
04542        and [d]end) before return, unless they are NULL.
04543 
04544        compact => UTF-8 to UTF-8 or UTF-16 --- the latter if utf16
04545        for Windows for utf16, decode extended UTF-8 that allows surrogates
04546 
04547        _state provides initial state and is filled with ending state;
04548        when it's not NULL, the us must be NULL
04549 
04550        might_continue => allows -1 result without consuming characters
04551 
04552        permissive is non-zero => use permissive as value for bad byte
04553        sequences. When generating UTF-8, this must be an ASCII character
04554         or U+FFFD. */
04555 
04556 {
04557   int i, j, oki, failmode = -3, state;
04558   int init_doki;
04559   int nextbits, v;
04560   unsigned int sc;
04561 # ifdef WINDOWS_UNICODE_SUPPORT
04562   int pending_surrogate = 0;
04563 # endif
04564 
04565   if (_state) {
04566     state = (*_state) & 0x7;
04567     init_doki = (((*_state) >> 3) & 0x7);
04568     nextbits = ((((*_state) >> 6) & 0xF) << 2);
04569     /* Need v to detect 0xD800 through 0xDFFF
04570        Note that we have 22 bits to work with, which is
04571        is enough to detect > 0x10FFFF */
04572     v = ((*_state) >> 10);
04573   } else {
04574     state = 0;
04575     init_doki = 0;
04576     nextbits = 0;
04577     v = 0;
04578   }
04579 
04580   /* In non-permissive mode, a negative result means ill-formed input.
04581      Permissive mode accepts anything and tries to convert it.  In
04582      that case, the strategy for illegal sequences is to convert
04583      anything bad to the given "permissive" value. */
04584 
04585   if (end < 0)
04586     end = strlen((char *)s);
04587   if (dend < 0)
04588     dend = 0x7FFFFFFF;
04589 
04590 # define ENCFAIL i = oki; failmode = -2; break
04591 
04592   oki = start;
04593   j = dstart;
04594   i = start;
04595   if (j < dend) {
04596     while (i < end) {
04597       sc = s[i];
04598       if (sc < 0x80) {
04599        if (state) {
04600          /* In a sequence, but didn't continue */
04601          state = 0;
04602          nextbits = 0;
04603          if (permissive) {
04604            v = permissive;
04605            i = oki;
04606            j += init_doki;
04607          } else {
04608            ENCFAIL;
04609          }
04610        } else {
04611          v = sc;
04612        }
04613       } else if ((sc & 0xC0) == 0x80) {
04614        /* Continues a sequence ... */
04615        if (state) {
04616          /* ... and we're in one ... */
04617          if (!nextbits || (sc & nextbits)) {
04618            /* and we have required bits. */
04619            v = (v << 6) + (sc & 0x3F);
04620            nextbits = 0;
04621            --state;
04622            if (state) {
04623              i++;
04624              continue;
04625            }
04626            /* We finished. One last check: */
04627            if ((((v >= 0xD800) && (v <= 0xDFFF))
04628                || (v > 0x10FFFF))
04629 # ifdef WINDOWS_UNICODE_SUPPORT
04630               && (!utf16
04631                   /* If UTF-16 for Windows, just apply upper-limit check */
04632                   || (v > 0x10FFFF))
04633 # endif
04634               ) {
04635              /* UTF-16 surrogates or other illegal code units */
04636              if (permissive) {
04637               v = permissive;
04638               j += init_doki;
04639               i = oki;
04640              } else {
04641               ENCFAIL;
04642              }
04643            }
04644          } else {
04645            /* ... but we're missing required bits. */
04646            state = 0;
04647            nextbits = 0;
04648            if (permissive) {
04649              v = permissive;
04650              j += init_doki;
04651              i = oki;
04652            } else {
04653              ENCFAIL;
04654            }
04655          }
04656        } else {
04657          /* ... but we're not in one */
04658          if (permissive) {
04659            v = permissive;
04660          } else {
04661            ENCFAIL;
04662          }
04663        }
04664       } else if (state) {
04665        /* bad: already in a sequence */
04666        state = 0;
04667        if (permissive) {
04668          v = permissive;
04669          i = oki;
04670          j += init_doki;
04671        } else {
04672          ENCFAIL;
04673        }
04674       } else {
04675        if ((sc & 0xE0) == 0xC0) {
04676          if (sc & 0x1E) {
04677            state = 1;
04678            v = (sc & 0x1F);
04679            i++;
04680            continue;
04681          }
04682          /* else too small */
04683        } else if ((sc & 0xF0) == 0xE0) {
04684          state = 2;
04685          v = (sc & 0xF);
04686          if (!v)
04687            nextbits = 0x20;
04688          i++;
04689          continue;
04690        } else if ((sc & 0xF8) == 0xF0) {
04691          v = (sc & 0x7);
04692          if (v <= 4) {
04693            state = 3;
04694            if (!v)
04695              nextbits = 0x30;
04696            i++;
04697            continue;
04698          } 
04699          /* Else will be larger than 0x10FFFF, so fail */
04700        }
04701        /* Too small, or 0xFF or 0xFe, or start of a 5- or 6-byte sequence */
04702        if (permissive) {
04703          v = permissive;
04704        } else {
04705          ENCFAIL;
04706        }
04707       }
04708 
04709       /* If we get here, we're supposed to output v */
04710 
04711       if (compact) {
04712        if (utf16) {
04713          if (v > 0xFFFF) {
04714 # ifdef WINDOWS_UNICODE_SUPPORT
04715            if (pending_surrogate) {
04716              if (us)
04717               ((unsigned short *)us)[j] = pending_surrogate;
04718              j++; /* Accept previously written unpaired surrogate */
04719              pending_surrogate = 0;
04720            }
04721 # endif
04722            if (j + 1 >= dend)
04723              break;
04724            if (us) {
04725              v -= 0x10000;
04726              ((unsigned short *)us)[j] = 0xD800 | ((v >> 10) & 0x3FF);
04727              ((unsigned short *)us)[j+1] = 0xDC00 | (v & 0x3FF);
04728            }
04729            j++;
04730          } else {
04731 # ifdef WINDOWS_UNICODE_SUPPORT
04732            /* We allow a surrogate by itself, but don't allow
04733               a 0xDC00 after a 0xD800, otherwise multiple encodings can
04734               map to the same thing. */
04735            if ((v >= 0xD800) && (v <= 0xDFFF)) {
04736              if (pending_surrogate && ((v & 0xDC00) == 0xDC00)) {
04737               /* This looks like a surrogate pair, so disallow it. */
04738               if (permissive) {
04739                 /* We need to fill in 6 permissive substitutions,
04740                    one for each input byte. If we can't put all 6,
04741                    then don't use any input. */
04742                 if (j + 5 >= dend) {
04743                   break;
04744                 } else {
04745                   int p;
04746                   if (us) {
04747                     for (p = 0; p < 5; p++) {
04748                      if (j + p >= dend)
04749                        break;
04750                      ((unsigned short *)us)[j+p] = permissive;
04751                     }
04752                   }
04753                   j += 5;
04754                   v = permissive;
04755                 }
04756               } else {
04757                 ENCFAIL;
04758               }
04759               pending_surrogate = 0;
04760              } else {
04761               if (pending_surrogate) {
04762                 if (us)
04763                   ((unsigned short *)us)[j] = pending_surrogate;
04764                 j++; /* Accept previousy written unpaired surrogate */
04765                 pending_surrogate = 0;
04766                 if (j >= dend)
04767                   break;
04768               }
04769               if ((v & 0xDC00) == 0xD800)
04770                 pending_surrogate = v;
04771               else
04772                 pending_surrogate = 0;
04773              }
04774            } else {
04775              if (pending_surrogate) {
04776               if (us)
04777                 ((unsigned short *)us)[j] = pending_surrogate;
04778               j++; /* Accept previousy written unpaired surrogate */
04779               pending_surrogate = 0;
04780               if (j >= dend)
04781                 break;
04782              }
04783            }
04784 
04785            if (pending_surrogate)
04786              --j; /* don't accept unpaired surrogate, yet */
04787            else if (us)
04788              ((unsigned short *)us)[j] = v;
04789 # else
04790            if (us)
04791              ((unsigned short *)us)[j] = v;
04792 # endif
04793          }
04794        } else {
04795          int delta;
04796          delta = (i - oki);
04797          if (delta) {
04798            if (j + delta + 1 < dend) {
04799              if (us)
04800               memcpy(((char *)us) + j, s + oki, delta + 1);
04801              j += delta;
04802            } else
04803              break;
04804          } else if (v == 0xFFFD) {
04805             if (j + 3 < dend) {
04806               if (us) {
04807                 ((unsigned char *)us)[j] = 0xEF;
04808                 ((unsigned char *)us)[j+1] = 0xBF;
04809                 ((unsigned char *)us)[j+2] = 0xBD;
04810               }
04811               j += 2;
04812             } else
04813               break;
04814           } else if (us) {
04815             ((unsigned char *)us)[j] = v;
04816           }
04817        }
04818       } else if (us) {
04819        us[j] = v;
04820       }
04821       j++;
04822       i++;
04823       oki = i;
04824       init_doki = 0;
04825       if (j >= dend)
04826        break;
04827     }
04828   }
04829 
04830   if (_state) {
04831     if (!state)
04832       *_state = 0;
04833     else
04834       *_state = (state 
04835                | (((end - oki) + init_doki) << 3)
04836                | ((nextbits >> 2) << 6)
04837                | (v << 10));
04838   } else if (state) {
04839     if (might_continue || !permissive) {
04840       failmode = -1;
04841       i = end - 1; /* to ensure that failmode is returned */
04842     } else if (permissive) {
04843       for (i = oki; i < end; i++) {
04844        if (j < dend) {
04845          if (us) {
04846            if (compact) {
04847              if (utf16)
04848               ((unsigned short *)us)[j] = permissive;
04849              else
04850               ((unsigned char *)us)[j] = permissive;
04851            } else
04852              us[j] = permissive;
04853          }
04854          j++;
04855        } else
04856          break;
04857       }
04858       oki = i;
04859     }
04860   }
04861 
04862 # ifdef WINDOWS_UNICODE_SUPPORT
04863   if (pending_surrogate)
04864     oki -= 3;
04865 #endif
04866 
04867   if (ipos)
04868     *ipos = oki;
04869   if (jpos)
04870     *jpos = j;
04871 
04872   if (i < end)
04873     return failmode;
04874 
04875 # ifdef WINDOWS_UNICODE_SUPPORT
04876   if (pending_surrogate) {
04877     /* input must have ended right after surrogate */
04878     return -1;
04879   }
04880 #endif
04881 
04882   return j - dstart;
04883 }
04884 
04885 int scheme_utf8_decode(const unsigned char *s, int start, int end,
04886                      unsigned int *us, int dstart, int dend,
04887                      long *ipos, char utf16, int permissive)
04888 {
04889   return utf8_decode_x(s, start, end, us, dstart, dend,
04890                      ipos, NULL, utf16, utf16, NULL, 0, permissive);
04891 }
04892 
04893 int scheme_utf8_decode_as_prefix(const unsigned char *s, int start, int end,
04894                              unsigned int *us, int dstart, int dend,
04895                              long *ipos, char utf16, int permissive)
04896      /* Always returns number of read characters, not error codes. */
04897 {
04898   long opos;
04899   utf8_decode_x(s, start, end, us, dstart, dend,
04900               ipos, &opos, utf16, utf16, NULL, 1, permissive);
04901   return opos - dstart;
04902 }
04903 
04904 int scheme_utf8_decode_all(const unsigned char *s, int len, unsigned int *us, int permissive)
04905 {
04906   return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 0, permissive);
04907 }
04908 
04909 int scheme_utf8_decode_prefix(const unsigned char *s, int len, unsigned int *us, int permissive)
04910      /* us != NULL */
04911 {
04912   {
04913     /* Try fast path (all ASCII) */
04914     int i;
04915     for (i = 0; i < len; i++) {
04916       if (s[i] < 128)
04917        us[i] = s[i];
04918       else
04919        break;
04920     }
04921     if (i == len)
04922       return len;
04923   }
04924 
04925   return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 1, permissive);
04926 }
04927 
04928 mzchar *scheme_utf8_decode_to_buffer_len(const unsigned char *s, int len,
04929                                     mzchar *buf, int blen, long *_ulen)
04930 {
04931   int ulen;
04932 
04933   ulen = utf8_decode_x(s, 0, len, NULL, 0, -1,
04934                      NULL, NULL, 0, 0,
04935                      NULL, 0, 0);
04936   if (ulen < 0)
04937     return NULL;
04938   if (ulen + 1 > blen) {
04939     buf = (mzchar *)scheme_malloc_atomic((ulen + 1) * sizeof(mzchar));
04940   }
04941   utf8_decode_x(s, 0, len, buf, 0, -1,
04942               NULL, NULL, 0, 0,
04943               NULL, 0, 0);
04944   buf[ulen] = 0;
04945   *_ulen = ulen;
04946   return buf;
04947 }
04948 
04949 mzchar *scheme_utf8_decode_to_buffer(const unsigned char *s, int len,
04950                                  mzchar *buf, int blen)
04951 {
04952   long ulen;
04953   return scheme_utf8_decode_to_buffer_len(s, len, buf, blen, &ulen);
04954 }
04955 
04956 int scheme_utf8_decode_count(const unsigned char *s, int start, int end,
04957                           int *_state, int might_continue, int permissive)
04958 {
04959   long pos = 0;
04960 
04961   if (!_state || !*_state) {
04962     /* Try fast path (all ASCII): */
04963     int i;
04964     for (i = start; i < end; i++) {
04965       if (s[i] > 127)
04966        break;
04967     }
04968     if (i == end)
04969       return end - start;
04970   }
04971 
04972   utf8_decode_x(s, start, end,
04973               NULL, 0, -1,
04974               NULL, &pos,
04975               0, 0, _state,
04976               might_continue, permissive);
04977 
04978   return pos;
04979 }
04980 
04981 static int utf8_encode_x(const unsigned int *us, int start, int end,
04982                       unsigned char *s, int dstart, int dend,
04983                       long *_ipos, long *_opos, char utf16)
04984   /* Results:
04985         -1 => input ended in the middle of an encoding - only when utf16 and _opos
04986        non-negative => reports number of bytes/code-units produced */
04987 {
04988   int i, j, done = start;
04989 
04990   if (dend < 0)
04991     dend = 0x7FFFFFFF;
04992 
04993   if (!s) {
04994     unsigned int wc;
04995     j = 0;
04996     for (i = start; i < end; i++) {
04997       if (utf16) {
04998        wc = ((unsigned short *)us)[i];
04999        if ((wc & 0xF800) == 0xD800) {
05000          /* Unparse surrogates. We assume that the surrogates are
05001             well formed, unless this is Windows or if we're at the
05002              end and _opos is 0. */
05003 # ifdef WINDOWS_UNICODE_SUPPORT
05004 #  define UNPAIRED_MASK 0xFC00
05005 # else
05006 #  define UNPAIRED_MASK 0xF800
05007 # endif
05008          if (((i + 1) == end) && ((wc & UNPAIRED_MASK) == 0xD800) && _opos) {
05009            /* Ended in the middle of a surrogate pair */
05010            *_opos = j;
05011            if (_ipos)
05012              *_ipos = i;
05013            return -1;
05014          }
05015 # ifdef WINDOWS_UNICODE_SUPPORT
05016          if ((wc & 0xFC00) != 0xD800) {
05017            /* Count as one */
05018          } else if ((i + 1 >= end)
05019                    || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00)) {
05020          } else 
05021 # endif
05022            {
05023              i++;
05024              wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF);
05025              wc += 0x10000;
05026            }
05027        }
05028       } else {
05029        wc = us[i];
05030       }
05031       if (wc < 0x80) {
05032        j += 1;
05033       } else if (wc < 0x800) {
05034        j += 2;
05035       } else if (wc < 0x10000) {
05036        j += 3;
05037       } else if (wc < 0x200000) {
05038        j += 4;
05039       } else if (wc < 0x4000000) {
05040        j += 5;
05041       } else {
05042        j += 6;
05043       }
05044     }
05045     if (_ipos)
05046       *_ipos = i;
05047     if (_opos)
05048       *_opos = j + dstart;
05049     return j;
05050   } else {
05051     unsigned int wc;
05052     j = dstart;
05053     for (i = start; i < end; i++) {
05054       if (utf16) {
05055        wc = ((unsigned short *)us)[i];
05056        if ((wc & 0xF800) == 0xD800) {
05057          /* Unparse surrogates. We assume that the surrogates are
05058             well formed on non-Windows platforms, but when _opos,
05059             we detect ending in the middle of an surrogate pair. */
05060          if (((i + 1) == end) && ((wc & UNPAIRED_MASK) == 0xD800) && _opos) {
05061            /* Ended in the middle of a surrogate pair */
05062            *_opos = j;
05063            if (_ipos)
05064              *_ipos = i;
05065            return -1;
05066          }
05067 # ifdef WINDOWS_UNICODE_SUPPORT
05068          if ((wc & 0xFC00) != 0xD800) {
05069            /* Let the misplaced surrogate through */
05070          } else if ((i + 1 >= end)
05071                    || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00)) {
05072            /* Let the misplaced surrogate through */
05073          } else
05074 # endif
05075            {
05076              i++;
05077              wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF);
05078              wc += 0x10000;
05079            }
05080        }
05081       } else {
05082        wc = us[i];
05083       }
05084 
05085       if (wc < 0x80) {
05086        if (j + 1 > dend)
05087          break;
05088        s[j++] = wc;
05089       } else if (wc < 0x800) {
05090        if (j + 2 > dend)
05091          break;
05092        s[j++] = 0xC0 | ((wc & 0x7C0) >> 6);
05093        s[j++] = 0x80 | (wc & 0x3F);
05094       } else if (wc < 0x10000) {
05095        if (j + 3 > dend)
05096          break;
05097        s[j++] = 0xE0 | ((wc & 0xF000) >> 12);
05098        s[j++] = 0x80 | ((wc & 0x0FC0) >> 6);
05099        s[j++] = 0x80 | (wc & 0x3F);
05100       } else if (wc < 0x200000) {
05101        if (j + 4 > dend)
05102          break;
05103        s[j++] = 0xF0 | ((wc & 0x1C0000) >> 18);
05104        s[j++] = 0x80 | ((wc & 0x03F000) >> 12);
05105        s[j++] = 0x80 | ((wc & 0x000FC0) >> 6);
05106        s[j++] = 0x80 | (wc & 0x3F);
05107       } else if (wc < 0x4000000) {
05108        if (j + 5 > dend)
05109          break;
05110        s[j++] = 0xF8 | ((wc & 0x3000000) >> 24);
05111        s[j++] = 0x80 | ((wc & 0x0FC0000) >> 18);
05112        s[j++] = 0x80 | ((wc & 0x003F000) >> 12);
05113        s[j++] = 0x80 | ((wc & 0x0000FC0) >> 6);
05114        s[j++] = 0x80 | (wc & 0x3F);
05115       } else {
05116        if (j + 6 > dend)
05117          break;
05118        s[j++] = 0xFC | ((wc & 0x40000000) >> 30);
05119        s[j++] = 0x80 | ((wc & 0x3F000000) >> 24);
05120        s[j++] = 0x80 | ((wc & 0x00FC0000) >> 18);
05121        s[j++] = 0x80 | ((wc & 0x0003F000) >> 12);
05122        s[j++] = 0x80 | ((wc & 0x00000FC0) >> 6);
05123        s[j++] = 0x80 | (wc & 0x3F);
05124       }
05125       done = i;
05126     }
05127     if (_ipos)
05128       *_ipos = done;
05129     if (_opos)
05130       *_opos = j;
05131     return j - dstart;
05132   }
05133 }
05134 
05135 int scheme_utf8_encode(const unsigned int *us, int start, int end,
05136                      unsigned char *s, int dstart,
05137                      char utf16)
05138 {
05139   return utf8_encode_x(us, start, end,
05140                      s, dstart, -1,
05141                      NULL, NULL, utf16);
05142 }
05143 
05144 int scheme_utf8_encode_all(const unsigned int *us, int len, unsigned char *s)
05145 {
05146   return utf8_encode_x(us, 0, len, s, 0, -1, NULL, NULL, 0 /* utf16 */);
05147 }
05148 
05149 char *scheme_utf8_encode_to_buffer_len(const mzchar *s, int len,
05150                                    char *buf, int blen,
05151                                    long *_slen)
05152 {
05153   int slen;
05154 
05155   /* ASCII with len < blen is a common case: */
05156   if (len < blen) {
05157     for (slen = 0; slen < len; slen++) {
05158       if (s[slen] > 127)
05159         break;
05160       else
05161         buf[slen] = s[slen];
05162     }
05163     if (slen == len) {
05164       buf[slen] = 0;
05165       *_slen = slen;
05166       return buf;
05167     }
05168   }
05169 
05170   slen = utf8_encode_x(s, 0, len, NULL, 0, -1, NULL, NULL, 0);
05171   if (slen + 1 > blen) {
05172     buf = (char *)scheme_malloc_atomic(slen + 1);
05173   }
05174   utf8_encode_x(s, 0, len, (unsigned char *)buf, 0, -1, NULL, NULL, 0);
05175   buf[slen] = 0;
05176   *_slen = slen;
05177   return buf;
05178 }
05179 
05180 char *scheme_utf8_encode_to_buffer(const mzchar *s, int len,
05181                                char *buf, int blen)
05182 {
05183   long slen;
05184   return scheme_utf8_encode_to_buffer_len(s, len, buf, blen, &slen);
05185 }
05186 
05187 unsigned short *scheme_ucs4_to_utf16(const mzchar *text, int start, int end,
05188                                  unsigned short *buf, int bufsize,
05189                                  long *ulen, int term_size)
05190 {
05191   mzchar v;
05192   int extra, i, j;
05193   unsigned short *utf16;
05194 
05195   /* Count characters that fall outside UCS-2: */
05196   for (i = start, extra = 0; i < end; i++) {
05197     if (text[i] > 0xFFFF)
05198       extra++;
05199   }
05200 
05201   if ((end - start) + extra + term_size < bufsize)
05202     utf16 = buf;
05203   else
05204     utf16 = (unsigned short *)scheme_malloc_atomic(sizeof(unsigned short) * ((end - start) + extra + term_size));
05205 
05206   for (i = start, j = 0; i < end; i++) {
05207     v = text[i];
05208     if (v > 0xFFFF) {
05209       utf16[j++] = 0xD800 | ((v >> 10) & 0x3FF);
05210       utf16[j++] = 0xDC00 | (v & 0x3FF);
05211     } else
05212       utf16[j++] = v;
05213   }
05214 
05215   *ulen = j;
05216 
05217   return utf16;
05218 }
05219 
05220 mzchar *scheme_utf16_to_ucs4(const unsigned short *text, int start, int end,
05221                           mzchar *buf, int bufsize,
05222                           long *ulen, int term_size)
05223 {
05224   int wc;
05225   int i, j;
05226 
05227   for (i = start, j = 0; i < end; i++) {
05228     wc = text[i];
05229     if ((wc & 0xF800) == 0xD800) {
05230       i++;
05231     }
05232     j++;
05233   }
05234 
05235   if (j + term_size >= bufsize)
05236     buf = (mzchar *)scheme_malloc_atomic((j + term_size) * sizeof(mzchar));
05237 
05238   for (i = start, j = 0; i < end; i++) {
05239     wc = text[i];
05240     if ((wc & 0xF800) == 0xD800) {
05241       i++;
05242       wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)text)[i]) & 0x3FF);
05243       wc += 0x10000;
05244     }
05245     buf[j++] = wc;
05246   }
05247 
05248   *ulen = j;
05249 
05250   return buf;
05251 }
05252 
05253 /**********************************************************************/
05254 /*                     machine type details                           */
05255 /**********************************************************************/
05256 
05257 /**************************** MacOS ***********************************/
05258 
05259 #if defined(MACINTOSH_EVENTS) && !defined(OS_X)
05260 # include <Gestalt.h>
05261 extern long scheme_this_ip(void);
05262 static void machine_details(char *s)
05263 {
05264    OSErr err;
05265    long lng;
05266    char sysvers[30];
05267    char machine_name[256];
05268 
05269    err = Gestalt(gestaltSystemVersion, &lng);
05270    if (err != noErr) {
05271      strcpy(sysvers, "<unknown system>");
05272    } else {
05273      int i;
05274      sprintf(sysvers, "%X.%X",
05275             (lng >> 8) & 0xff,
05276             lng & 0xff);
05277      /* remove trailing zeros, put dot before something else */
05278      i = strlen(sysvers);
05279      if (i > 1) {
05280        if (sysvers[i-1] != '.') {
05281         if (sysvers[i-1] == '0') {
05282           sysvers[i-1] = 0;
05283           i--;
05284         } else {
05285           sysvers[i] = sysvers[i-1];
05286           sysvers[i-1] = '.';
05287           i++;
05288           sysvers[i] = 0;
05289         }
05290        }
05291      }
05292    }
05293 
05294    err = Gestalt(gestaltMachineType, &lng);
05295    if (err != noErr) {
05296      strcpy(machine_name, "<unknown machine>");
05297    } else {
05298         Str255 machine_name_pascal;
05299 
05300         GetIndString(machine_name_pascal, kMachineNameStrID, lng);
05301         CopyPascalStringToC(machine_name_pascal, machine_name);
05302    }
05303 
05304    lng = scheme_this_ip();
05305 
05306    sprintf(s, "%s %s %d.%d.%d.%d", sysvers, machine_name,
05307           ((unsigned char *)&lng)[0],
05308           ((unsigned char *)&lng)[1],
05309           ((unsigned char *)&lng)[2],
05310           ((unsigned char *)&lng)[3]);
05311 }
05312 #endif
05313 
05314 /*************************** Windows **********************************/
05315 
05316 #ifdef DOS_FILE_SYSTEM
05317 # include <windows.h>
05318 void machine_details(char *buff)
05319 {
05320   OSVERSIONINFO info;
05321   BOOL hasInfo;
05322   char *p;
05323 
05324   info.dwOSVersionInfoSize = sizeof(info);
05325 
05326   GetVersionEx(&info);
05327 
05328   hasInfo = FALSE;
05329 
05330   p = info.szCSDVersion;
05331 
05332   while (p < info.szCSDVersion + sizeof(info.szCSDVersion) &&
05333         *p) {
05334     if (*p != ' ') {
05335       hasInfo = TRUE;
05336       break;
05337     }
05338     p = p XFORM_OK_PLUS 1;
05339   }
05340 
05341   sprintf(buff,"Windows %s %ld.%ld (Build %ld)%s%s",
05342          (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) ?
05343          "9x" :
05344          (info.dwPlatformId == VER_PLATFORM_WIN32_NT) ?
05345          "NT" : "Unknown platform",
05346          info.dwMajorVersion,info.dwMinorVersion,
05347          (info.dwPlatformId == VER_PLATFORM_WIN32_NT) ?
05348          info.dwBuildNumber :
05349          info.dwBuildNumber & 0xFFFF,
05350          hasInfo ? " " : "",hasInfo ? info.szCSDVersion : "");
05351 }
05352 #endif
05353 
05354 /***************************** OSKit **********************************/
05355 
05356 #ifdef USE_OSKIT_CONSOLE
05357 void machine_details(char *buff)
05358 {
05359   strcpy(buff, "OSKit");
05360 }
05361 #endif
05362 
05363 /***************************** Unix ***********************************/
05364 
05365 #if (!defined(MACINTOSH_EVENTS) || defined(OS_X)) && !defined(DOS_FILE_SYSTEM) && !defined(USE_OSKIT_CONSOLE)
05366 static char *uname_locations[] = { "/bin/uname",
05367                                "/usr/bin/uname",
05368                                /* The above should cover everything, but
05369                                   just in case... */
05370                                "/sbin/uname",
05371                                "/usr/sbin/uname",
05372                                "/usr/local/bin/uname",
05373                                "/usr/local/uname",
05374                                NULL };
05375 
05376 static int try_subproc(Scheme_Object *subprocess_proc, char *prog)
05377 {
05378   Scheme_Object *a[5];
05379   mz_jmp_buf * volatile savebuf, newbuf;
05380 
05381   savebuf = scheme_current_thread->error_buf;
05382   scheme_current_thread->error_buf = &newbuf;
05383 
05384   if (!scheme_setjmp(newbuf)) {
05385     a[0] = scheme_false;
05386     a[1] = scheme_false;
05387     a[2] = scheme_false;
05388     a[3] = scheme_make_locale_string(prog);
05389     a[4] = scheme_make_locale_string("-a");
05390     _scheme_apply_multi(subprocess_proc, 5, a);
05391     scheme_current_thread->error_buf = savebuf;
05392     return 1;
05393   } else {
05394     scheme_clear_escape();
05395     scheme_current_thread->error_buf = savebuf;
05396     return 0;
05397   }
05398 }
05399 
05400 void machine_details(char *buff)
05401 {
05402   Scheme_Object *subprocess_proc;
05403   int i;
05404 
05405   subprocess_proc = scheme_builtin_value("subprocess");
05406 
05407   for (i = 0; uname_locations[i]; i++) {
05408     if (scheme_file_exists(uname_locations[i])) {
05409       /* Try running it. */
05410       if (try_subproc(subprocess_proc, uname_locations[i])) {
05411        Scheme_Object *sout, *sin, *serr;
05412        long c;
05413 
05414        sout = scheme_current_thread->ku.multiple.array[1];
05415        sin = scheme_current_thread->ku.multiple.array[2];
05416        serr = scheme_current_thread->ku.multiple.array[3];
05417 
05418        scheme_close_output_port(sin);
05419        scheme_close_input_port(serr);
05420 
05421        /* Read result: */
05422        strcpy(buff, "<unknown machine>");
05423        c = scheme_get_bytes(sout, 1023, buff, 0);
05424        buff[c] = 0;
05425 
05426        scheme_close_input_port(sout);
05427 
05428        /* Remove trailing whitespace (especially newlines) */
05429        while (c && portable_isspace(((unsigned char *)buff)[c - 1])) {
05430          buff[--c] = 0;
05431        }
05432 
05433        return;
05434       }
05435     }
05436   }
05437 
05438   strcpy(buff, "<unknown machine>");
05439 }
05440 #endif
05441 
05442 
05443 /**********************************************************************/
05444 /*                           Precise GC                               */
05445 /**********************************************************************/
05446 
05447 #ifdef MZ_PRECISE_GC
05448 
05449 START_XFORM_SKIP;
05450 
05451 #define MARKS_FOR_STRING_C
05452 #include "mzmark.c"
05453 
05454 static void register_traversers(void)
05455 {
05456   GC_REG_TRAV(scheme_string_converter_type, mark_string_convert);
05457 }
05458 
05459 END_XFORM_SKIP;
05460 
05461 #endif