Back to index

plt-scheme  4.2.1
cordxtra.c
Go to the documentation of this file.
00001 /*
00002  * Copyright (c) 1993-1994 by Xerox Corporation.  All rights reserved.
00003  *
00004  * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
00005  * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
00006  *
00007  * Permission is hereby granted to use or copy this program
00008  * for any purpose,  provided the above notices are retained on all copies.
00009  * Permission to modify the code and to distribute modified code is granted,
00010  * provided the above notices are retained, and a notice that the code was
00011  * modified is included with the above copyright notice.
00012  *
00013  * Author: Hans-J. Boehm (boehm@parc.xerox.com)
00014  */
00015 /*
00016  * These are functions on cords that do not need to understand their
00017  * implementation.  They serve also serve as example client code for
00018  * cord_basics.
00019  */
00020 /* Boehm, December 8, 1995 1:53 pm PST */
00021 # include <stdio.h>
00022 # include <string.h>
00023 # include <stdlib.h>
00024 # include <stdarg.h>
00025 # include "cord.h"
00026 # include "ec.h"
00027 # define I_HIDE_POINTERS    /* So we get access to allocation lock.   */
00028                             /* We use this for lazy file reading,     */
00029                             /* so that we remain independent   */
00030                             /* of the threads primitives.             */
00031 # include "gc.h"
00032 
00033 /* For now we assume that pointer reads and writes are atomic,        */
00034 /* i.e. another thread always sees the state before or after   */
00035 /* a write.  This might be false on a Motorola M68K with       */
00036 /* pointers that are not 32-bit aligned.  But there probably   */
00037 /* aren't too many threads packages running on those.          */
00038 # define ATOMIC_WRITE(x,y) (x) = (y)
00039 # define ATOMIC_READ(x) (*(x))
00040 
00041 /* The standard says these are in stdio.h, but they aren't always: */
00042 # ifndef SEEK_SET
00043 #   define SEEK_SET 0
00044 # endif
00045 # ifndef SEEK_END
00046 #   define SEEK_END 2
00047 # endif
00048 
00049 # define BUFSZ 2048  /* Size of stack allocated buffers when          */
00050                      /* we want large buffers.                 */
00051 
00052 typedef void (* oom_fn)(void);
00053 
00054 # define OUT_OF_MEMORY {  if (CORD_oom_fn != (oom_fn) 0) (*CORD_oom_fn)(); \
00055                        ABORT("Out of memory\n"); }
00056 # define ABORT(msg) { fprintf(stderr, "%s\n", msg); abort(); }
00057 
00058 CORD CORD_cat_char(CORD x, char c)
00059 {
00060     register char * string;
00061     
00062     if (c == '\0') return(CORD_cat(x, CORD_nul(1)));
00063     string = GC_MALLOC_ATOMIC(2);
00064     if (string == 0) OUT_OF_MEMORY;
00065     string[0] = c;
00066     string[1] = '\0';
00067     return(CORD_cat_char_star(x, string, 1));
00068 }
00069 
00070 CORD CORD_catn(int nargs, ...)
00071 {
00072     register CORD result = CORD_EMPTY;
00073     va_list args;
00074     register int i;
00075 
00076     va_start(args, nargs);
00077     for (i = 0; i < nargs; i++) {
00078         register CORD next = va_arg(args, CORD);
00079         result = CORD_cat(result, next);
00080     }
00081     va_end(args);
00082     return(result);
00083 }
00084 
00085 typedef struct {
00086        size_t len;
00087        size_t count;
00088        char * buf;
00089 } CORD_fill_data;
00090 
00091 int CORD_fill_proc(char c, void * client_data)
00092 {
00093     register CORD_fill_data * d = (CORD_fill_data *)client_data;
00094     register size_t count = d -> count;
00095     
00096     (d -> buf)[count] = c;
00097     d -> count = ++count;
00098     if (count >= d -> len) {
00099        return(1);
00100     } else {
00101        return(0);
00102     }
00103 }
00104 
00105 int CORD_batched_fill_proc(const char * s, void * client_data)
00106 {
00107     register CORD_fill_data * d = (CORD_fill_data *)client_data;
00108     register size_t count = d -> count;
00109     register size_t max = d -> len;
00110     register char * buf = d -> buf;
00111     register const char * t = s;
00112     
00113     while((buf[count] = *t++) != '\0') {
00114         count++;
00115         if (count >= max) {
00116             d -> count = count;
00117             return(1);
00118         }
00119     }
00120     d -> count = count;
00121     return(0);
00122 }
00123 
00124 /* Fill buf with len characters starting at i.                 */
00125 /* Assumes len characters are available.                       */ 
00126 void CORD_fill_buf(CORD x, size_t i, size_t len, char * buf)
00127 {
00128     CORD_fill_data fd;
00129     
00130     fd.len = len;
00131     fd.buf = buf;
00132     fd.count = 0;
00133     (void)CORD_iter5(x, i, CORD_fill_proc, CORD_batched_fill_proc, &fd);
00134 }
00135 
00136 int CORD_cmp(CORD x, CORD y)
00137 {
00138     CORD_pos xpos;
00139     CORD_pos ypos;
00140     register size_t avail, yavail;
00141     
00142     if (y == CORD_EMPTY) return(x != CORD_EMPTY);
00143     if (x == CORD_EMPTY) return(-1);
00144     if (CORD_IS_STRING(y) && CORD_IS_STRING(x)) return(strcmp(x,y));
00145     CORD_set_pos(xpos, x, 0);
00146     CORD_set_pos(ypos, y, 0);
00147     for(;;) {
00148         if (!CORD_pos_valid(xpos)) {
00149             if (CORD_pos_valid(ypos)) {
00150               return(-1);
00151             } else {
00152                 return(0);
00153             }
00154         }
00155         if (!CORD_pos_valid(ypos)) {
00156             return(1);
00157         }
00158         if ((avail = CORD_pos_chars_left(xpos)) <= 0
00159             || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
00160             register char xcurrent = CORD_pos_fetch(xpos);
00161             register char ycurrent = CORD_pos_fetch(ypos);
00162             if (xcurrent != ycurrent) return(xcurrent - ycurrent);
00163             CORD_next(xpos);
00164             CORD_next(ypos);
00165         } else {
00166             /* process as many characters as we can     */
00167             register int result;
00168             
00169             if (avail > yavail) avail = yavail;
00170             result = strncmp(CORD_pos_cur_char_addr(xpos),
00171                           CORD_pos_cur_char_addr(ypos), avail);
00172             if (result != 0) return(result);
00173             CORD_pos_advance(xpos, avail);
00174             CORD_pos_advance(ypos, avail);
00175         }
00176     }
00177 }
00178 
00179 int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len)
00180 {
00181     CORD_pos xpos;
00182     CORD_pos ypos;
00183     register size_t count;
00184     register long avail, yavail;
00185     
00186     CORD_set_pos(xpos, x, x_start);
00187     CORD_set_pos(ypos, y, y_start);
00188     for(count = 0; count < len;) {
00189         if (!CORD_pos_valid(xpos)) {
00190             if (CORD_pos_valid(ypos)) {
00191               return(-1);
00192             } else {
00193                 return(0);
00194             }
00195         }
00196         if (!CORD_pos_valid(ypos)) {
00197             return(1);
00198         }
00199         if ((avail = CORD_pos_chars_left(xpos)) <= 0
00200             || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
00201             register char xcurrent = CORD_pos_fetch(xpos);
00202             register char ycurrent = CORD_pos_fetch(ypos);
00203             if (xcurrent != ycurrent) return(xcurrent - ycurrent);
00204             CORD_next(xpos);
00205             CORD_next(ypos);
00206             count++;
00207         } else {
00208             /* process as many characters as we can     */
00209             register int result;
00210             
00211             if (avail > yavail) avail = yavail;
00212             count += avail;
00213             if (count > len) avail -= (count - len);
00214             result = strncmp(CORD_pos_cur_char_addr(xpos),
00215                           CORD_pos_cur_char_addr(ypos), (size_t)avail);
00216             if (result != 0) return(result);
00217             CORD_pos_advance(xpos, (size_t)avail);
00218             CORD_pos_advance(ypos, (size_t)avail);
00219         }
00220     }
00221     return(0);
00222 }
00223 
00224 char * CORD_to_char_star(CORD x)
00225 {
00226     register size_t len = CORD_len(x);
00227     char * result = GC_MALLOC_ATOMIC(len + 1);
00228     
00229     if (result == 0) OUT_OF_MEMORY;
00230     CORD_fill_buf(x, 0, len, result);
00231     result[len] = '\0';
00232     return(result);
00233 }
00234 
00235 CORD CORD_from_char_star(const char *s)
00236 {
00237     char * result;
00238     size_t len = strlen(s);
00239 
00240     if (0 == len) return(CORD_EMPTY);
00241     result = GC_MALLOC_ATOMIC(len + 1);
00242     if (result == 0) OUT_OF_MEMORY;
00243     memcpy(result, s, len+1);
00244     return(result);
00245 }
00246 
00247 const char * CORD_to_const_char_star(CORD x)
00248 {
00249     if (x == 0) return("");
00250     if (CORD_IS_STRING(x)) return((const char *)x);
00251     return(CORD_to_char_star(x));
00252 }
00253 
00254 char CORD_fetch(CORD x, size_t i)
00255 {
00256     CORD_pos xpos;
00257     
00258     CORD_set_pos(xpos, x, i);
00259     if (!CORD_pos_valid(xpos)) ABORT("bad index?");
00260     return(CORD_pos_fetch(xpos));
00261 }
00262 
00263 
00264 int CORD_put_proc(char c, void * client_data)
00265 {
00266     register FILE * f = (FILE *)client_data;
00267     
00268     return(putc(c, f) == EOF);
00269 }
00270 
00271 int CORD_batched_put_proc(const char * s, void * client_data)
00272 {
00273     register FILE * f = (FILE *)client_data;
00274     
00275     return(fputs(s, f) == EOF);
00276 }
00277     
00278 
00279 int CORD_put(CORD x, FILE * f)
00280 {
00281     if (CORD_iter5(x, 0, CORD_put_proc, CORD_batched_put_proc, f)) {
00282         return(EOF);
00283     } else {
00284        return(1);
00285     }
00286 }
00287 
00288 typedef struct {
00289     size_t pos;             /* Current position in the cord */
00290     char target;     /* Character we're looking for     */
00291 } chr_data;
00292 
00293 int CORD_chr_proc(char c, void * client_data)
00294 {
00295     register chr_data * d = (chr_data *)client_data;
00296     
00297     if (c == d -> target) return(1);
00298     (d -> pos) ++;
00299     return(0);
00300 }
00301 
00302 int CORD_rchr_proc(char c, void * client_data)
00303 {
00304     register chr_data * d = (chr_data *)client_data;
00305     
00306     if (c == d -> target) return(1);
00307     (d -> pos) --;
00308     return(0);
00309 }
00310 
00311 int CORD_batched_chr_proc(const char *s, void * client_data)
00312 {
00313     register chr_data * d = (chr_data *)client_data;
00314     register char * occ = strchr(s, d -> target);
00315     
00316     if (occ == 0) {
00317        d -> pos += strlen(s);
00318        return(0);
00319     } else {
00320        d -> pos += occ - s;
00321        return(1);
00322     }
00323 }
00324 
00325 size_t CORD_chr(CORD x, size_t i, int c)
00326 {
00327     chr_data d;
00328     
00329     d.pos = i;
00330     d.target = c;
00331     if (CORD_iter5(x, i, CORD_chr_proc, CORD_batched_chr_proc, &d)) {
00332         return(d.pos);
00333     } else {
00334        return(CORD_NOT_FOUND);
00335     }
00336 }
00337 
00338 size_t CORD_rchr(CORD x, size_t i, int c)
00339 {
00340     chr_data d;
00341     
00342     d.pos = i;
00343     d.target = c;
00344     if (CORD_riter4(x, i, CORD_rchr_proc, &d)) {
00345         return(d.pos);
00346     } else {
00347        return(CORD_NOT_FOUND);
00348     }
00349 }
00350 
00351 /* Find the first occurrence of s in x at position start or later.    */
00352 /* This uses an asymptotically poor algorithm, which should typically        */
00353 /* perform acceptably.  We compare the first few characters directly, */
00354 /* and call CORD_ncmp whenever there is a partial match.              */
00355 /* This has the advantage that we allocate very little, or not at all.       */
00356 /* It's very fast if there are few close misses.               */
00357 size_t CORD_str(CORD x, size_t start, CORD s)
00358 {
00359     CORD_pos xpos;
00360     size_t xlen = CORD_len(x);
00361     size_t slen;
00362     register size_t start_len;
00363     const char * s_start;
00364     unsigned long s_buf = 0;       /* The first few characters of s   */
00365     unsigned long x_buf = 0;       /* Start of candidate substring.   */
00366                             /* Initialized only to make compilers     */
00367                             /* happy.                          */
00368     unsigned long mask = 0;
00369     register size_t i;
00370     register size_t match_pos;
00371     
00372     if (s == CORD_EMPTY) return(start);
00373     if (CORD_IS_STRING(s)) {
00374         s_start = s;
00375         slen = strlen(s);
00376     } else {
00377         s_start = CORD_to_char_star(CORD_substr(s, 0, sizeof(unsigned long)));
00378         slen = CORD_len(s);
00379     }
00380     if (xlen < start || xlen - start < slen) return(CORD_NOT_FOUND);
00381     start_len = slen;
00382     if (start_len > sizeof(unsigned long)) start_len = sizeof(unsigned long);
00383     CORD_set_pos(xpos, x, start);
00384     for (i = 0; i < start_len; i++) {
00385         mask <<= 8;
00386         mask |= 0xff;
00387         s_buf <<= 8;
00388         s_buf |= (unsigned char)s_start[i];
00389         x_buf <<= 8;
00390         x_buf |= (unsigned char)CORD_pos_fetch(xpos);
00391         CORD_next(xpos);
00392     }
00393     for (match_pos = start; ; match_pos++) {
00394        if ((x_buf & mask) == s_buf) {
00395            if (slen == start_len ||
00396               CORD_ncmp(x, match_pos + start_len,
00397                        s, start_len, slen - start_len) == 0) {
00398                return(match_pos);
00399            }
00400        }
00401        if ( match_pos == xlen - slen ) {
00402            return(CORD_NOT_FOUND);
00403        }
00404        x_buf <<= 8;
00405         x_buf |= (unsigned char)CORD_pos_fetch(xpos);
00406         CORD_next(xpos);
00407     }
00408 }
00409 
00410 void CORD_ec_flush_buf(CORD_ec x)
00411 {
00412     register size_t len = x[0].ec_bufptr - x[0].ec_buf;
00413     char * s;
00414 
00415     if (len == 0) return;
00416     s = GC_MALLOC_ATOMIC(len+1);
00417     memcpy(s, x[0].ec_buf, len);
00418     s[len] = '\0';
00419     x[0].ec_cord = CORD_cat_char_star(x[0].ec_cord, s, len);
00420     x[0].ec_bufptr = x[0].ec_buf;
00421 }
00422 
00423 void CORD_ec_append_cord(CORD_ec x, CORD s)
00424 {
00425     CORD_ec_flush_buf(x);
00426     x[0].ec_cord = CORD_cat(x[0].ec_cord, s);
00427 }
00428 
00429 /*ARGSUSED*/
00430 char CORD_nul_func(size_t i, void * client_data)
00431 {
00432     return((char)(unsigned long)client_data);
00433 }
00434 
00435 
00436 CORD CORD_chars(char c, size_t i)
00437 {
00438     return(CORD_from_fn(CORD_nul_func, (void *)(unsigned long)c, i));
00439 }
00440 
00441 CORD CORD_from_file_eager(FILE * f)
00442 {
00443     register int c;
00444     CORD_ec ecord;
00445     
00446     CORD_ec_init(ecord);
00447     for(;;) {
00448         c = getc(f);
00449         if (c == 0) {
00450           /* Append the right number of NULs     */
00451           /* Note that any string of NULs is rpresented in 4 words,   */
00452           /* independent of its length.                               */
00453             register size_t count = 1;
00454             
00455             CORD_ec_flush_buf(ecord);
00456             while ((c = getc(f)) == 0) count++;
00457             ecord[0].ec_cord = CORD_cat(ecord[0].ec_cord, CORD_nul(count));
00458         }
00459         if (c == EOF) break;
00460         CORD_ec_append(ecord, c);
00461     }
00462     (void) fclose(f);
00463     return(CORD_balance(CORD_ec_to_cord(ecord)));
00464 }
00465 
00466 /* The state maintained for a lazily read file consists primarily     */
00467 /* of a large direct-mapped cache of previously read values.          */
00468 /* We could rely more on stdio buffering.  That would have 2          */
00469 /* disadvantages:                                              */
00470 /*     1) Empirically, not all fseek implementations preserve the     */
00471 /*        buffer whenever they could.                                 */
00472 /*     2) It would fail if 2 different sections of a long cord        */
00473 /*        were being read alternately.                                */
00474 /* We do use the stdio buffer for read ahead.                         */
00475 /* To guarantee thread safety in the presence of atomic pointer              */
00476 /* writes, cache lines are always replaced, and never modified in     */
00477 /* place.                                                      */
00478 
00479 # define LOG_CACHE_SZ 14
00480 # define CACHE_SZ (1 << LOG_CACHE_SZ)
00481 # define LOG_LINE_SZ 9
00482 # define LINE_SZ (1 << LOG_LINE_SZ)
00483 
00484 typedef struct {
00485     size_t tag;
00486     char data[LINE_SZ];
00487        /* data[i%LINE_SZ] = ith char in file if tag = i/LINE_SZ       */
00488 } cache_line;
00489 
00490 typedef struct {
00491     FILE * lf_file;
00492     size_t lf_current;      /* Current file pointer value */
00493     cache_line * volatile lf_cache[CACHE_SZ/LINE_SZ];
00494 } lf_state;
00495 
00496 # define MOD_CACHE_SZ(n) ((n) & (CACHE_SZ - 1))
00497 # define DIV_CACHE_SZ(n) ((n) >> LOG_CACHE_SZ)
00498 # define MOD_LINE_SZ(n) ((n) & (LINE_SZ - 1))
00499 # define DIV_LINE_SZ(n) ((n) >> LOG_LINE_SZ)
00500 # define LINE_START(n) ((n) & ~(LINE_SZ - 1))
00501 
00502 typedef struct {
00503     lf_state * state;
00504     size_t file_pos; /* Position of needed character. */
00505     cache_line * new_cache;
00506 } refill_data;
00507 
00508 /* Executed with allocation lock. */
00509 static char refill_cache(client_data)
00510 refill_data * client_data;
00511 {
00512     register lf_state * state = client_data -> state;
00513     register size_t file_pos = client_data -> file_pos;
00514     FILE *f = state -> lf_file;
00515     size_t line_start = LINE_START(file_pos);
00516     size_t line_no = DIV_LINE_SZ(MOD_CACHE_SZ(file_pos));
00517     cache_line * new_cache = client_data -> new_cache;
00518     
00519     if (line_start != state -> lf_current
00520         && fseek(f, line_start, SEEK_SET) != 0) {
00521            ABORT("fseek failed");
00522     }
00523     if (fread(new_cache -> data, sizeof(char), LINE_SZ, f)
00524        <= file_pos - line_start) {
00525        ABORT("fread failed");
00526     }
00527     new_cache -> tag = DIV_LINE_SZ(file_pos);
00528     /* Store barrier goes here. */
00529     ATOMIC_WRITE(state -> lf_cache[line_no], new_cache);
00530     state -> lf_current = line_start + LINE_SZ;
00531     return(new_cache->data[MOD_LINE_SZ(file_pos)]);
00532 }
00533 
00534 char CORD_lf_func(size_t i, void * client_data)
00535 {
00536     register lf_state * state = (lf_state *)client_data;
00537     register cache_line * volatile * cl_addr =
00538               &(state -> lf_cache[DIV_LINE_SZ(MOD_CACHE_SZ(i))]);
00539     register cache_line * cl = (cache_line *)ATOMIC_READ(cl_addr);
00540     
00541     if (cl == 0 || cl -> tag != DIV_LINE_SZ(i)) {
00542        /* Cache miss */
00543        refill_data rd;
00544        
00545         rd.state = state;
00546         rd.file_pos =  i;
00547         rd.new_cache = GC_NEW_ATOMIC(cache_line);
00548         if (rd.new_cache == 0) OUT_OF_MEMORY;
00549         return((char)(GC_word)
00550                 GC_call_with_alloc_lock((GC_fn_type) refill_cache, &rd));
00551     }
00552     return(cl -> data[MOD_LINE_SZ(i)]);
00553 }    
00554 
00555 /*ARGSUSED*/
00556 void CORD_lf_close_proc(void * obj, void * client_data)  
00557 {
00558     if (fclose(((lf_state *)obj) -> lf_file) != 0) {
00559        ABORT("CORD_lf_close_proc: fclose failed");
00560     }
00561 }                    
00562 
00563 CORD CORD_from_file_lazy_inner(FILE * f, size_t len)
00564 {
00565     register lf_state * state = GC_NEW(lf_state);
00566     register int i;
00567     
00568     if (state == 0) OUT_OF_MEMORY;
00569     if (len != 0) {
00570        /* Dummy read to force buffer allocation.        */
00571        /* This greatly increases the probability */
00572        /* of avoiding deadlock if buffer allocation     */
00573        /* is redirected to GC_malloc and the            */
00574        /* world is multithreaded.                */
00575        char buf[1];
00576 
00577        (void) fread(buf, 1, 1, f); 
00578        rewind(f);
00579     }
00580     state -> lf_file = f;
00581     for (i = 0; i < CACHE_SZ/LINE_SZ; i++) {
00582         state -> lf_cache[i] = 0;
00583     }
00584     state -> lf_current = 0;
00585     GC_REGISTER_FINALIZER(state, CORD_lf_close_proc, 0, 0, 0);
00586     return(CORD_from_fn(CORD_lf_func, state, len));
00587 }
00588 
00589 CORD CORD_from_file_lazy(FILE * f)
00590 {
00591     register long len;
00592     
00593     if (fseek(f, 0l, SEEK_END) != 0) {
00594         ABORT("Bad fd argument - fseek failed");
00595     }
00596     if ((len = ftell(f)) < 0) {
00597         ABORT("Bad fd argument - ftell failed");
00598     }
00599     rewind(f);
00600     return(CORD_from_file_lazy_inner(f, (size_t)len));
00601 }
00602 
00603 # define LAZY_THRESHOLD (128*1024 + 1)
00604 
00605 CORD CORD_from_file(FILE * f)
00606 {
00607     register long len;
00608     
00609     if (fseek(f, 0l, SEEK_END) != 0) {
00610         ABORT("Bad fd argument - fseek failed");
00611     }
00612     if ((len = ftell(f)) < 0) {
00613         ABORT("Bad fd argument - ftell failed");
00614     }
00615     rewind(f);
00616     if (len < LAZY_THRESHOLD) {
00617         return(CORD_from_file_eager(f));
00618     } else {
00619         return(CORD_from_file_lazy_inner(f, (size_t)len));
00620     }
00621 }