Back to index

lightning-sunbird  0.9+nobinonly
cord.h
Go to the documentation of this file.
00001 /* 
00002  * Copyright (c) 1993-1994 by Xerox Corporation.  All rights reserved.
00003  *
00004  * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
00005  * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
00006  *
00007  * Permission is hereby granted to use or copy this program
00008  * for any purpose,  provided the above notices are retained on all copies.
00009  * Permission to modify the code and to distribute modified code is granted,
00010  * provided the above notices are retained, and a notice that the code was
00011  * modified is included with the above copyright notice.
00012  *
00013  * Author: Hans-J. Boehm (boehm@parc.xerox.com)
00014  */
00015 /* Boehm, October 5, 1995 4:20 pm PDT */
00016  
00017 /*
00018  * Cords are immutable character strings.  A number of operations
00019  * on long cords are much more efficient than their string.h counterparts.
00020  * In particular, concatenation takes constant time independent of the length
00021  * of the arguments.  (Cords are represented as trees, with internal
00022  * nodes representing concatenation and leaves consisting of either C
00023  * strings or a functional description of the string.)
00024  *
00025  * The following are reasonable applications of cords.  They would perform
00026  * unacceptably if C strings were used:
00027  * - A compiler that produces assembly language output by repeatedly
00028  *   concatenating instructions onto a cord representing the output file.
00029  * - A text editor that converts the input file to a cord, and then
00030  *   performs editing operations by producing a new cord representing
00031  *   the file after each character change (and keeping the old ones in an
00032  *   edit history)
00033  *
00034  * For optimal performance, cords should be built by
00035  * concatenating short sections.
00036  * This interface is designed for maximum compatibility with C strings.
00037  * ASCII NUL characters may be embedded in cords using CORD_from_fn.
00038  * This is handled correctly, but CORD_to_char_star will produce a string
00039  * with embedded NULs when given such a cord. 
00040  *
00041  * This interface is fairly big, largely for performance reasons.
00042  * The most basic constants and functions:
00043  *
00044  * CORD - the type of a cord;
00045  * CORD_EMPTY - empty cord;
00046  * CORD_len(cord) - length of a cord;
00047  * CORD_cat(cord1,cord2) - concatenation of two cords;
00048  * CORD_substr(cord, start, len) - substring (or subcord);
00049  * CORD_pos i;  CORD_FOR(i, cord) {  ... CORD_pos_fetch(i) ... } -
00050  *    examine each character in a cord.  CORD_pos_fetch(i) is the char.
00051  * CORD_fetch(cord, i) - Retrieve i'th character (slowly).
00052  * CORD_cmp(cord1, cord2) - compare two cords.
00053  * CORD_from_file(FILE * f) - turn a read-only file into a cord.
00054  * CORD_to_char_star(cord) - convert to C string.
00055  *   (Non-NULL C constant strings are cords.)
00056  * CORD_printf (etc.) - cord version of printf. Use %r for cords.
00057  */
00058 # ifndef CORD_H
00059 
00060 # define CORD_H
00061 # include <stddef.h>
00062 # include <stdio.h>
00063 /* Cords have type const char *.  This is cheating quite a bit, and not      */
00064 /* 100% portable.  But it means that nonempty character string        */
00065 /* constants may be used as cords directly, provided the string is    */
00066 /* never modified in place.  The empty cord is represented by, and    */
00067 /* can be written as, 0.                                       */
00068 
00069 typedef const char * CORD;
00070 
00071 /* An empty cord is always represented as nil    */
00072 # define CORD_EMPTY 0
00073 
00074 /* Is a nonempty cord represented as a C string?  Tests whether the first char is non-NUL; dereferences s, so s must not be CORD_EMPTY. */
00075 #define CORD_IS_STRING(s) (*(s) != '\0')
00076 
00077 /* Concatenate two cords.  If the arguments are C strings, they may   */
00078 /* not be subsequently altered.                                       */
00079 CORD CORD_cat(CORD x, CORD y);
00080 
00081 /* Concatenate a cord and a C string with known length.  Except for the      */
00082 /* empty string case, this is a special case of CORD_cat.  Since the  */
00083 /* length is known, it can be faster.                                 */
00084 /* The string y is shared with the resulting CORD.  Hence it should   */
00085 /* not be altered by the caller.                               */
00086 CORD CORD_cat_char_star(CORD x, const char * y, size_t leny);
00087 
00088 /* Compute the length of a cord */
00089 size_t CORD_len(CORD x);
00090 
00091 /* Cords may be represented by functions defining the ith character */
00092 typedef char (* CORD_fn)(size_t i, void * client_data);
00093 
00094 /* Turn a functional description into a cord.    */
00095 CORD CORD_from_fn(CORD_fn fn, void * client_data, size_t len);
00096 
00097 /* Return the substring (subcord really) of x with length at most n,  */
00098 /* starting at position i.  (The initial character has position 0.)   */
00099 CORD CORD_substr(CORD x, size_t i, size_t n);
00100 
00101 /* Return the argument, but rebalanced to allow more efficient        */
00102 /* character retrieval, substring operations, and comparisons.        */
00103 /* This is useful only for cords that were built using repeated       */
00104 /* concatenation.  Guarantees log time access to the result, unless   */
00105 /* x was obtained through a large number of repeated substring ops    */
00106 /* or the embedded functional descriptions take longer to evaluate.   */
00107 /* May reallocate significant parts of the cord.  The argument is not */
00108 /* modified; only the result is balanced.                      */
00109 CORD CORD_balance(CORD x);
00110 
00111 /* The following traverse a cord by applying a function to each       */
00112 /* character.  This is occasionally appropriate, especially where     */
00113 /* speed is crucial.  But, since C doesn't have nested functions,     */
00114 /* clients of this sort of traversal are clumsy to write.  Consider   */
00115 /* the functions that operate on cord positions instead.              */
00116 
00117 /* Function to iteratively apply to individual characters in cord.    */
00118 typedef int (* CORD_iter_fn)(char c, void * client_data);
00119 
00120 /* Function to apply to substrings of a cord.  Each substring is a    */
00121 /* C character string, not a general cord.                            */
00122 typedef int (* CORD_batched_iter_fn)(const char * s, void * client_data);
00123 # define CORD_NO_FN ((CORD_batched_iter_fn)0)
00124 
00125 /* Apply f1 to each character in the cord, in ascending order,        */
00126 /* starting at position i. If                                         */
00127 /* f2 is not CORD_NO_FN, then multiple calls to f1 may be replaced by */
00128 /* a single call to f2.  The parameter f2 is provided only to allow   */
00129 /* some optimization by the client.  This terminates when the right   */
00130 /* end of this string is reached, or when f1 or f2 return != 0.  In the      */
00131 /* latter case CORD_iter5 returns != 0.  Otherwise it returns 0.             */
00132 /* The specified value of i must be < CORD_len(x).                    */
00133 int CORD_iter5(CORD x, size_t i, CORD_iter_fn f1,
00134               CORD_batched_iter_fn f2, void * client_data);
00135 
00136 /* A simpler version of CORD_iter5 that starts at position 0 and passes CORD_NO_FN for f2.  Declared as a function, but the macro below shadows it and expands to a CORD_iter5 call. */
00137 int CORD_iter(CORD x, CORD_iter_fn f1, void * client_data);
00138 # define CORD_iter(x, f1, cd) CORD_iter5(x, 0, f1, CORD_NO_FN, cd)
00139 
00140 /* Similar to CORD_iter5, but end-to-beginning.  No provisions for    */
00141 /* CORD_batched_iter_fn.                                       */
00142 int CORD_riter4(CORD x, size_t i, CORD_iter_fn f1, void * client_data);
00143 
00144 /* A simpler version that starts at the end:     */
00145 int CORD_riter(CORD x, CORD_iter_fn f1, void * client_data);
00146 
00147 /* Functions that operate on cord positions.  The easy way to traverse       */
00148 /* cords.  A cord position is logically a pair consisting of a cord   */
00149 /* and an index into that cord.  But it is much faster to retrieve a  */
00150 /* character based on a position than on an index.  Unfortunately,    */
00151 /* positions are big (order of a few 100 bytes), so allocate them with       */
00152 /* caution.                                                    */
00153 /* Things in cord_pos.h should be treated as opaque, except as        */
00154 /* described below.  Also note that                                   */
00155 /* CORD_pos_fetch, CORD_next and CORD_prev have both macro and function      */
00156 /* definitions.  The former may evaluate their argument more than once. */
00157 # include "private/cord_pos.h"
00158 
00159 /*
00160        Visible definitions from above:
00161        
00162        typedef <OPAQUE but fairly big> CORD_pos[1];
00163        
00164        * Extract the cord from a position:
00165        CORD CORD_pos_to_cord(CORD_pos p);
00166        
00167        * Extract the current index from a position:
00168        size_t CORD_pos_to_index(CORD_pos p);
00169        
00170        * Fetch the character located at the given position:
00171        char CORD_pos_fetch(CORD_pos p);
00172        
00173        * Initialize the position to refer to the given cord and index.
00174        * Note that this is the most expensive function on positions:
00175        void CORD_set_pos(CORD_pos p, CORD x, size_t i);
00176        
00177        * Advance the position to the next character.
00178        * P must be initialized and valid.
00179        * Invalidates p if past end:
00180        void CORD_next(CORD_pos p);
00181        
00182        * Move the position to the preceding character.
00183        * P must be initialized and valid.
00184        * Invalidates p if past beginning:
00185        void CORD_prev(CORD_pos p);
00186        
00187        * Is the position valid, i.e. inside the cord?
00188        int CORD_pos_valid(CORD_pos p);
00189 */
00190 # define CORD_FOR(pos, cord) \
00191     for (CORD_set_pos(pos, cord, 0); CORD_pos_valid(pos); CORD_next(pos))
00192 
00193                      
00194 /* An out of memory handler to call.  May be supplied by client.      */
00195 /* Must not return.                                            */
00196 extern void (* CORD_oom_fn)(void);
00197 
00198 /* Dump the representation of x to stdout in an implementation defined       */
00199 /* manner.  Intended for debugging only.                       */
00200 void CORD_dump(CORD x);
00201 
00202 /* The following could easily be implemented by the client.  They are */
00203 /* provided in cordxtra.c for convenience.                            */
00204 
00205 /* Concatenate a character to the end of a cord. */
00206 CORD CORD_cat_char(CORD x, char c);
00207 
00208 /* Concatenate n cords.     */
00209 CORD CORD_catn(int n, /* CORD */ ...);
00210 
00211 /* Return the character in CORD_substr(x, i, 1)         */
00212 char CORD_fetch(CORD x, size_t i);
00213 
00214 /* Return < 0, 0, or > 0, depending on whether x < y, x = y, x > y    */
00215 int CORD_cmp(CORD x, CORD y);
00216 
00217 /* A generalization that takes both starting positions for the               */
00218 /* comparison, and a limit on the number of characters to be compared.       */
00219 int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len);
00220 
00221 /* Find the first occurrence of s in x at position start or later.    */
00222 /* Return the position of the first character of s in x, or           */
00223 /* CORD_NOT_FOUND if there is none.                                   */
00224 size_t CORD_str(CORD x, size_t start, CORD s);
00225 
00226 /* Return a cord consisting of i copies of (possibly NUL) c.  Dangerous      */
00227 /* in conjunction with CORD_to_char_star.                      */
00228 /* The resulting representation takes constant space, independent of i.      */
00229 CORD CORD_chars(char c, size_t i);
00230 # define CORD_nul(i) CORD_chars('\0', (i))
00231 
00232 /* Turn a file into cord.  The file must be seekable.  Its contents   */
00233 /* must remain constant.  The file may be accessed as an immediate    */
00234 /* result of this call and/or as a result of subsequent accesses to   */
00235 /* the cord.  Short files are likely to be immediately read, but      */
00236 /* long files are likely to be read on demand, possibly relying on    */
00237 /* stdio for buffering.                                               */
00238 /* We must have exclusive access to the descriptor f, i.e. we may     */
00239 /* read it at any time, and expect the file pointer to be             */
00240 /* where we left it.  Normally this should be invoked as              */
00241 /* CORD_from_file(fopen(...))                                         */
00242 /* CORD_from_file arranges to close the file descriptor when it is no */
00243 /* longer needed (e.g. when the result becomes inaccessible).         */ 
00244 /* The file f must be such that ftell reflects the actual character   */
00245 /* position in the file, i.e. the number of characters that can be    */
00246 /* or were read with fread.  On UNIX systems this is always true.  On */
00247 /* MS Windows systems, f must be opened in binary mode.               */
00248 CORD CORD_from_file(FILE * f);
00249 
00250 /* Equivalent to the above, except that the entire file will be read  */
00251 /* and the file pointer will be closed immediately.                   */
00252 /* The binary mode restriction from above does not apply.             */
00253 CORD CORD_from_file_eager(FILE * f);
00254 
00255 /* Equivalent to the above, except that the file will be read on demand.*/
00256 /* The binary mode restriction applies.                               */
00257 CORD CORD_from_file_lazy(FILE * f);
00258 
00259 /* Turn a cord into a C string.    The result shares no structure with       */
00260 /* x, and is thus modifiable.                                         */
00261 char * CORD_to_char_star(CORD x);
00262 
00263 /* Turn a C string into a CORD.  The C string is copied, and so may   */
00264 /* subsequently be modified.                                          */
00265 CORD CORD_from_char_star(const char *s);
00266 
00267 /* Identical to CORD_to_char_star, but the result may share structure with */
00268 /* the argument and is thus not modifiable.                           */
00269 const char * CORD_to_const_char_star(CORD x); 
00270 
00271 /* Write a cord to a file, starting at the current position.  No      */
00272 /* trailing NULs or newlines are added.                        */
00273 /* Returns EOF if a write error occurs, 1 otherwise.                  */
00274 int CORD_put(CORD x, FILE * f);
00275 
00276 /* "Not found" result for CORD_str and the following two functions.   */
00277 # define CORD_NOT_FOUND ((size_t)(-1))
00278 
00279 /* A vague analog of strchr.  Returns the position (an integer, not   */
00280 /* a pointer) of the first occurrence of (char) c inside x at position       */
00281 /* i or later. The value i must be < CORD_len(x).                     */
00282 size_t CORD_chr(CORD x, size_t i, int c);
00283 
00284 /* A vague analog of strrchr.  Returns index of the last occurrence   */
00285 /* of (char) c inside x at position i or earlier. The value i         */
00286 /* must be < CORD_len(x).                                      */
00287 size_t CORD_rchr(CORD x, size_t i, int c);
00288 
00289 
00290 /* The following are also not primitive, but are implemented in       */
00291 /* cordprnt.c.  They provide functionality similar to the ANSI C      */
00292 /* functions with corresponding names, but with the following         */
00293 /* additions and changes:                                      */
00294 /* 1. A %r conversion specification specifies a CORD argument.  Field */
00295 /*    width, precision, etc. have the same semantics as for %s.              */
00296 /*    (Note that %c,%C, and %S were already taken.)                   */
00297 /* 2. The format string is represented as a CORD.                      */
00298 /* 3. CORD_sprintf and CORD_vsprintf assign the result through the 1st argument.  Unlike their ANSI C versions, there is no need to guess */
00299 /*    the correct buffer size.                                        */
00300 /* 4. Most of the conversions are implemented through the native             */
00301 /*    vsprintf.  Hence they are usually no faster, and                */
00302 /*    idiosyncrasies of the native printf are preserved.  However,    */
00303 /*    CORD arguments to CORD_sprintf and CORD_vsprintf are NOT copied;       */
00304 /*    the result shares the original structure.  This may make them   */
00305 /*    very efficient in some unusual applications.                    */
00306 /*    The format string is copied.                             */
00307 /* All functions return the number of characters generated or -1 on   */
00308 /* error.  This complies with the ANSI standard, but is inconsistent  */
00309 /* with some older implementations of sprintf.                        */
00310 
00311 /* The implementation of these is probably less portable than the rest       */
00312 /* of this package.                                            */
00313 
00314 #ifndef CORD_NO_IO
00315 
00316 #include <stdarg.h>
00317 
00318 int CORD_sprintf(CORD * out, CORD format, ...);
00319 int CORD_vsprintf(CORD * out, CORD format, va_list args);
00320 int CORD_fprintf(FILE * f, CORD format, ...);
00321 int CORD_vfprintf(FILE * f, CORD format, va_list args);
00322 int CORD_printf(CORD format, ...);
00323 int CORD_vprintf(CORD format, va_list args);
00324 
00325 #endif /* CORD_NO_IO */
00326 
00327 # endif /* CORD_H */