Back to index

tetex-bin  3.0
search.c
Go to the documentation of this file.
00001 /* search.c -- searching large bodies of text.
00002    $Id: search.c,v 1.3 2004/04/11 17:56:46 karl Exp $
00003 
00004    Copyright (C) 1993, 1997, 1998, 2002, 2004 Free Software Foundation, Inc.
00005 
00006    This program is free software; you can redistribute it and/or modify
00007    it under the terms of the GNU General Public License as published by
00008    the Free Software Foundation; either version 2, or (at your option)
00009    any later version.
00010 
00011    This program is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014    GNU General Public License for more details.
00015 
00016    You should have received a copy of the GNU General Public License
00017    along with this program; if not, write to the Free Software
00018    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00019 
00020    Written by Brian Fox (bfox@ai.mit.edu). */
00021 
00022 #include "info.h"
00023 
00024 #include "search.h"
00025 #include "nodes.h"
00026 
00027 /* The search functions take two arguments:
00028 
00029      1) a string to search for, and
00030 
00031      2) a pointer to a SEARCH_BINDING which contains the buffer, start,
00032         and end of the search.
00033 
00034    They return a long, which is the offset from the start of the buffer
00035    at which the match was found.  An offset of -1 indicates failure. */
00036 
00037 /* A function which makes a binding with buffer and bounds. */
00038 SEARCH_BINDING *
00039 make_binding (char *buffer, long int start, long int end)
00040 {
00041   SEARCH_BINDING *binding;
00042 
00043   binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING));
00044   binding->buffer = buffer;
00045   binding->start = start;
00046   binding->end = end;
00047   binding->flags = 0;
00048 
00049   return (binding);
00050 }
00051 
00052 /* Make a copy of BINDING without duplicating the data. */
00053 SEARCH_BINDING *
00054 copy_binding (SEARCH_BINDING *binding)
00055 {
00056   SEARCH_BINDING *copy;
00057 
00058   copy = make_binding (binding->buffer, binding->start, binding->end);
00059   copy->flags = binding->flags;
00060   return (copy);
00061 }
00062 
00063 
00064 /* **************************************************************** */
00065 /*                                                                  */
00066 /*                 The Actual Searching Functions                   */
00067 /*                                                                  */
00068 /* **************************************************************** */
00069 
00070 /* Search forwards or backwards for the text delimited by BINDING.
00071    The search is forwards if BINDING->start is greater than BINDING->end. */
00072 long
00073 search (char *string, SEARCH_BINDING *binding)
00074 {
00075   long result;
00076 
00077   /* If the search is backwards, then search backwards, otherwise forwards. */
00078   if (binding->start > binding->end)
00079     result = search_backward (string, binding);
00080   else
00081     result = search_forward (string, binding);
00082 
00083   return (result);
00084 }
00085 
00086 /* Search forwards for STRING through the text delimited in BINDING. */
00087 long
00088 search_forward (char *string, SEARCH_BINDING *binding)
00089 {
00090   register int c, i, len;
00091   register char *buff, *end;
00092   char *alternate = (char *)NULL;
00093 
00094   len = strlen (string);
00095 
00096   /* We match characters in the search buffer against STRING and ALTERNATE.
00097      ALTERNATE is a case reversed version of STRING; this is cheaper than
00098      case folding each character before comparison.   Alternate is only
00099      used if the case folding bit is turned on in the passed BINDING. */
00100 
00101   if (binding->flags & S_FoldCase)
00102     {
00103       alternate = xstrdup (string);
00104 
00105       for (i = 0; i < len; i++)
00106         {
00107           if (islower (alternate[i]))
00108             alternate[i] = toupper (alternate[i]);
00109           else if (isupper (alternate[i]))
00110             alternate[i] = tolower (alternate[i]);
00111         }
00112     }
00113 
00114   buff = binding->buffer + binding->start;
00115   end = binding->buffer + binding->end + 1;
00116 
00117   while (buff < (end - len))
00118     {
00119       for (i = 0; i < len; i++)
00120         {
00121           c = buff[i];
00122 
00123           if ((c != string[i]) && (!alternate || c != alternate[i]))
00124             break;
00125         }
00126 
00127       if (!string[i])
00128         {
00129           if (alternate)
00130             free (alternate);
00131           if (binding->flags & S_SkipDest)
00132             buff += len;
00133           return ((long) (buff - binding->buffer));
00134         }
00135 
00136       buff++;
00137     }
00138 
00139   if (alternate)
00140     free (alternate);
00141 
00142   return ((long) -1);
00143 }
00144 
00145 /* Search for STRING backwards through the text delimited in BINDING. */
00146 long
00147 search_backward (char *input_string, SEARCH_BINDING *binding)
00148 {
00149   register int c, i, len;
00150   register char *buff, *end;
00151   char *string;
00152   char *alternate = (char *)NULL;
00153 
00154   len = strlen (input_string);
00155 
00156   /* Reverse the characters in the search string. */
00157   string = (char *)xmalloc (1 + len);
00158   for (c = 0, i = len - 1; input_string[c]; c++, i--)
00159     string[i] = input_string[c];
00160 
00161   string[c] = '\0';
00162 
00163   /* We match characters in the search buffer against STRING and ALTERNATE.
00164      ALTERNATE is a case reversed version of STRING; this is cheaper than
00165      case folding each character before comparison.   ALTERNATE is only
00166      used if the case folding bit is turned on in the passed BINDING. */
00167 
00168   if (binding->flags & S_FoldCase)
00169     {
00170       alternate = xstrdup (string);
00171 
00172       for (i = 0; i < len; i++)
00173         {
00174           if (islower (alternate[i]))
00175             alternate[i] = toupper (alternate[i]);
00176           else if (isupper (alternate[i]))
00177             alternate[i] = tolower (alternate[i]);
00178         }
00179     }
00180 
00181   buff = binding->buffer + binding->start - 1;
00182   end = binding->buffer + binding->end;
00183 
00184   while (buff > (end + len))
00185     {
00186       for (i = 0; i < len; i++)
00187         {
00188           c = *(buff - i);
00189 
00190           if (c != string[i] && (!alternate || c != alternate[i]))
00191             break;
00192         }
00193 
00194       if (!string[i])
00195         {
00196           free (string);
00197           if (alternate)
00198             free (alternate);
00199 
00200           if (binding->flags & S_SkipDest)
00201             buff -= len;
00202           return ((long) (1 + (buff - binding->buffer)));
00203         }
00204 
00205       buff--;
00206     }
00207 
00208   free (string);
00209   if (alternate)
00210     free (alternate);
00211 
00212   return ((long) -1);
00213 }
00214 
00215 /* Find STRING in LINE, returning the offset of the end of the string.
00216    Return an offset of -1 if STRING does not appear in LINE.  The search
00217    is bound by the end of the line (i.e., either NEWLINE or 0). */
00218 int
00219 string_in_line (char *string, char *line)
00220 {
00221   register int end;
00222   SEARCH_BINDING binding;
00223 
00224   /* Find the end of the line. */
00225   for (end = 0; line[end] && line[end] != '\n'; end++);
00226 
00227   /* Search for STRING within these confines. */
00228   binding.buffer = line;
00229   binding.start = 0;
00230   binding.end = end;
00231   binding.flags = S_FoldCase | S_SkipDest;
00232 
00233   return (search_forward (string, &binding));
00234 }
00235 
00236 /* Return non-zero if STRING is the first text to appear at BINDING. */
00237 int
00238 looking_at (char *string, SEARCH_BINDING *binding)
00239 {
00240   long search_end;
00241 
00242   search_end = search (string, binding);
00243 
00244   /* If the string was not found, SEARCH_END is -1.  If the string was found,
00245      but not right away, SEARCH_END is != binding->start.  Otherwise, the
00246      string was found at binding->start. */
00247   return (search_end == binding->start);
00248 }
00249 
00250 /* **************************************************************** */
00251 /*                                                                  */
00252 /*                    Small String Searches                         */
00253 /*                                                                  */
00254 /* **************************************************************** */
00255 
00256 /* Function names that start with "skip" are passed a string, and return
00257    an offset from the start of that string.  Function names that start
00258    with "find" are passed a SEARCH_BINDING, and return an absolute position
00259    marker of the item being searched for.  "Find" functions return a value
00260    of -1 if the item being looked for couldn't be found. */
00261 
00262 /* Return the index of the first non-whitespace character in STRING. */
00263 int
00264 skip_whitespace (char *string)
00265 {
00266   register int i;
00267 
00268   for (i = 0; string && whitespace (string[i]); i++);
00269   return (i);
00270 }
00271 
00272 /* Return the index of the first non-whitespace or newline character in
00273    STRING. */
00274 int
00275 skip_whitespace_and_newlines (char *string)
00276 {
00277   register int i;
00278 
00279   for (i = 0; string && whitespace_or_newline (string[i]); i++);
00280   return (i);
00281 }
00282 
00283 /* Return the index of the first whitespace character in STRING. */
00284 int
00285 skip_non_whitespace (char *string)
00286 {
00287   register int i;
00288 
00289   for (i = 0; string && string[i] && !whitespace (string[i]); i++);
00290   return (i);
00291 }
00292 
00293 /* Return the index of the first non-node character in STRING.  Note that
00294    this function contains quite a bit of hair to ignore periods in some
00295    special cases.  This is because we here at GNU ship some info files which
00296    contain nodenames that contain periods.  No such nodename can start with
00297    a period, or continue with whitespace, newline, or ')' immediately following
00298    the period.  If second argument NEWLINES_OKAY is non-zero, newlines should
00299    be skipped while parsing out the nodename specification. */
00300 int
00301 skip_node_characters (char *string, int newlines_okay)
00302 {
00303   register int c, i = 0;
00304   int paren_seen = 0;
00305   int paren = 0;
00306 
00307   /* Handle special case.  This is when another function has parsed out the
00308      filename component of the node name, and we just want to parse out the
00309      nodename proper.  In that case, a period at the start of the nodename
00310      indicates an empty nodename. */
00311   if (string && *string == '.')
00312     return (0);
00313 
00314   if (string && *string == '(')
00315     {
00316       paren++;
00317       paren_seen++;
00318       i++;
00319     }
00320 
00321   for (; string && (c = string[i]); i++)
00322     {
00323       if (paren)
00324         {
00325           if (c == '(')
00326             paren++;
00327           else if (c == ')')
00328             paren--;
00329 
00330           continue;
00331         }
00332       
00333       /* If the character following the close paren is a space or period,
00334          then this node name has no more characters associated with it. */
00335       if (c == '\t' ||
00336           c == ','  ||
00337           c == INFO_TAGSEP ||
00338           ((!newlines_okay) && (c == '\n')) ||
00339           ((paren_seen && string[i - 1] == ')') &&
00340            (c == ' ' || c == '.')) ||
00341           (c == '.' &&
00342            (
00343 #if 0
00344 /* This test causes a node name ending in a period, like `This.', not to
00345    be found.  The trailing . is stripped.  This occurs in the jargon
00346    file (`I see no X here.' is a node name).  */
00347            (!string[i + 1]) ||
00348 #endif
00349             (whitespace_or_newline (string[i + 1])) ||
00350             (string[i + 1] == ')'))))
00351         break;
00352     }
00353   return (i);
00354 }
00355 
00356 
00357 /* **************************************************************** */
00358 /*                                                                  */
00359 /*                   Searching FILE_BUFFER's                        */
00360 /*                                                                  */
00361 /* **************************************************************** */
00362 
00363 /* Return the absolute position of the first occurence of a node separator in
00364    BINDING-buffer.  The search starts at BINDING->start.  Return -1 if no node
00365    separator was found. */
00366 long
00367 find_node_separator (SEARCH_BINDING *binding)
00368 {
00369   register long i;
00370   char *body;
00371 
00372   body = binding->buffer;
00373 
00374   /* A node is started by [^L]^_[^L]\n.  That is to say, the C-l's are
00375      optional, but the DELETE and NEWLINE are not.  This separator holds
00376      true for all separated elements in an Info file, including the tags
00377      table (if present) and the indirect tags table (if present). */
00378   for (i = binding->start; i < binding->end - 1; i++)
00379     if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) &&
00380          (body[i + 2] == '\n' ||
00381           (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) ||
00382         ((body[i] == INFO_COOKIE) &&
00383          (body[i + 1] == '\n' ||
00384           (body[i + 1] == INFO_FF && body[i + 2] == '\n'))))
00385       return (i);
00386   return (-1);
00387 }
00388 
00389 /* Return the length of the node separator characters that BODY is
00390    currently pointing at. */
00391 int
00392 skip_node_separator (char *body)
00393 {
00394   register int i;
00395 
00396   i = 0;
00397 
00398   if (body[i] == INFO_FF)
00399     i++;
00400 
00401   if (body[i++] != INFO_COOKIE)
00402     return (0);
00403 
00404   if (body[i] == INFO_FF)
00405     i++;
00406 
00407   if (body[i++] != '\n')
00408     return (0);
00409 
00410   return (i);
00411 }
00412 
00413 /* Return the number of characters from STRING to the start of
00414    the next line. */
00415 int
00416 skip_line (char *string)
00417 {
00418   register int i;
00419 
00420   for (i = 0; string && string[i] && string[i] != '\n'; i++);
00421 
00422   if (string[i] == '\n')
00423     i++;
00424 
00425   return (i);
00426 }
00427 
00428 /* Return the absolute position of the beginning of a tags table in this
00429    binding starting the search at binding->start. */
00430 long
00431 find_tags_table (SEARCH_BINDING *binding)
00432 {
00433   SEARCH_BINDING tmp_search;
00434   long position;
00435 
00436   tmp_search.buffer = binding->buffer;
00437   tmp_search.start = binding->start;
00438   tmp_search.end = binding->end;
00439   tmp_search.flags = S_FoldCase;
00440 
00441   while ((position = find_node_separator (&tmp_search)) != -1 )
00442     {
00443       tmp_search.start = position;
00444       tmp_search.start += skip_node_separator (tmp_search.buffer
00445           + tmp_search.start);
00446 
00447       if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search))
00448         return (position);
00449     }
00450   return (-1);
00451 }
00452 
00453 /* Return the absolute position of the node named NODENAME in BINDING.
00454    This is a brute force search, and we wish to avoid it when possible.
00455    This function is called when a tag (indirect or otherwise) doesn't
00456    really point to the right node.  It returns the absolute position of
00457    the separator preceding the node. */
00458 long
00459 find_node_in_binding (char *nodename, SEARCH_BINDING *binding)
00460 {
00461   long position;
00462   int offset, namelen;
00463   SEARCH_BINDING tmp_search;
00464 
00465   namelen = strlen (nodename);
00466 
00467   tmp_search.buffer = binding->buffer;
00468   tmp_search.start = binding->start;
00469   tmp_search.end = binding->end;
00470   tmp_search.flags = 0;
00471 
00472   while ((position = find_node_separator (&tmp_search)) != -1)
00473     {
00474       tmp_search.start = position;
00475       tmp_search.start += skip_node_separator
00476         (tmp_search.buffer + tmp_search.start);
00477 
00478       offset = string_in_line
00479         (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start);
00480 
00481       if (offset == -1)
00482         continue;
00483 
00484       tmp_search.start += offset;
00485       tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start);
00486       offset = skip_node_characters
00487         (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES);
00488 
00489       /* Notice that this is an exact match.  You cannot grovel through
00490          the buffer with this function looking for random nodes. */
00491        if ((offset == namelen) &&
00492            (tmp_search.buffer[tmp_search.start] == nodename[0]) &&
00493            (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0))
00494          return (position);
00495     }
00496   return (-1);
00497 }