Back to index

cell-binutils  2.17cvs20070401
app.c
Go to the documentation of this file.
00001 /* This is the Assembler Pre-Processor
00002    Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
00003    1999, 2000, 2001, 2002, 2003, 2006, 2007
00004    Free Software Foundation, Inc.
00005 
00006    This file is part of GAS, the GNU Assembler.
00007 
00008    GAS is free software; you can redistribute it and/or modify
00009    it under the terms of the GNU General Public License as published by
00010    the Free Software Foundation; either version 2, or (at your option)
00011    any later version.
00012 
00013    GAS is distributed in the hope that it will be useful,
00014    but WITHOUT ANY WARRANTY; without even the implied warranty of
00015    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016    GNU General Public License for more details.
00017 
00018    You should have received a copy of the GNU General Public License
00019    along with GAS; see the file COPYING.  If not, write to the Free
00020    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
00021    02110-1301, USA.  */
00022 
00023 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
00024 /* App, the assembler pre-processor.  This pre-processor strips out
00025    excess spaces, turns single-quoted characters into a decimal
00026    constant, and turns the # in # <number> <filename> <garbage> into a
00027    .linefile.  This needs better error-handling.  */
00028 
00029 #include "as.h"
00030 
00031 #if (__STDC__ != 1)
00032 #ifndef const
00033 #define const  /* empty */
00034 #endif
00035 #endif
00036 
00037 #ifdef TC_M68K
00038 /* Whether we are scrubbing in m68k MRI mode.  This is different from
00039    flag_m68k_mri, because the two flags will be affected by the .mri
00040    pseudo-op at different times.  */
00041 static int scrub_m68k_mri;
00042 
00043 /* The pseudo-op which switches in and out of MRI mode.  See the
00044    comment in do_scrub_chars.  */
00045 static const char mri_pseudo[] = ".mri 0";
00046 #else
00047 #define scrub_m68k_mri 0
00048 #endif
00049 
00050 #if defined TC_ARM && defined OBJ_ELF
00051 /* The pseudo-op for which we need to special-case `@' characters.
00052    See the comment in do_scrub_chars.  */
00053 static const char   symver_pseudo[] = ".symver";
00054 static const char * symver_state;
00055 #endif
00056 
/* Character classification table, indexed by character code.  Each
   entry holds one of the LEX_IS_* classes defined below, or 0 when no
   class has been assigned.  The table is filled in by do_scrub_begin.  */
00057 static char lex[256];
/* Characters that do_scrub_begin marks as LEX_IS_SYMBOL_COMPONENT.  */
00058 static const char symbol_chars[] =
00059 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
00060 
/* Character classes stored in lex[].  The value 7 is not assigned to
   any class here.  */
00061 #define LEX_IS_SYMBOL_COMPONENT           1
00062 #define LEX_IS_WHITESPACE          2
00063 #define LEX_IS_LINE_SEPARATOR             3
00064 #define LEX_IS_COMMENT_START              4
00065 #define LEX_IS_LINE_COMMENT_START  5
00066 #define       LEX_IS_TWOCHAR_COMMENT_1ST  6
00067 #define       LEX_IS_STRINGQUOTE          8
00068 #define       LEX_IS_COLON                9
00069 #define       LEX_IS_NEWLINE                     10
00070 #define       LEX_IS_ONECHAR_QUOTE        11
/* Class 12 exists only when building for TC_V850.  */
00071 #ifdef TC_V850
00072 #define LEX_IS_DOUBLEDASH_1ST             12
00073 #endif
/* TC_M32R turns on DOUBLEBAR_PARALLEL, which in turn provides class 13.  */
00074 #ifdef TC_M32R
00075 #define DOUBLEBAR_PARALLEL
00076 #endif
00077 #ifdef DOUBLEBAR_PARALLEL
00078 #define LEX_IS_DOUBLEBAR_1ST              13
00079 #endif
00080 #define LEX_IS_PARALLEL_SEPARATOR  14
/* Class tests.  Each macro indexes lex[] directly with C, so C must be
   a valid table index; callers in this file test for EOF before using
   them.  */
00081 #define IS_SYMBOL_COMPONENT(c)            (lex[c] == LEX_IS_SYMBOL_COMPONENT)
00082 #define IS_WHITESPACE(c)           (lex[c] == LEX_IS_WHITESPACE)
00083 #define IS_LINE_SEPARATOR(c)              (lex[c] == LEX_IS_LINE_SEPARATOR)
00084 #define IS_PARALLEL_SEPARATOR(c)   (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
00085 #define IS_COMMENT(c)                     (lex[c] == LEX_IS_COMMENT_START)
00086 #define IS_LINE_COMMENT(c)         (lex[c] == LEX_IS_LINE_COMMENT_START)
00087 #define       IS_NEWLINE(c)               (lex[c] == LEX_IS_NEWLINE)
00088 
00089 static int process_escape (int);
00090 
00091 /* FIXME-soon: The entire lexer/parser thingy should be
00092    built statically at compile time rather than dynamically
00093    each and every time the assembler is run.  xoxorich.  */
00094 
00095 void
00096 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
00097 {
00098   const char *p;
00099   int c;
00100 
00101   lex[' '] = LEX_IS_WHITESPACE;
00102   lex['\t'] = LEX_IS_WHITESPACE;
00103   lex['\r'] = LEX_IS_WHITESPACE;
00104   lex['\n'] = LEX_IS_NEWLINE;
00105   lex[':'] = LEX_IS_COLON;
00106 
00107 #ifdef TC_M68K
00108   scrub_m68k_mri = m68k_mri;
00109 
00110   if (! m68k_mri)
00111 #endif
00112     {
00113       lex['"'] = LEX_IS_STRINGQUOTE;
00114 
00115 #if ! defined (TC_HPPA) && ! defined (TC_I370)
00116       /* I370 uses single-quotes to delimit integer, float constants.  */
00117       lex['\''] = LEX_IS_ONECHAR_QUOTE;
00118 #endif
00119 
00120 #ifdef SINGLE_QUOTE_STRINGS
00121       lex['\''] = LEX_IS_STRINGQUOTE;
00122 #endif
00123     }
00124 
00125   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
00126      in state 5 of do_scrub_chars must be changed.  */
00127 
00128   /* Note that these override the previous defaults, e.g. if ';' is a
00129      comment char, then it isn't a line separator.  */
00130   for (p = symbol_chars; *p; ++p)
00131     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
00132 
00133   for (c = 128; c < 256; ++c)
00134     lex[c] = LEX_IS_SYMBOL_COMPONENT;
00135 
00136 #ifdef tc_symbol_chars
00137   /* This macro permits the processor to specify all characters which
00138      may appears in an operand.  This will prevent the scrubber from
00139      discarding meaningful whitespace in certain cases.  The i386
00140      backend uses this to support prefixes, which can confuse the
00141      scrubber as to whether it is parsing operands or opcodes.  */
00142   for (p = tc_symbol_chars; *p; ++p)
00143     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
00144 #endif
00145 
00146   /* The m68k backend wants to be able to change comment_chars.  */
00147 #ifndef tc_comment_chars
00148 #define tc_comment_chars comment_chars
00149 #endif
00150   for (p = tc_comment_chars; *p; p++)
00151     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
00152 
00153   for (p = line_comment_chars; *p; p++)
00154     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
00155 
00156   for (p = line_separator_chars; *p; p++)
00157     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
00158 
00159 #ifdef tc_parallel_separator_chars
00160   /* This macro permits the processor to specify all characters which
00161      separate parallel insns on the same line.  */
00162   for (p = tc_parallel_separator_chars; *p; p++)
00163     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
00164 #endif
00165 
00166   /* Only allow slash-star comments if slash is not in use.
00167      FIXME: This isn't right.  We should always permit them.  */
00168   if (lex['/'] == 0)
00169     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
00170 
00171 #ifdef TC_M68K
00172   if (m68k_mri)
00173     {
00174       lex['\''] = LEX_IS_STRINGQUOTE;
00175       lex[';'] = LEX_IS_COMMENT_START;
00176       lex['*'] = LEX_IS_LINE_COMMENT_START;
00177       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
00178         then it can't be used in an expression.  */
00179       lex['!'] = LEX_IS_LINE_COMMENT_START;
00180     }
00181 #endif
00182 
00183 #ifdef TC_V850
00184   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
00185 #endif
00186 #ifdef DOUBLEBAR_PARALLEL
00187   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
00188 #endif
00189 #ifdef TC_D30V
00190   /* Must do this is we want VLIW instruction with "->" or "<-".  */
00191   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
00192 #endif
00193 }
00194 
00195 /* Saved state of the scrubber.  These are file-scope so that
   do_scrub_chars can return at any point and resume on its next call;
   app_push and app_pop save and restore them.  */
/* Current state of the do_scrub_chars state machine.  */
00196 static int state;
/* State to return to once states -1, -2 or 5 have finished.  */
00197 static int old_state;
/* In state -1, the next character to copy to the output.  */
00198 static char *out_string;
/* Small buffer saved and restored together with out_string.
   NOTE(review): presumably the storage out_string points into; its
   use is outside the code visible here -- confirm.  */
00199 static char out_buf[20];
/* Incremented for each newline swallowed inside a multi-line comment
   or an escaped line break in a string.  NOTE(review): presumably the
   newlines are emitted later; that code is not visible here.  */
00200 static int add_newlines;
/* When non-NULL, input that do_scrub_chars consumes before asking
   its GET callback for more.  */
00201 static char *saved_input;
/* Number of bytes available at saved_input.  */
00202 static int saved_input_len;
/* Buffer handed to the GET callback to receive raw input.  */
00203 static char input_buffer[32 * 1024];
/* Position within mri_pseudo while a .mri pseudo-op is being
   recognized, or NULL when no match is in progress.  */
00204 static const char *mri_state;
/* Last character matched while recognizing .mri; compared with '1'
   to choose the mode passed to do_scrub_begin.  */
00205 static char mri_last_ch;
00206 
00207 /* Data structure for saving the state of app across #include's.  Note that
00208    app is called asynchronously to the parsing of the .include's, so our
00209    state at the time .include is interpreted is completely unrelated.
00210    That's why we have to save it all.  */
00211 
/* Each member mirrors the file-scope variable of the same name.
   app_push fills the structure in and app_pop copies it back.  */
00212 struct app_save
00213 {
00214   int          state;
00215   int          old_state;
00216   char *       out_string;
  /* The sizeof operand names the file-scope out_buf array, so this
     member always has the same size as that array.  */
00217   char         out_buf[sizeof (out_buf)];
00218   int          add_newlines;
  /* Heap copy of the pending input made by app_push and freed by
     app_pop, or NULL when there was no pending input.  */
00219   char *       saved_input;
  /* Length of the copy; meaningful only when saved_input is non-NULL.  */
00220   int          saved_input_len;
00221 #ifdef TC_M68K
00222   int          scrub_m68k_mri;
00223 #endif
00224   const char * mri_state;
00225   char         mri_last_ch;
00226 #if defined TC_ARM && defined OBJ_ELF
00227   const char * symver_state;
00228 #endif
00229 };
00230 
00231 char *
00232 app_push (void)
00233 {
00234   register struct app_save *saved;
00235 
00236   saved = (struct app_save *) xmalloc (sizeof (*saved));
00237   saved->state = state;
00238   saved->old_state = old_state;
00239   saved->out_string = out_string;
00240   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
00241   saved->add_newlines = add_newlines;
00242   if (saved_input == NULL)
00243     saved->saved_input = NULL;
00244   else
00245     {
00246       saved->saved_input = xmalloc (saved_input_len);
00247       memcpy (saved->saved_input, saved_input, saved_input_len);
00248       saved->saved_input_len = saved_input_len;
00249     }
00250 #ifdef TC_M68K
00251   saved->scrub_m68k_mri = scrub_m68k_mri;
00252 #endif
00253   saved->mri_state = mri_state;
00254   saved->mri_last_ch = mri_last_ch;
00255 #if defined TC_ARM && defined OBJ_ELF
00256   saved->symver_state = symver_state;
00257 #endif
00258 
00259   /* do_scrub_begin() is not useful, just wastes time.  */
00260 
00261   state = 0;
00262   saved_input = NULL;
00263 
00264   return (char *) saved;
00265 }
00266 
00267 void
00268 app_pop (char *arg)
00269 {
00270   register struct app_save *saved = (struct app_save *) arg;
00271 
00272   /* There is no do_scrub_end ().  */
00273   state = saved->state;
00274   old_state = saved->old_state;
00275   out_string = saved->out_string;
00276   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
00277   add_newlines = saved->add_newlines;
00278   if (saved->saved_input == NULL)
00279     saved_input = NULL;
00280   else
00281     {
00282       assert (saved->saved_input_len <= (int) (sizeof input_buffer));
00283       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
00284       saved_input = input_buffer;
00285       saved_input_len = saved->saved_input_len;
00286       free (saved->saved_input);
00287     }
00288 #ifdef TC_M68K
00289   scrub_m68k_mri = saved->scrub_m68k_mri;
00290 #endif
00291   mri_state = saved->mri_state;
00292   mri_last_ch = saved->mri_last_ch;
00293 #if defined TC_ARM && defined OBJ_ELF
00294   symver_state = saved->symver_state;
00295 #endif
00296 
00297   free (arg);
00298 }
00299 
/* Translate CH, the character following a backslash, into the
   character it stands for.  The letters b, f, n, r and t map to the
   corresponding control characters; every other value, including the
   two quote characters, is returned unchanged.

   @@ This assumes that \n &c are the same on host and target.  This is
   not necessarily true.  */

static int
process_escape (int ch)
{
  /* Parallel tables: escape_letters[i] maps to escape_values[i].  */
  static const char escape_letters[] = { 'b', 'f', 'n', 'r', 't' };
  static const char escape_values[] = { '\b', '\f', '\n', '\r', '\t' };
  unsigned int idx;

  for (idx = 0; idx < sizeof escape_letters; idx++)
    if (ch == escape_letters[idx])
      return escape_values[idx];

  return ch;
}
00326 
00327 /* This function is called to process input characters.  The GET
00328    parameter is used to retrieve more input characters.  GET should
00329    set its parameter to point to a buffer, and return the length of
00330    the buffer; it should return 0 at end of file.  The scrubbed output
00331    characters are put into the buffer starting at TOSTART; the TOSTART
00332    buffer is TOLEN bytes in length.  The function returns the number
00333    of scrubbed characters put into TOSTART.  This will be TOLEN unless
00334    end of file was seen.  This function is arranged as a state
00335    machine, and saves its state so that it may return at any point.
00336    This is the way the old code used to work.  */
00337 
00338 int
00339 do_scrub_chars (int (*get) (char *, int), char *tostart, int tolen)
00340 {
00341   char *to = tostart;
00342   char *toend = tostart + tolen;
00343   char *from;
00344   char *fromend;
00345   int fromlen;
00346   register int ch, ch2 = 0;
00347   /* Character that started the string we're working on.  */
00348   static char quotechar;
00349 
00350   /*State 0: beginning of normal line
00351          1: After first whitespace on line (flush more white)
00352          2: After first non-white (opcode) on line (keep 1white)
00353          3: after second white on line (into operands) (flush white)
00354          4: after putting out a .linefile, put out digits
00355          5: parsing a string, then go to old-state
00356          6: putting out \ escape in a "d string.
00357          7: no longer used
00358          8: no longer used
00359          9: After seeing symbol char in state 3 (keep 1white after symchar)
00360         10: After seeing whitespace in state 9 (keep white before symchar)
00361         11: After seeing a symbol character in state 0 (eg a label definition)
00362         -1: output string in out_string and go to the state in old_state
00363         -2: flush text until a '*' '/' is seen, then go to state old_state
00364 #ifdef TC_V850
00365         12: After seeing a dash, looking for a second dash as a start
00366             of comment.
00367 #endif
00368 #ifdef DOUBLEBAR_PARALLEL
00369         13: After seeing a vertical bar, looking for a second
00370             vertical bar as a parallel expression separator.
00371 #endif
00372 #ifdef TC_IA64
00373         14: After seeing a `(' at state 0, looking for a `)' as
00374             predicate.
00375         15: After seeing a `(' at state 1, looking for a `)' as
00376             predicate.
00377 #endif
00378 #ifdef TC_Z80
00379         16: After seeing an 'a' or an 'A' at the start of a symbol
00380         17: After seeing an 'f' or an 'F' in state 16
00381 #endif
00382          */
00383 
00384   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
00385      constructs like ``.loc 1 20''.  This was turning into ``.loc
00386      120''.  States 9 and 10 ensure that a space is never dropped in
00387      between characters which could appear in an identifier.  Ian
00388      Taylor, ian@cygnus.com.
00389 
00390      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
00391      correctly on the PA (and any other target where colons are optional).
00392      Jeff Law, law@cs.utah.edu.
00393 
00394      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
00395      get squashed into "cmp r1,r2||trap#1", with the all important space
00396      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
00397 
00398   /* This macro gets the next input character.  */
00399 
00400 #define GET()                                           \
00401   (from < fromend                                       \
00402    ? * (unsigned char *) (from++)                       \
00403    : (saved_input = NULL,                               \
00404       fromlen = (*get) (input_buffer, sizeof input_buffer),    \
00405       from = input_buffer,                              \
00406       fromend = from + fromlen,                                \
00407       (fromlen == 0                                     \
00408        ? EOF                                            \
00409        : * (unsigned char *) (from++))))
00410 
00411   /* This macro pushes a character back on the input stream.  */
00412 
00413 #define UNGET(uch) (*--from = (uch))
00414 
00415   /* This macro puts a character into the output buffer.  If this
00416      character fills the output buffer, this macro jumps to the label
00417      TOFULL.  We use this rather ugly approach because we need to
00418      handle two different termination conditions: EOF on the input
00419      stream, and a full output buffer.  It would be simpler if we
00420      always read in the entire input stream before processing it, but
00421      I don't want to make such a significant change to the assembler's
00422      memory usage.  */
00423 
00424 #define PUT(pch)                          \
00425   do                                      \
00426     {                                     \
00427       *to++ = (pch);                      \
00428       if (to >= toend)                           \
00429        goto tofull;                       \
00430     }                                     \
00431   while (0)
00432 
00433   if (saved_input != NULL)
00434     {
00435       from = saved_input;
00436       fromend = from + saved_input_len;
00437     }
00438   else
00439     {
00440       fromlen = (*get) (input_buffer, sizeof input_buffer);
00441       if (fromlen == 0)
00442        return 0;
00443       from = input_buffer;
00444       fromend = from + fromlen;
00445     }
00446 
00447   while (1)
00448     {
00449       /* The cases in this switch end with continue, in order to
00450         branch back to the top of this while loop and generate the
00451         next output character in the appropriate state.  */
00452       switch (state)
00453        {
00454        case -1:
00455          ch = *out_string++;
00456          if (*out_string == '\0')
00457            {
00458              state = old_state;
00459              old_state = 3;
00460            }
00461          PUT (ch);
00462          continue;
00463 
00464        case -2:
00465          for (;;)
00466            {
00467              do
00468               {
00469                 ch = GET ();
00470 
00471                 if (ch == EOF)
00472                   {
00473                     as_warn (_("end of file in comment"));
00474                     goto fromeof;
00475                   }
00476 
00477                 if (ch == '\n')
00478                   PUT ('\n');
00479               }
00480              while (ch != '*');
00481 
00482              while ((ch = GET ()) == '*')
00483               ;
00484 
00485              if (ch == EOF)
00486               {
00487                 as_warn (_("end of file in comment"));
00488                 goto fromeof;
00489               }
00490 
00491              if (ch == '/')
00492               break;
00493 
00494              UNGET (ch);
00495            }
00496 
00497          state = old_state;
00498          UNGET (' ');
00499          continue;
00500 
00501        case 4:
00502          ch = GET ();
00503          if (ch == EOF)
00504            goto fromeof;
00505          else if (ch >= '0' && ch <= '9')
00506            PUT (ch);
00507          else
00508            {
00509              while (ch != EOF && IS_WHITESPACE (ch))
00510               ch = GET ();
00511              if (ch == '"')
00512               {
00513                 quotechar = ch;
00514                 state = 5;
00515                 old_state = 3;
00516                 PUT (ch);
00517               }
00518              else
00519               {
00520                 while (ch != EOF && ch != '\n')
00521                   ch = GET ();
00522                 state = 0;
00523                 PUT (ch);
00524               }
00525            }
00526          continue;
00527 
00528        case 5:
00529          /* We are going to copy everything up to a quote character,
00530             with special handling for a backslash.  We try to
00531             optimize the copying in the simple case without using the
00532             GET and PUT macros.  */
00533          {
00534            char *s;
00535            int len;
00536 
00537            for (s = from; s < fromend; s++)
00538              {
00539               ch = *s;
00540               if (ch == '\\'
00541                   || ch == quotechar
00542                   || ch == '\n')
00543                 break;
00544              }
00545            len = s - from;
00546            if (len > toend - to)
00547              len = toend - to;
00548            if (len > 0)
00549              {
00550               memcpy (to, from, len);
00551               to += len;
00552               from += len;
00553              }
00554          }
00555 
00556          ch = GET ();
00557          if (ch == EOF)
00558            {
00559              as_warn (_("end of file in string; '%c' inserted"), quotechar);
00560              state = old_state;
00561              UNGET ('\n');
00562              PUT (quotechar);
00563            }
00564          else if (ch == quotechar)
00565            {
00566              state = old_state;
00567              PUT (ch);
00568            }
00569 #ifndef NO_STRING_ESCAPES
00570          else if (ch == '\\')
00571            {
00572              state = 6;
00573              PUT (ch);
00574            }
00575 #endif
00576          else if (scrub_m68k_mri && ch == '\n')
00577            {
00578              /* Just quietly terminate the string.  This permits lines like
00579                  bne label  loop if we haven't reach end yet.  */
00580              state = old_state;
00581              UNGET (ch);
00582              PUT ('\'');
00583            }
00584          else
00585            {
00586              PUT (ch);
00587            }
00588          continue;
00589 
00590        case 6:
00591          state = 5;
00592          ch = GET ();
00593          switch (ch)
00594            {
00595              /* Handle strings broken across lines, by turning '\n' into
00596                '\\' and 'n'.  */
00597            case '\n':
00598              UNGET ('n');
00599              add_newlines++;
00600              PUT ('\\');
00601              continue;
00602 
00603            case EOF:
00604              as_warn (_("end of file in string; '%c' inserted"), quotechar);
00605              PUT (quotechar);
00606              continue;
00607 
00608            case '"':
00609            case '\\':
00610            case 'b':
00611            case 'f':
00612            case 'n':
00613            case 'r':
00614            case 't':
00615            case 'v':
00616            case 'x':
00617            case 'X':
00618            case '0':
00619            case '1':
00620            case '2':
00621            case '3':
00622            case '4':
00623            case '5':
00624            case '6':
00625            case '7':
00626              break;
00627 
00628            default:
00629 #ifdef ONLY_STANDARD_ESCAPES
00630              as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
00631 #endif
00632              break;
00633            }
00634          PUT (ch);
00635          continue;
00636 
00637 #ifdef DOUBLEBAR_PARALLEL
00638        case 13:
00639          ch = GET ();
00640          if (ch != '|')
00641            abort ();
00642 
00643          /* Reset back to state 1 and pretend that we are parsing a
00644             line from just after the first white space.  */
00645          state = 1;
00646          PUT ('|');
00647          continue;
00648 #endif
00649 #ifdef TC_Z80
00650        case 16:
00651          /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
00652          ch = GET ();
00653          if (ch == 'f' || ch == 'F') 
00654            {
00655              state = 17;
00656              PUT (ch);
00657            }
00658          else
00659            {
00660              state = 9;
00661              break;
00662            }
00663        case 17:
00664          /* We have seen "af" at the start of a symbol,
00665             a ' here is a part of that symbol.  */
00666          ch = GET ();
00667          state = 9;
00668          if (ch == '\'')
00669            /* Change to avoid warning about unclosed string.  */
00670            PUT ('`');
00671          else
00672            UNGET (ch);
00673          break;
00674 #endif
00675        }
00676 
00677       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
00678 
00679       /* flushchar: */
00680       ch = GET ();
00681 
00682 #ifdef TC_IA64
00683       if (ch == '(' && (state == 0 || state == 1))
00684        {
00685          state += 14;
00686          PUT (ch);
00687          continue;
00688        }
00689       else if (state == 14 || state == 15)
00690        {
00691          if (ch == ')')
00692            {
00693              state -= 14;
00694              PUT (ch);
00695              ch = GET ();
00696            }
00697          else
00698            {
00699              PUT (ch);
00700              continue;
00701            }
00702        }
00703 #endif
00704 
00705     recycle:
00706 
00707 #if defined TC_ARM && defined OBJ_ELF
00708       /* We need to watch out for .symver directives.  See the comment later
00709         in this function.  */
00710       if (symver_state == NULL)
00711        {
00712          if ((state == 0 || state == 1) && ch == symver_pseudo[0])
00713            symver_state = symver_pseudo + 1;
00714        }
00715       else
00716        {
00717          /* We advance to the next state if we find the right
00718             character.  */
00719          if (ch != '\0' && (*symver_state == ch))
00720            ++symver_state;
00721          else if (*symver_state != '\0')
00722            /* We did not get the expected character, or we didn't
00723               get a valid terminating character after seeing the
00724               entire pseudo-op, so we must go back to the beginning.  */
00725            symver_state = NULL;
00726          else
00727            {
00728              /* We've read the entire pseudo-op.  If this is the end
00729                of the line, go back to the beginning.  */
00730              if (IS_NEWLINE (ch))
00731               symver_state = NULL;
00732            }
00733        }
00734 #endif /* TC_ARM && OBJ_ELF */
00735 
00736 #ifdef TC_M68K
00737       /* We want to have pseudo-ops which control whether we are in
00738         MRI mode or not.  Unfortunately, since m68k MRI mode affects
00739         the scrubber, that means that we need a special purpose
00740         recognizer here.  */
00741       if (mri_state == NULL)
00742        {
00743          if ((state == 0 || state == 1)
00744              && ch == mri_pseudo[0])
00745            mri_state = mri_pseudo + 1;
00746        }
00747       else
00748        {
00749          /* We advance to the next state if we find the right
00750             character, or if we need a space character and we get any
00751             whitespace character, or if we need a '0' and we get a
00752             '1' (this is so that we only need one state to handle
00753             ``.mri 0'' and ``.mri 1'').  */
00754          if (ch != '\0'
00755              && (*mri_state == ch
00756                 || (*mri_state == ' '
00757                     && lex[ch] == LEX_IS_WHITESPACE)
00758                 || (*mri_state == '0'
00759                     && ch == '1')))
00760            {
00761              mri_last_ch = ch;
00762              ++mri_state;
00763            }
00764          else if (*mri_state != '\0'
00765                  || (lex[ch] != LEX_IS_WHITESPACE
00766                      && lex[ch] != LEX_IS_NEWLINE))
00767            {
00768              /* We did not get the expected character, or we didn't
00769                get a valid terminating character after seeing the
00770                entire pseudo-op, so we must go back to the
00771                beginning.  */
00772              mri_state = NULL;
00773            }
00774          else
00775            {
00776              /* We've read the entire pseudo-op.  mri_last_ch is
00777                either '0' or '1' indicating whether to enter or
00778                leave MRI mode.  */
00779              do_scrub_begin (mri_last_ch == '1');
00780              mri_state = NULL;
00781 
00782              /* We continue handling the character as usual.  The
00783                main gas reader must also handle the .mri pseudo-op
00784                to control expression parsing and the like.  */
00785            }
00786        }
00787 #endif
00788 
00789       if (ch == EOF)
00790        {
00791          if (state != 0)
00792            {
00793              as_warn (_("end of file not at end of a line; newline inserted"));
00794              state = 0;
00795              PUT ('\n');
00796            }
00797          goto fromeof;
00798        }
00799 
00800       switch (lex[ch])
00801        {
00802        case LEX_IS_WHITESPACE:
00803          do
00804            {
00805              ch = GET ();
00806            }
00807          while (ch != EOF && IS_WHITESPACE (ch));
00808          if (ch == EOF)
00809            goto fromeof;
00810 
00811          if (state == 0)
00812            {
00813              /* Preserve a single whitespace character at the
00814                beginning of a line.  */
00815              state = 1;
00816              UNGET (ch);
00817              PUT (' ');
00818              break;
00819            }
00820 
00821 #ifdef KEEP_WHITE_AROUND_COLON
00822          if (lex[ch] == LEX_IS_COLON)
00823            {
00824              /* Only keep this white if there's no white *after* the
00825                colon.  */
00826              ch2 = GET ();
00827              UNGET (ch2);
00828              if (!IS_WHITESPACE (ch2))
00829               {
00830                 state = 9;
00831                 UNGET (ch);
00832                 PUT (' ');
00833                 break;
00834               }
00835            }
00836 #endif
00837          if (IS_COMMENT (ch)
00838              || ch == '/'
00839              || IS_LINE_SEPARATOR (ch)
00840              || IS_PARALLEL_SEPARATOR (ch))
00841            {
00842              if (scrub_m68k_mri)
00843               {
00844                 /* In MRI mode, we keep these spaces.  */
00845                 UNGET (ch);
00846                 PUT (' ');
00847                 break;
00848               }
00849              goto recycle;
00850            }
00851 
00852          /* If we're in state 2 or 11, we've seen a non-white
00853             character followed by whitespace.  If the next character
00854             is ':', this is whitespace after a label name which we
00855             normally must ignore.  In MRI mode, though, spaces are
00856             not permitted between the label and the colon.  */
00857          if ((state == 2 || state == 11)
00858              && lex[ch] == LEX_IS_COLON
00859              && ! scrub_m68k_mri)
00860            {
00861              state = 1;
00862              PUT (ch);
00863              break;
00864            }
00865 
00866          switch (state)
00867            {
00868            case 1:
00869              /* We can arrive here if we leave a leading whitespace
00870                character at the beginning of a line.  */
00871              goto recycle;
00872            case 2:
00873              state = 3;
00874              if (to + 1 < toend)
00875               {
00876                 /* Optimize common case by skipping UNGET/GET.  */
00877                 PUT (' ');  /* Sp after opco */
00878                 goto recycle;
00879               }
00880              UNGET (ch);
00881              PUT (' ');
00882              break;
00883            case 3:
00884              if (scrub_m68k_mri)
00885               {
00886                 /* In MRI mode, we keep these spaces.  */
00887                 UNGET (ch);
00888                 PUT (' ');
00889                 break;
00890               }
00891              goto recycle;  /* Sp in operands */
00892            case 9:
00893            case 10:
00894              if (scrub_m68k_mri)
00895               {
00896                 /* In MRI mode, we keep these spaces.  */
00897                 state = 3;
00898                 UNGET (ch);
00899                 PUT (' ');
00900                 break;
00901               }
00902              state = 10;    /* Sp after symbol char */
00903              goto recycle;
00904            case 11:
00905              if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
00906               state = 1;
00907              else
00908               {
00909                 /* We know that ch is not ':', since we tested that
00910                    case above.  Therefore this is not a label, so it
00911                    must be the opcode, and we've just seen the
00912                    whitespace after it.  */
00913                 state = 3;
00914               }
00915              UNGET (ch);
00916              PUT (' ');     /* Sp after label definition.  */
00917              break;
00918            default:
00919              BAD_CASE (state);
00920            }
00921          break;
00922 
00923        case LEX_IS_TWOCHAR_COMMENT_1ST:
00924          ch2 = GET ();
00925          if (ch2 == '*')
00926            {
00927              for (;;)
00928               {
00929                 do
00930                   {
00931                     ch2 = GET ();
00932                     if (ch2 != EOF && IS_NEWLINE (ch2))
00933                      add_newlines++;
00934                   }
00935                 while (ch2 != EOF && ch2 != '*');
00936 
00937                 while (ch2 == '*')
00938                   ch2 = GET ();
00939 
00940                 if (ch2 == EOF || ch2 == '/')
00941                   break;
00942 
00943                 /* This UNGET will ensure that we count newlines
00944                    correctly.  */
00945                 UNGET (ch2);
00946               }
00947 
00948              if (ch2 == EOF)
00949               as_warn (_("end of file in multiline comment"));
00950 
00951              ch = ' ';
00952              goto recycle;
00953            }
00954 #ifdef DOUBLESLASH_LINE_COMMENTS
00955          else if (ch2 == '/')
00956            {
00957              do
00958               {
00959                 ch = GET ();
00960               }
00961              while (ch != EOF && !IS_NEWLINE (ch));
00962              if (ch == EOF)
00963               as_warn ("end of file in comment; newline inserted");
00964              state = 0;
00965              PUT ('\n');
00966              break;
00967            }
00968 #endif
00969          else
00970            {
00971              if (ch2 != EOF)
00972               UNGET (ch2);
00973              if (state == 9 || state == 10)
00974               state = 3;
00975              PUT (ch);
00976            }
00977          break;
00978 
00979        case LEX_IS_STRINGQUOTE:
00980          quotechar = ch;
00981          if (state == 10)
00982            {
00983              /* Preserve the whitespace in foo "bar".  */
00984              UNGET (ch);
00985              state = 3;
00986              PUT (' ');
00987 
00988              /* PUT didn't jump out.  We could just break, but we
00989                know what will happen, so optimize a bit.  */
00990              ch = GET ();
00991              old_state = 3;
00992            }
00993          else if (state == 9)
00994            old_state = 3;
00995          else
00996            old_state = state;
00997          state = 5;
00998          PUT (ch);
00999          break;
01000 
01001 #ifndef IEEE_STYLE
01002        case LEX_IS_ONECHAR_QUOTE:
01003          if (state == 10)
01004            {
01005              /* Preserve the whitespace in foo 'b'.  */
01006              UNGET (ch);
01007              state = 3;
01008              PUT (' ');
01009              break;
01010            }
01011          ch = GET ();
01012          if (ch == EOF)
01013            {
01014              as_warn (_("end of file after a one-character quote; \\0 inserted"));
01015              ch = 0;
01016            }
01017          if (ch == '\\')
01018            {
01019              ch = GET ();
01020              if (ch == EOF)
01021               {
01022                 as_warn (_("end of file in escape character"));
01023                 ch = '\\';
01024               }
01025              else
01026               ch = process_escape (ch);
01027            }
01028          sprintf (out_buf, "%d", (int) (unsigned char) ch);
01029 
01030          /* None of these 'x constants for us.  We want 'x'.  */
01031          if ((ch = GET ()) != '\'')
01032            {
01033 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
01034              as_warn (_("missing close quote; (assumed)"));
01035 #else
01036              if (ch != EOF)
01037               UNGET (ch);
01038 #endif
01039            }
01040          if (strlen (out_buf) == 1)
01041            {
01042              PUT (out_buf[0]);
01043              break;
01044            }
01045          if (state == 9)
01046            old_state = 3;
01047          else
01048            old_state = state;
01049          state = -1;
01050          out_string = out_buf;
01051          PUT (*out_string++);
01052          break;
01053 #endif
01054 
01055        case LEX_IS_COLON:
01056 #ifdef KEEP_WHITE_AROUND_COLON
01057          state = 9;
01058 #else
01059          if (state == 9 || state == 10)
01060            state = 3;
01061          else if (state != 3)
01062            state = 1;
01063 #endif
01064          PUT (ch);
01065          break;
01066 
01067        case LEX_IS_NEWLINE:
01068          /* Roll out a bunch of newlines from inside comments, etc.  */
01069          if (add_newlines)
01070            {
01071              --add_newlines;
01072              UNGET (ch);
01073            }
01074          /* Fall through.  */
01075 
01076        case LEX_IS_LINE_SEPARATOR:
01077          state = 0;
01078          PUT (ch);
01079          break;
01080 
01081        case LEX_IS_PARALLEL_SEPARATOR:
01082          state = 1;
01083          PUT (ch);
01084          break;
01085 
01086 #ifdef TC_V850
01087        case LEX_IS_DOUBLEDASH_1ST:
01088          ch2 = GET ();
01089          if (ch2 != '-')
01090            {
01091              UNGET (ch2);
01092              goto de_fault;
01093            }
01094          /* Read and skip to end of line.  */
01095          do
01096            {
01097              ch = GET ();
01098            }
01099          while (ch != EOF && ch != '\n');
01100 
01101          if (ch == EOF)
01102            as_warn (_("end of file in comment; newline inserted"));
01103 
01104          state = 0;
01105          PUT ('\n');
01106          break;
01107 #endif
01108 #ifdef DOUBLEBAR_PARALLEL
01109        case LEX_IS_DOUBLEBAR_1ST:
01110          ch2 = GET ();
01111          UNGET (ch2);
01112          if (ch2 != '|')
01113            goto de_fault;
01114 
01115          /* Handle '||' in two states as invoking PUT twice might
01116             result in the first one jumping out of this loop.  We'd
01117             then lose track of the state and one '|' char.  */
01118          state = 13;
01119          PUT ('|');
01120          break;
01121 #endif
01122        case LEX_IS_LINE_COMMENT_START:
01123          /* FIXME-someday: The two character comment stuff was badly
01124             thought out.  On i386, we want '/' as line comment start
01125             AND we want C style comments.  hence this hack.  The
01126             whole lexical process should be reworked.  xoxorich.  */
01127          if (ch == '/')
01128            {
01129              ch2 = GET ();
01130              if (ch2 == '*')
01131               {
01132                 old_state = 3;
01133                 state = -2;
01134                 break;
01135               }
01136              else
01137               {
01138                 UNGET (ch2);
01139               }
01140            }
01141 
01142          if (state == 0 || state == 1)    /* Only comment at start of line.  */
01143            {
01144              int startch;
01145 
01146              startch = ch;
01147 
01148              do
01149               {
01150                 ch = GET ();
01151               }
01152              while (ch != EOF && IS_WHITESPACE (ch));
01153 
01154              if (ch == EOF)
01155               {
01156                 as_warn (_("end of file in comment; newline inserted"));
01157                 PUT ('\n');
01158                 break;
01159               }
01160 
01161              if (ch < '0' || ch > '9' || state != 0 || startch != '#')
01162               {
01163                 /* Not a cpp line.  */
01164                 while (ch != EOF && !IS_NEWLINE (ch))
01165                   ch = GET ();
01166                 if (ch == EOF)
01167                   as_warn (_("end of file in comment; newline inserted"));
01168                 state = 0;
01169                 PUT ('\n');
01170                 break;
01171               }
01172              /* Looks like `# 123 "filename"' from cpp.  */
01173              UNGET (ch);
01174              old_state = 4;
01175              state = -1;
01176              if (scrub_m68k_mri)
01177               out_string = "\tlinefile ";
01178              else
01179               out_string = "\t.linefile ";
01180              PUT (*out_string++);
01181              break;
01182            }
01183 
01184 #ifdef TC_D10V
01185          /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
01186             Trap is the only short insn that has a first operand that is
01187             neither register nor label.
01188             We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1.
01189             We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
01190             already LEX_IS_LINE_COMMENT_START.  However, it is the
01191             only character in line_comment_chars for d10v, hence we
01192             can recognize it as such.  */
01193          /* An alternative approach would be to reset the state to 1 when
01194             we see '||', '<-' or '->', but that seems to be overkill.  */
01195          if (state == 10)
01196            PUT (' ');
01197 #endif
01198          /* We have a line comment character which is not at the
01199             start of a line.  If this is also a normal comment
01200             character, fall through.  Otherwise treat it as a default
01201             character.  */
01202          if (strchr (tc_comment_chars, ch) == NULL
01203              && (! scrub_m68k_mri
01204                 || (ch != '!' && ch != '*')))
01205            goto de_fault;
01206          if (scrub_m68k_mri
01207              && (ch == '!' || ch == '*' || ch == '#')
01208              && state != 1
01209              && state != 10)
01210            goto de_fault;
01211          /* Fall through.  */
01212        case LEX_IS_COMMENT_START:
01213 #if defined TC_ARM && defined OBJ_ELF
01214          /* On the ARM, `@' is the comment character.
01215             Unfortunately this is also a special character in ELF .symver
01216             directives (and .type, though we deal with those another way).
01217             So we check if this line is such a directive, and treat
01218             the character as default if so.  This is a hack.  */
01219          if ((symver_state != NULL) && (*symver_state == 0))
01220            goto de_fault;
01221 #endif
01222 #ifdef WARN_COMMENTS
01223          if (!found_comment)
01224            as_where (&found_comment_file, &found_comment);
01225 #endif
01226          do
01227            {
01228              ch = GET ();
01229            }
01230          while (ch != EOF && !IS_NEWLINE (ch));
01231          if (ch == EOF)
01232            as_warn (_("end of file in comment; newline inserted"));
01233          state = 0;
01234          PUT ('\n');
01235          break;
01236 
01237        case LEX_IS_SYMBOL_COMPONENT:
01238          if (state == 10)
01239            {
01240              /* This is a symbol character following another symbol
01241                character, with whitespace in between.  We skipped
01242                the whitespace earlier, so output it now.  */
01243              UNGET (ch);
01244              state = 3;
01245              PUT (' ');
01246              break;
01247            }
01248 
01249 #ifdef TC_Z80
01250          /* "af'" is a symbol containing '\''.  */
01251          if (state == 3 && (ch == 'a' || ch == 'A')) 
01252            {
01253              state = 16;
01254              PUT (ch);
01255              ch = GET ();
01256              if (ch == 'f' || ch == 'F') 
01257               {
01258                 state = 17;
01259                 PUT (ch);
01260                 break;
01261               }
01262              else
01263               {
01264                 state = 9;
01265                 if (!IS_SYMBOL_COMPONENT (ch)) 
01266                   {
01267                     UNGET (ch);
01268                     break;
01269                   }
01270               }
01271            }
01272 #endif
01273          if (state == 3)
01274            state = 9;
01275 
01276          /* This is a common case.  Quickly copy CH and all the
01277             following symbol component or normal characters.  */
01278          if (to + 1 < toend
01279              && mri_state == NULL
01280 #if defined TC_ARM && defined OBJ_ELF
01281              && symver_state == NULL
01282 #endif
01283              )
01284            {
01285              char *s;
01286              int len;
01287 
01288              for (s = from; s < fromend; s++)
01289               {
01290                 int type;
01291 
01292                 ch2 = *(unsigned char *) s;
01293                 type = lex[ch2];
01294                 if (type != 0
01295                     && type != LEX_IS_SYMBOL_COMPONENT)
01296                   break;
01297               }
01298 
01299              if (s > from)
01300               /* Handle the last character normally, for
01301                  simplicity.  */
01302               --s;
01303 
01304              len = s - from;
01305 
01306              if (len > (toend - to) - 1)
01307               len = (toend - to) - 1;
01308 
01309              if (len > 0)
01310               {
01311                 PUT (ch);
01312                 memcpy (to, from, len);
01313                 to += len;
01314                 from += len;
01315                 if (to >= toend)
01316                   goto tofull;
01317                 ch = GET ();
01318               }
01319            }
01320 
01321          /* Fall through.  */
01322        default:
01323        de_fault:
01324          /* Some relatively `normal' character.  */
01325          if (state == 0)
01326            {
01327              state = 11;    /* Now seeing label definition.  */
01328            }
01329          else if (state == 1)
01330            {
01331              state = 2;     /* Ditto.  */
01332            }
01333          else if (state == 9)
01334            {
01335              if (!IS_SYMBOL_COMPONENT (ch))
01336               state = 3;
01337            }
01338          else if (state == 10)
01339            {
01340              if (ch == '\\')
01341               {
01342                 /* Special handling for backslash: a backslash may
01343                    be the beginning of a formal parameter (of a
01344                    macro) following another symbol character, with
01345                    whitespace in between.  If that is the case, we
01346                    output a space before the parameter.  Strictly
01347                    speaking, correct handling depends upon what the
01348                    macro parameter expands into; if the parameter
01349                    expands into something which does not start with
01350                    an operand character, then we don't want to keep
01351                    the space.  We don't have enough information to
01352                    make the right choice, so here we are making the
01353                    choice which is more likely to be correct.  */
01354                 PUT (' ');
01355               }
01356 
01357              state = 3;
01358            }
01359          PUT (ch);
01360          break;
01361        }
01362     }
01363 
01364   /*NOTREACHED*/
01365 
01366  fromeof:
01367   /* We have reached the end of the input.  */
01368   return to - tostart;
01369 
01370  tofull:
01371   /* The output buffer is full.  Save any input we have not yet
01372      processed.  */
01373   if (fromend > from)
01374     {
01375       saved_input = from;
01376       saved_input_len = fromend - from;
01377     }
01378   else
01379     saved_input = NULL;
01380 
01381   return to - tostart;
01382 }
01383