1 /* This is the Assembler Pre-Processor
2 Copyright (C) 1987, 1990, 1991, 1992, 1994 Free Software Foundation, Inc.
4 This file is part of GAS, the GNU Assembler.
6 GAS is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 GAS is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GAS; see the file COPYING. If not, write to
18 the Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90 */
21 /* App, the assembler pre-processor. This pre-processor strips out excess
22 spaces, turns single-quoted characters into a decimal constant, and turns
23 # <number> <filename> <garbage> into a .appline <number>\n.appfile <filename>
24 pair. This needs better error-handling. */
27 #include "as.h" /* For BAD_CASE() only */
31 #define const /* empty */
36 static const char symbol_chars
[] =
37 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
39 #define LEX_IS_SYMBOL_COMPONENT 1 /* class given to every character listed in symbol_chars */
40 #define LEX_IS_WHITESPACE 2 /* class given to space and tab */
41 #define LEX_IS_LINE_SEPARATOR 3 /* class given to semicolon and to each of line_separator_chars */
42 #define LEX_IS_COMMENT_START 4 /* class given to each of comment_chars */
43 #define LEX_IS_LINE_COMMENT_START 5 /* class given to each of line_comment_chars */
44 #define LEX_IS_TWOCHAR_COMMENT_1ST 6 /* class given to slash, the first half of a slash-star comment opener */
45 #define LEX_IS_TWOCHAR_COMMENT_2ND 7 /* class given to star, the second half of a slash-star comment opener */
46 #define LEX_IS_STRINGQUOTE 8 /* class given to the double quote character */
47 #define LEX_IS_COLON 9 /* class given to colon */
48 #define LEX_IS_NEWLINE 10 /* class given to newline */
49 #define LEX_IS_ONECHAR_QUOTE 11 /* class given to the single quote character */
50 #define IS_SYMBOL_COMPONENT(c) (lex[c] == LEX_IS_SYMBOL_COMPONENT) /* each IS_* macro below compares the class stored in lex[c]; c is used directly as the array index */
51 #define IS_WHITESPACE(c) (lex[c] == LEX_IS_WHITESPACE)
52 #define IS_LINE_SEPARATOR(c) (lex[c] == LEX_IS_LINE_SEPARATOR)
53 #define IS_COMMENT(c) (lex[c] == LEX_IS_COMMENT_START)
54 #define IS_LINE_COMMENT(c) (lex[c] == LEX_IS_LINE_COMMENT_START)
55 #define IS_NEWLINE(c) (lex[c] == LEX_IS_NEWLINE)
57 static int process_escape
PARAMS ((int));
59 /* FIXME-soon: The entire lexer/parser thingy should be
60 built statically at compile time rather than dynamically
61 each and every time the assembler is run. xoxorich. */
68 lex
[' '] = LEX_IS_WHITESPACE
;
69 lex
['\t'] = LEX_IS_WHITESPACE
;
70 lex
['\n'] = LEX_IS_NEWLINE
;
71 lex
[';'] = LEX_IS_LINE_SEPARATOR
;
72 lex
[':'] = LEX_IS_COLON
;
76 lex
['"'] = LEX_IS_STRINGQUOTE
;
79 lex
['\''] = LEX_IS_ONECHAR_QUOTE
;
82 #ifdef SINGLE_QUOTE_STRINGS
83 lex
['\''] = LEX_IS_STRINGQUOTE
;
87 /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
88 in state 5 of do_scrub_chars must be changed. */
90 /* Note that these override the previous defaults, e.g. if ';' is a
91 comment char, then it isn't a line separator. */
92 for (p
= symbol_chars
; *p
; ++p
)
94 lex
[(unsigned char) *p
] = LEX_IS_SYMBOL_COMPONENT
;
95 } /* declare symbol characters */
97 for (p
= comment_chars
; *p
; p
++)
99 lex
[(unsigned char) *p
] = LEX_IS_COMMENT_START
;
100 } /* declare comment chars */
102 for (p
= line_comment_chars
; *p
; p
++)
104 lex
[(unsigned char) *p
] = LEX_IS_LINE_COMMENT_START
;
105 } /* declare line comment chars */
107 for (p
= line_separator_chars
; *p
; p
++)
109 lex
[(unsigned char) *p
] = LEX_IS_LINE_SEPARATOR
;
110 } /* declare line separators */
112 /* Only allow slash-star comments if slash is not in use */
115 lex
['/'] = LEX_IS_TWOCHAR_COMMENT_1ST
;
117 /* FIXME-soon. This is a bad hack but otherwise, we can't do
118 c-style comments when '/' is a line comment char. xoxorich. */
121 lex
['*'] = LEX_IS_TWOCHAR_COMMENT_2ND
;
126 lex
['\''] = LEX_IS_STRINGQUOTE
;
127 lex
[';'] = LEX_IS_COMMENT_START
;
128 lex
['*'] = LEX_IS_LINE_COMMENT_START
;
129 /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
130 then it can't be used in an expression. */
131 lex
['!'] = LEX_IS_LINE_COMMENT_START
;
133 } /* do_scrub_begin() */
135 /* Saved state of the scrubber */
137 static int old_state
; /* state to return to once states 5, -1 or -2 finish (see the state table in do_scrub_chars) */
138 static char *out_string
; /* text being emitted while in state -1 */
139 static char out_buf
[20]; /* scratch buffer; receives the sprintf output for the decimal value of a one-character quote, and out_string is then pointed at it */
140 static int add_newlines
; /* NOTE(review): presumably the number of newlines swallowed inside comments that still have to be written out -- confirm against the full source */
141 static char *saved_input
; /* input that was not yet consumed when the output buffer filled up; compared against NULL before use and freed once read back */
142 static int saved_input_len
; /* number of bytes held in saved_input */
144 /* Data structure for saving the state of app across #include's. Note that
145 app is called asynchronously to the parsing of the .include's, so our
146 state at the time .include is interpreted is completely unrelated.
147 That's why we have to save it all. */
154 char out_buf
[sizeof (out_buf
)];
163 register struct app_save
*saved
;
165 saved
= (struct app_save
*) xmalloc (sizeof (*saved
));
166 saved
->state
= state
;
167 saved
->old_state
= old_state
;
168 saved
->out_string
= out_string
;
169 memcpy (saved
->out_buf
, out_buf
, sizeof (out_buf
));
170 saved
->add_newlines
= add_newlines
;
171 saved
->saved_input
= saved_input
;
172 saved
->saved_input_len
= saved_input_len
;
174 /* do_scrub_begin() is not useful, just wastes time. */
179 return (char *) saved
;
186 register struct app_save
*saved
= (struct app_save
*) arg
;
188 /* There is no do_scrub_end (). */
189 state
= saved
->state
;
190 old_state
= saved
->old_state
;
191 out_string
= saved
->out_string
;
192 memcpy (out_buf
, saved
->out_buf
, sizeof (out_buf
));
193 add_newlines
= saved
->add_newlines
;
194 saved_input
= saved
->saved_input
;
195 saved_input_len
= saved
->saved_input_len
;
200 /* @@ This assumes that \n &c are the same on host and target. This is not
227 /* This function is called to process input characters. The GET
228 parameter is used to retrieve more input characters. GET should
229 set its parameter to point to a buffer, and return the length of
230 the buffer; it should return 0 at end of file. The scrubbed output
231 characters are put into the buffer starting at TOSTART; the TOSTART
232 buffer is TOLEN bytes in length. The function returns the number
233 of scrubbed characters put into TOSTART. This will be TOLEN unless
234 end of file was seen. This function is arranged as a state
235 machine, and saves its state so that it may return at any point.
236 This is the way the old code used to work. */
239 do_scrub_chars (get
, tostart
, tolen
)
240 int (*get
) PARAMS ((char **));
245 char *toend
= tostart
+ tolen
;
249 register int ch
, ch2
= 0;
250 int not_cpp_line
= 0;
252 /*State 0: beginning of normal line
253 1: After first whitespace on line (flush more white)
254 2: After first non-white (opcode) on line (keep 1white)
255 3: after second white on line (into operands) (flush white)
256 4: after putting out a .appline, put out digits
257 5: parsing a string, then go to old-state
258 6: putting out \ escape in a quoted string.
259 7: After putting out a .appfile, put out string.
260 8: After putting out a .appfile string, flush until newline.
261 9: After seeing symbol char in state 3 (keep 1white after symchar)
262 10: After seeing whitespace in state 9 (keep white before symchar)
263 11: After seeing a symbol character in state 0 (eg a label definition)
264 -1: output string in out_string and go to the state in old_state
265 -2: flush text until a '*' '/' is seen, then go to state old_state
268 /* I added states 9 and 10 because the MIPS ECOFF assembler uses
269 constructs like ``.loc 1 20''. This was turning into ``.loc
270 120''. States 9 and 10 ensure that a space is never dropped in
271 between characters which could appear in an identifier. Ian
272 Taylor, ian@cygnus.com.
274 I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
275 correctly on the PA (and any other target where colons are optional).
276 Jeff Law, law@cs.utah.edu. */
278 /* This macro gets the next input character. */
283 : ((saved_input != NULL \
284 ? (free (saved_input), \
285 saved_input = NULL, \
288 fromlen = (*get) (&from), \
289 fromend = from + fromlen, \
294 /* This macro pushes a character back on the input stream. */
296 #define UNGET(uch) (*--from = (uch))
298 /* This macro puts a character into the output buffer. If this
299 character fills the output buffer, this macro jumps to the label
300 TOFULL. We use this rather ugly approach because we need to
301 handle two different termination conditions: EOF on the input
302 stream, and a full output buffer. It would be simpler if we
303 always read in the entire input stream before processing it, but
304 I don't want to make such a significant change to the assembler's
316 if (saved_input
!= NULL
)
319 fromend
= from
+ saved_input_len
;
323 fromlen
= (*get
) (&from
);
326 fromend
= from
+ fromlen
;
331 /* The cases in this switch end with continue, in order to
332 branch back to the top of this while loop and generate the
333 next output character in the appropriate state. */
338 if (*out_string
== '\0')
355 as_warn ("end of file in comment");
364 while ((ch
= GET ()) == '*')
369 as_warn ("end of file in comment");
387 else if (ch
>= '0' && ch
<= '9')
391 while (ch
!= EOF
&& IS_WHITESPACE (ch
))
396 out_string
= "\n\t.appfile ";
403 while (ch
!= EOF
&& ch
!= '\n')
412 /* We are going to copy everything up to a quote character,
413 with special handling for a backslash. We try to
414 optimize the copying in the simple case without using the
415 GET and PUT macros. */
420 for (s
= from
; s
< fromend
; s
++)
423 /* This condition must be changed if the type of any
424 other character can be LEX_IS_STRINGQUOTE. */
432 if (len
> toend
- to
)
436 memcpy (to
, from
, len
);
445 as_warn ("end of file in string: inserted '\"'");
450 else if (lex
[ch
] == LEX_IS_STRINGQUOTE
)
455 #ifndef NO_STRING_ESCAPES
462 else if (flag_mri
&& ch
== '\n')
464 /* Just quietly terminate the string. This permits lines like
465 bne label loop if we haven't reach end yet
482 /* Handle strings broken across lines, by turning '\n' into
509 #if defined(IGNORE_NONSTANDARD_ESCAPES) | defined(ONLY_STANDARD_ESCAPES)
511 as_warn ("Unknown escape '\\%c' in string: Ignored", ch
);
513 #else /* ONLY_STANDARD_ESCAPES */
515 /* Accept \x as x for any x */
517 #endif /* ONLY_STANDARD_ESCAPES */
520 as_warn ("End of file in string: '\"' inserted");
539 while (ch
!= '\n' && ch
!= EOF
);
547 /* OK, we are somewhere in states 0 through 4 or 9 through 11 */
556 as_warn ("end of file not at end of a line; newline inserted");
565 case LEX_IS_WHITESPACE
:
568 /* Preserve a single whitespace character at the
569 beginning of a line. */
578 while (ch
!= EOF
&& IS_WHITESPACE (ch
));
583 || (state
== 0 && IS_LINE_COMMENT (ch
))
585 || IS_LINE_SEPARATOR (ch
))
587 /* cpp never outputs a leading space before the #, so
588 try to avoid being confused. */
593 /* If we're in state 2 or 11, we've seen a non-white
594 character followed by whitespace. If the next character
595 is ':', this is whitespace after a label name which we
596 normally must ignore. In MRI mode, though, spaces are
597 not permitted between the label and the colon. */
598 if ((state
== 2 || state
== 11)
599 && lex
[ch
] == LEX_IS_COLON
611 goto recycle
; /* Punted leading sp */
613 /* We can arrive here if we leave a leading whitespace
614 character at the beginning of a line. */
620 /* Optimize common case by skipping UNGET/GET. */
621 PUT (' '); /* Sp after opco */
630 /* In MRI mode, we keep these spaces. */
635 goto recycle
; /* Sp in operands */
640 /* In MRI mode, we keep these spaces. */
646 state
= 10; /* Sp after symbol char */
651 PUT (' '); /* Sp after label definition. */
658 case LEX_IS_TWOCHAR_COMMENT_1ST
:
660 if (ch2
!= EOF
&& lex
[ch2
] == LEX_IS_TWOCHAR_COMMENT_2ND
)
667 if (ch2
!= EOF
&& IS_NEWLINE (ch2
))
671 (lex
[ch2
] != LEX_IS_TWOCHAR_COMMENT_2ND
));
674 (lex
[ch2
] == LEX_IS_TWOCHAR_COMMENT_2ND
))
680 || lex
[ch2
] == LEX_IS_TWOCHAR_COMMENT_1ST
)
685 as_warn ("end of file in multiline comment");
694 if (state
== 9 || state
== 10)
700 case LEX_IS_STRINGQUOTE
:
703 /* Preserve the whitespace in foo "bar" */
708 /* PUT didn't jump out. We could just break, but we
709 know what will happen, so optimize a bit. */
722 case LEX_IS_ONECHAR_QUOTE
:
725 /* Preserve the whitespace in foo 'b' */
734 as_warn ("end of file after a one-character quote; \\0 inserted");
742 as_warn ("end of file in escape character");
746 ch
= process_escape (ch
);
748 sprintf (out_buf
, "%d", (int) (unsigned char) ch
);
750 /* None of these 'x constants for us. We want 'x'. */
751 if ((ch
= GET ()) != '\'')
753 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
754 as_warn ("Missing close quote: (assumed)");
760 if (strlen (out_buf
) == 1)
770 out_string
= out_buf
;
776 if (state
== 9 || state
== 10)
784 /* Roll out a bunch of newlines from inside comments, etc. */
790 /* fall thru into... */
792 case LEX_IS_LINE_SEPARATOR
:
797 case LEX_IS_LINE_COMMENT_START
:
798 if (state
== 0) /* Only comment at start of line. */
800 /* FIXME-someday: The two character comment stuff was
801 badly thought out. On i386, we want '/' as line
802 comment start AND we want C style comments. Hence
803 this hack. The whole lexical process should be
804 reworked. xoxorich. */
826 while (ch
!= EOF
&& IS_WHITESPACE (ch
));
829 as_warn ("end of file in comment; newline inserted");
833 if (ch
< '0' || ch
> '9' || not_cpp_line
)
835 /* Non-numerics: Eat whole comment line */
836 while (ch
!= EOF
&& !IS_NEWLINE (ch
))
839 as_warn ("EOF in Comment: Newline inserted");
844 /* Numerics begin comment. Perhaps CPP `# 123 "filename"' */
848 out_string
= "\t.appline ";
853 /* We have a line comment character which is not at the
854 start of a line. If this is also a normal comment
855 character, fall through. Otherwise treat it as a default
857 if (strchr (comment_chars
, ch
) == NULL
859 || (ch
!= '!' && ch
!= '*')))
862 && (ch
== '!' || ch
== '*')
867 case LEX_IS_COMMENT_START
:
872 while (ch
!= EOF
&& !IS_NEWLINE (ch
));
874 as_warn ("end of file in comment; newline inserted");
879 case LEX_IS_SYMBOL_COMPONENT
:
882 /* This is a symbol character following another symbol
883 character, with whitespace in between. We skipped
884 the whitespace earlier, so output it now. */
894 /* This is a common case. Quickly copy CH and all the
895 following symbol component or normal characters. */
901 for (s
= from
; s
< fromend
; s
++)
908 && type
!= LEX_IS_SYMBOL_COMPONENT
)
913 /* Handle the last character normally, for
918 if (len
> (toend
- to
) - 1)
919 len
= (toend
- to
) - 1;
925 memcpy (to
, from
, len
);
933 case 8: *to
++ = *from
++;
934 case 7: *to
++ = *from
++;
935 case 6: *to
++ = *from
++;
936 case 5: *to
++ = *from
++;
937 case 4: *to
++ = *from
++;
938 case 3: *to
++ = *from
++;
939 case 2: *to
++ = *from
++;
940 case 1: *to
++ = *from
++;
950 /* Some relatively `normal' character. */
953 state
= 11; /* Now seeing label definition */
957 state
= 2; /* Ditto */
961 if (lex
[ch
] != LEX_IS_SYMBOL_COMPONENT
)
964 else if (state
== 10)
976 /* We have reached the end of the input. */
980 /* The output buffer is full. Save any input we have not yet
986 save
= (char *) xmalloc (fromend
- from
);
987 memcpy (save
, from
, fromend
- from
);
988 if (saved_input
!= NULL
)
991 saved_input_len
= fromend
- from
;
995 if (saved_input
!= NULL
)
1001 return to
- tostart
;