cppfiles.c (stack_include_file): Don't optimize zero-length files.
[gcc.git] / gcc / cpplex.c
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
8
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
22
23 #include "config.h"
24 #include "system.h"
25 #include "cpplib.h"
26 #include "cpphash.h"
27
28 /* MULTIBYTE_CHARS support only works for native compilers.
29 ??? Ideally what we want is to model widechar support after
30 the current floating point support. */
31 #ifdef CROSS_COMPILE
32 #undef MULTIBYTE_CHARS
33 #endif
34
35 #ifdef MULTIBYTE_CHARS
36 #include "mbchar.h"
37 #include <locale.h>
38 #endif
39
/* Spelling categories used by the token_spellings table below.

   Tokens with SPELL_STRING store their spelling in the token list,
   and its length in the token->val.name.len.
   NOTE(review): the lexer routines in this file record string
   lengths in val.str.len; confirm which field name is current.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_NUMBER,
  SPELL_STRING,
  SPELL_NONE
};
51
52 struct token_spelling
53 {
54 enum spell_type category;
55 const unsigned char *name;
56 };
57
58 static const unsigned char *const digraph_spellings[] =
59 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
60
61 #define OP(e, s) { SPELL_OPERATOR, U s },
62 #define TK(e, s) { s, U STRINGX (e) },
63 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
64 #undef OP
65 #undef TK
66
/* Spelling category and name of TOKEN, looked up in token_spellings
   by the token's type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)

/* Rewind the read pointer to the most recently recorded backup
   position.  Expects a local variable named `buffer' to be in
   scope at the point of use.  */
#define BACKUP() \
  do { buffer->cur = buffer->backup_to; } while (0)
70
71 static void handle_newline PARAMS ((cpp_reader *));
72 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *));
73 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
74
75 static int skip_block_comment PARAMS ((cpp_reader *));
76 static int skip_line_comment PARAMS ((cpp_reader *));
77 static void adjust_column PARAMS ((cpp_reader *));
78 static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
79 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
80 static cpp_hashnode *parse_identifier_slow PARAMS ((cpp_reader *,
81 const U_CHAR *));
82 static void parse_number PARAMS ((cpp_reader *, cpp_string *, cppchar_t, int));
83 static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
84 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
85 static void unterminated PARAMS ((cpp_reader *, int));
86 static bool trigraph_p PARAMS ((cpp_reader *));
87 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
88 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
89 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
90 const unsigned char *, unsigned int *));
91 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
92
93 static unsigned int hex_digit_value PARAMS ((unsigned int));
94 static _cpp_buff *new_buff PARAMS ((size_t));
95
96 /* Utility routine:
97
98 Compares, the token TOKEN to the NUL-terminated string STRING.
99 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
100
101 int
102 cpp_ideq (token, string)
103 const cpp_token *token;
104 const char *string;
105 {
106 if (token->type != CPP_NAME)
107 return 0;
108
109 return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
110 }
111
112 /* Call when meeting a newline, assumed to be in buffer->cur[-1].
113 Returns with buffer->cur pointing to the character immediately
114 following the newline (combination). */
115 static void
116 handle_newline (pfile)
117 cpp_reader *pfile;
118 {
119 cpp_buffer *buffer = pfile->buffer;
120
121 /* Handle CR-LF and LF-CR. Most other implementations (e.g. java)
122 only accept CR-LF; maybe we should fall back to that behaviour? */
123 if (buffer->cur[-1] + buffer->cur[0] == '\r' + '\n')
124 buffer->cur++;
125
126 buffer->line_base = buffer->cur;
127 buffer->col_adjust = 0;
128 pfile->line++;
129 }
130
131 /* Subroutine of skip_escaped_newlines; called when a 3-character
132 sequence beginning with "??" is encountered. buffer->cur points to
133 the second '?'.
134
135 Warn if necessary, and returns true if the sequence forms a
136 trigraph and the trigraph should be honoured. */
137 static bool
138 trigraph_p (pfile)
139 cpp_reader *pfile;
140 {
141 cpp_buffer *buffer = pfile->buffer;
142 cppchar_t from_char = buffer->cur[1];
143 bool accept;
144
145 if (!_cpp_trigraph_map[from_char])
146 return false;
147
148 accept = CPP_OPTION (pfile, trigraphs);
149
150 /* Don't warn about trigraphs in comments. */
151 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
152 {
153 if (accept)
154 cpp_warning_with_line (pfile, pfile->line, CPP_BUF_COL (buffer) - 1,
155 "trigraph ??%c converted to %c",
156 (int) from_char,
157 (int) _cpp_trigraph_map[from_char]);
158 else if (buffer->cur != buffer->last_Wtrigraphs)
159 {
160 buffer->last_Wtrigraphs = buffer->cur;
161 cpp_warning_with_line (pfile, pfile->line,
162 CPP_BUF_COL (buffer) - 1,
163 "trigraph ??%c ignored", (int) from_char);
164 }
165 }
166
167 return accept;
168 }
169
170 /* Skips any escaped newlines introduced by '?' or a '\\', assumed to
171 lie in buffer->cur[-1]. Returns the next byte, which will be in
172 buffer->cur[-1]. This routine performs preprocessing stages 1 and
173 2 of the ISO C standard. */
174 static cppchar_t
175 skip_escaped_newlines (pfile)
176 cpp_reader *pfile;
177 {
178 cpp_buffer *buffer = pfile->buffer;
179 cppchar_t next = buffer->cur[-1];
180
181 /* Only do this if we apply stages 1 and 2. */
182 if (!buffer->from_stage3)
183 {
184 const unsigned char *saved_cur;
185 cppchar_t next1;
186
187 do
188 {
189 if (next == '?')
190 {
191 if (buffer->cur[0] != '?' || !trigraph_p (pfile))
192 break;
193
194 /* Translate the trigraph. */
195 next = _cpp_trigraph_map[buffer->cur[1]];
196 buffer->cur += 2;
197 if (next != '\\')
198 break;
199 }
200
201 if (buffer->cur == buffer->rlimit)
202 break;
203
204 /* We have a backslash, and room for at least one more
205 character. Skip horizontal whitespace. */
206 saved_cur = buffer->cur;
207 do
208 next1 = *buffer->cur++;
209 while (is_nvspace (next1) && buffer->cur < buffer->rlimit);
210
211 if (!is_vspace (next1))
212 {
213 buffer->cur = saved_cur;
214 break;
215 }
216
217 if (saved_cur != buffer->cur - 1
218 && !pfile->state.lexing_comment)
219 cpp_warning (pfile, "backslash and newline separated by space");
220
221 handle_newline (pfile);
222 buffer->backup_to = buffer->cur;
223 if (buffer->cur == buffer->rlimit)
224 {
225 cpp_pedwarn (pfile, "backslash-newline at end of file");
226 next = EOF;
227 }
228 else
229 next = *buffer->cur++;
230 }
231 while (next == '\\' || next == '?');
232 }
233
234 return next;
235 }
236
237 /* Obtain the next character, after trigraph conversion and skipping
238 an arbitrarily long string of escaped newlines. The common case of
239 no trigraphs or escaped newlines falls through quickly. On return,
240 buffer->backup_to points to where to return to if the character is
241 not to be processed. */
242 static cppchar_t
243 get_effective_char (pfile)
244 cpp_reader *pfile;
245 {
246 cppchar_t next;
247 cpp_buffer *buffer = pfile->buffer;
248
249 buffer->backup_to = buffer->cur;
250 next = *buffer->cur++;
251 if (__builtin_expect (next == '?' || next == '\\', 0))
252 next = skip_escaped_newlines (pfile);
253
254 return next;
255 }
256
257 /* Skip a C-style block comment. We find the end of the comment by
258 seeing if an asterisk is before every '/' we encounter. Returns
259 non-zero if comment terminated by EOF, zero otherwise. */
260 static int
261 skip_block_comment (pfile)
262 cpp_reader *pfile;
263 {
264 cpp_buffer *buffer = pfile->buffer;
265 cppchar_t c = EOF, prevc = EOF;
266
267 pfile->state.lexing_comment = 1;
268 while (buffer->cur != buffer->rlimit)
269 {
270 prevc = c, c = *buffer->cur++;
271
272 /* FIXME: For speed, create a new character class of characters
273 of interest inside block comments. */
274 if (c == '?' || c == '\\')
275 c = skip_escaped_newlines (pfile);
276
277 /* People like decorating comments with '*', so check for '/'
278 instead for efficiency. */
279 if (c == '/')
280 {
281 if (prevc == '*')
282 break;
283
284 /* Warn about potential nested comments, but not if the '/'
285 comes immediately before the true comment delimiter.
286 Don't bother to get it right across escaped newlines. */
287 if (CPP_OPTION (pfile, warn_comments)
288 && buffer->cur[0] == '*' && buffer->cur[1] != '/')
289 cpp_warning_with_line (pfile,
290 pfile->line, CPP_BUF_COL (buffer),
291 "\"/*\" within comment");
292 }
293 else if (is_vspace (c))
294 handle_newline (pfile);
295 else if (c == '\t')
296 adjust_column (pfile);
297 }
298
299 pfile->state.lexing_comment = 0;
300 return c != '/' || prevc != '*';
301 }
302
303 /* Skip a C++ line comment, leaving buffer->cur pointing to the
304 terminating newline. Handles escaped newlines. Returns non-zero
305 if a multiline comment. */
306 static int
307 skip_line_comment (pfile)
308 cpp_reader *pfile;
309 {
310 cpp_buffer *buffer = pfile->buffer;
311 unsigned int orig_line = pfile->line;
312 cppchar_t c;
313
314 pfile->state.lexing_comment = 1;
315 do
316 {
317 if (buffer->cur == buffer->rlimit)
318 goto at_eof;
319
320 c = *buffer->cur++;
321 if (c == '?' || c == '\\')
322 c = skip_escaped_newlines (pfile);
323 }
324 while (!is_vspace (c));
325
326 /* Step back over the newline, except at EOF. */
327 buffer->cur--;
328 at_eof:
329
330 pfile->state.lexing_comment = 0;
331 return orig_line != pfile->line;
332 }
333
334 /* pfile->buffer->cur is one beyond the \t character. Update
335 col_adjust so we track the column correctly. */
336 static void
337 adjust_column (pfile)
338 cpp_reader *pfile;
339 {
340 cpp_buffer *buffer = pfile->buffer;
341 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
342
343 /* Round it up to multiple of the tabstop, but subtract 1 since the
344 tab itself occupies a character position. */
345 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
346 - col % CPP_OPTION (pfile, tabstop)) - 1;
347 }
348
349 /* Skips whitespace, saving the next non-whitespace character.
350 Adjusts pfile->col_adjust to account for tabs. Without this,
351 tokens might be assigned an incorrect column. */
352 static int
353 skip_whitespace (pfile, c)
354 cpp_reader *pfile;
355 cppchar_t c;
356 {
357 cpp_buffer *buffer = pfile->buffer;
358 unsigned int warned = 0;
359
360 do
361 {
362 /* Horizontal space always OK. */
363 if (c == ' ')
364 ;
365 else if (c == '\t')
366 adjust_column (pfile);
367 /* Just \f \v or \0 left. */
368 else if (c == '\0')
369 {
370 if (buffer->cur - 1 == buffer->rlimit)
371 return 0;
372 if (!warned)
373 {
374 cpp_warning (pfile, "null character(s) ignored");
375 warned = 1;
376 }
377 }
378 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
379 cpp_pedwarn_with_line (pfile, pfile->line,
380 CPP_BUF_COL (buffer),
381 "%s in preprocessing directive",
382 c == '\f' ? "form feed" : "vertical tab");
383
384 c = *buffer->cur++;
385 }
386 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
387 while (is_nvspace (c));
388
389 buffer->cur--;
390 return 1;
391 }
392
393 /* See if the characters of a number token are valid in a name (no
394 '.', '+' or '-'). */
395 static int
396 name_p (pfile, string)
397 cpp_reader *pfile;
398 const cpp_string *string;
399 {
400 unsigned int i;
401
402 for (i = 0; i < string->len; i++)
403 if (!is_idchar (string->text[i]))
404 return 0;
405
406 return 1;
407 }
408
409 /* Parse an identifier, skipping embedded backslash-newlines. This is
410 a critical inner loop. The common case is an identifier which has
411 not been split by backslash-newline, does not contain a dollar
412 sign, and has already been scanned (roughly 10:1 ratio of
413 seen:unseen identifiers in normal code; the distribution is
414 Poisson-like). Second most common case is a new identifier, not
415 split and no dollar sign. The other possibilities are rare and
416 have been relegated to parse_identifier_slow. */
417
418 static cpp_hashnode *
419 parse_identifier (pfile)
420 cpp_reader *pfile;
421 {
422 cpp_hashnode *result;
423 const U_CHAR *cur;
424
425 /* Fast-path loop. Skim over a normal identifier.
426 N.B. ISIDNUM does not include $. */
427 cur = pfile->buffer->cur;
428 while (ISIDNUM (*cur))
429 cur++;
430
431 /* Check for slow-path cases. */
432 if (*cur == '?' || *cur == '\\' || *cur == '$')
433 result = parse_identifier_slow (pfile, cur);
434 else
435 {
436 const U_CHAR *base = pfile->buffer->cur - 1;
437 result = (cpp_hashnode *)
438 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
439 pfile->buffer->cur = cur;
440 }
441
442 /* Rarely, identifiers require diagnostics when lexed.
443 XXX Has to be forced out of the fast path. */
444 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
445 && !pfile->state.skipping, 0))
446 {
447 /* It is allowed to poison the same identifier twice. */
448 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
449 cpp_error (pfile, "attempt to use poisoned \"%s\"",
450 NODE_NAME (result));
451
452 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
453 replacement list of a variadic macro. */
454 if (result == pfile->spec_nodes.n__VA_ARGS__
455 && !pfile->state.va_args_ok)
456 cpp_pedwarn (pfile,
457 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
458 }
459
460 return result;
461 }
462
463 /* Slow path. This handles identifiers which have been split, and
464 identifiers which contain dollar signs. The part of the identifier
465 from PFILE->buffer->cur-1 to CUR has already been scanned. */
466 static cpp_hashnode *
467 parse_identifier_slow (pfile, cur)
468 cpp_reader *pfile;
469 const U_CHAR *cur;
470 {
471 cpp_buffer *buffer = pfile->buffer;
472 const U_CHAR *base = buffer->cur - 1;
473 struct obstack *stack = &pfile->hash_table->stack;
474 unsigned int c, saw_dollar = 0, len;
475
476 /* Copy the part of the token which is known to be okay. */
477 obstack_grow (stack, base, cur - base);
478
479 /* Now process the part which isn't. We are looking at one of
480 '$', '\\', or '?' on entry to this loop. */
481 c = *cur++;
482 buffer->cur = cur;
483 do
484 {
485 while (is_idchar (c))
486 {
487 obstack_1grow (stack, c);
488
489 if (c == '$')
490 saw_dollar++;
491
492 c = *buffer->cur++;
493 }
494
495 /* Potential escaped newline? */
496 buffer->backup_to = buffer->cur - 1;
497 if (c != '?' && c != '\\')
498 break;
499 c = skip_escaped_newlines (pfile);
500 }
501 while (is_idchar (c));
502
503 /* Step back over the unwanted char. */
504 BACKUP ();
505
506 /* $ is not an identifier character in the standard, but is commonly
507 accepted as an extension. Don't warn about it in skipped
508 conditional blocks. */
509 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
510 cpp_pedwarn (pfile, "'$' character(s) in identifier");
511
512 /* Identifiers are null-terminated. */
513 len = obstack_object_size (stack);
514 obstack_1grow (stack, '\0');
515
516 return (cpp_hashnode *)
517 ht_lookup (pfile->hash_table, obstack_finish (stack), len, HT_ALLOCED);
518 }
519
520 /* Parse a number, skipping embedded backslash-newlines. */
521 static void
522 parse_number (pfile, number, c, leading_period)
523 cpp_reader *pfile;
524 cpp_string *number;
525 cppchar_t c;
526 int leading_period;
527 {
528 cpp_buffer *buffer = pfile->buffer;
529 unsigned char *dest, *limit;
530
531 dest = BUFF_FRONT (pfile->u_buff);
532 limit = BUFF_LIMIT (pfile->u_buff);
533
534 /* Place a leading period. */
535 if (leading_period)
536 {
537 if (dest == limit)
538 {
539 _cpp_extend_buff (pfile, &pfile->u_buff, 1);
540 dest = BUFF_FRONT (pfile->u_buff);
541 limit = BUFF_LIMIT (pfile->u_buff);
542 }
543 *dest++ = '.';
544 }
545
546 do
547 {
548 do
549 {
550 /* Need room for terminating null. */
551 if ((size_t) (limit - dest) < 2)
552 {
553 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
554 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
555 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
556 limit = BUFF_LIMIT (pfile->u_buff);
557 }
558 *dest++ = c;
559
560 c = *buffer->cur++;
561 }
562 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
563
564 /* Potential escaped newline? */
565 buffer->backup_to = buffer->cur - 1;
566 if (c != '?' && c != '\\')
567 break;
568 c = skip_escaped_newlines (pfile);
569 }
570 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
571
572 /* Step back over the unwanted char. */
573 BACKUP ();
574
575 /* Null-terminate the number. */
576 *dest = '\0';
577
578 number->text = BUFF_FRONT (pfile->u_buff);
579 number->len = dest - number->text;
580 BUFF_FRONT (pfile->u_buff) = dest + 1;
581 }
582
583 /* Subroutine of parse_string. Emits error for unterminated strings. */
584 static void
585 unterminated (pfile, term)
586 cpp_reader *pfile;
587 int term;
588 {
589 cpp_error (pfile, "missing terminating %c character", term);
590
591 if (term == '\"' && pfile->mls_line && pfile->mls_line != pfile->line)
592 {
593 cpp_error_with_line (pfile, pfile->mls_line, pfile->mls_col,
594 "possible start of unterminated string literal");
595 pfile->mls_line = 0;
596 }
597 }
598
599 /* Subroutine of parse_string. */
600 static int
601 unescaped_terminator_p (pfile, dest)
602 cpp_reader *pfile;
603 const unsigned char *dest;
604 {
605 const unsigned char *start, *temp;
606
607 /* In #include-style directives, terminators are not escapeable. */
608 if (pfile->state.angled_headers)
609 return 1;
610
611 start = BUFF_FRONT (pfile->u_buff);
612
613 /* An odd number of consecutive backslashes represents an escaped
614 terminator. */
615 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
616 ;
617
618 return ((dest - temp) & 1) == 0;
619 }
620
621 /* Parses a string, character constant, or angle-bracketed header file
622 name. Handles embedded trigraphs and escaped newlines. The stored
623 string is guaranteed NUL-terminated, but it is not guaranteed that
624 this is the first NUL since embedded NULs are preserved.
625 Multi-line strings are allowed, but they are deprecated.
626
627 When this function returns, buffer->cur points to the next
628 character to be processed. */
629 static void
630 parse_string (pfile, token, terminator)
631 cpp_reader *pfile;
632 cpp_token *token;
633 cppchar_t terminator;
634 {
635 cpp_buffer *buffer = pfile->buffer;
636 unsigned char *dest, *limit;
637 cppchar_t c;
638 bool warned_nulls = false, warned_multi = false;
639
640 dest = BUFF_FRONT (pfile->u_buff);
641 limit = BUFF_LIMIT (pfile->u_buff);
642
643 for (;;)
644 {
645 /* We need room for another char, possibly the terminating NUL. */
646 if ((size_t) (limit - dest) < 1)
647 {
648 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
649 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
650 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
651 limit = BUFF_LIMIT (pfile->u_buff);
652 }
653
654 /* Handle trigraphs, escaped newlines etc. */
655 c = *buffer->cur++;
656 if (c == '?' || c == '\\')
657 c = skip_escaped_newlines (pfile);
658
659 if (c == terminator)
660 {
661 if (unescaped_terminator_p (pfile, dest))
662 break;
663 }
664 else if (is_vspace (c))
665 {
666 /* In assembly language, silently terminate string and
667 character literals at end of line. This is a kludge
668 around not knowing where comments are. */
669 if (CPP_OPTION (pfile, lang) == CLK_ASM && terminator != '>')
670 {
671 buffer->cur--;
672 break;
673 }
674
675 /* Character constants and header names may not extend over
676 multiple lines. In Standard C, neither may strings.
677 Unfortunately, we accept multiline strings as an
678 extension, except in #include family directives. */
679 if (terminator != '"' || pfile->state.angled_headers)
680 {
681 unterminated (pfile, terminator);
682 buffer->cur--;
683 break;
684 }
685
686 if (!warned_multi)
687 {
688 warned_multi = true;
689 cpp_pedwarn (pfile, "multi-line string literals are deprecated");
690 }
691
692 if (pfile->mls_line == 0)
693 {
694 pfile->mls_line = token->line;
695 pfile->mls_col = token->col;
696 }
697
698 handle_newline (pfile);
699 c = '\n';
700 }
701 else if (c == '\0')
702 {
703 if (buffer->cur - 1 == buffer->rlimit)
704 {
705 unterminated (pfile, terminator);
706 buffer->cur--;
707 break;
708 }
709 if (!warned_nulls)
710 {
711 warned_nulls = true;
712 cpp_warning (pfile, "null character(s) preserved in literal");
713 }
714 }
715
716 *dest++ = c;
717 }
718
719 *dest = '\0';
720
721 token->val.str.text = BUFF_FRONT (pfile->u_buff);
722 token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
723 BUFF_FRONT (pfile->u_buff) = dest + 1;
724 }
725
726 /* The stored comment includes the comment start and any terminator. */
727 static void
728 save_comment (pfile, token, from)
729 cpp_reader *pfile;
730 cpp_token *token;
731 const unsigned char *from;
732 {
733 unsigned char *buffer;
734 unsigned int len;
735
736 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
737
738 /* C++ comments probably (not definitely) have moved past a new
739 line, which we don't want to save in the comment. */
740 if (is_vspace (pfile->buffer->cur[-1]))
741 len--;
742 buffer = _cpp_unaligned_alloc (pfile, len);
743
744 token->type = CPP_COMMENT;
745 token->val.str.len = len;
746 token->val.str.text = buffer;
747
748 buffer[0] = '/';
749 memcpy (buffer + 1, from, len - 1);
750 }
751
752 /* Allocate COUNT tokens for RUN. */
753 void
754 _cpp_init_tokenrun (run, count)
755 tokenrun *run;
756 unsigned int count;
757 {
758 run->base = xnewvec (cpp_token, count);
759 run->limit = run->base + count;
760 run->next = NULL;
761 }
762
763 /* Returns the next tokenrun, or creates one if there is none. */
764 static tokenrun *
765 next_tokenrun (run)
766 tokenrun *run;
767 {
768 if (run->next == NULL)
769 {
770 run->next = xnew (tokenrun);
771 run->next->prev = run;
772 _cpp_init_tokenrun (run->next, 250);
773 }
774
775 return run->next;
776 }
777
778 /* Allocate a single token that is invalidated at the same time as the
779 rest of the tokens on the line. Has its line and col set to the
780 same as the last lexed token, so that diagnostics appear in the
781 right place. */
782 cpp_token *
783 _cpp_temp_token (pfile)
784 cpp_reader *pfile;
785 {
786 cpp_token *old, *result;
787
788 old = pfile->cur_token - 1;
789 if (pfile->cur_token == pfile->cur_run->limit)
790 {
791 pfile->cur_run = next_tokenrun (pfile->cur_run);
792 pfile->cur_token = pfile->cur_run->base;
793 }
794
795 result = pfile->cur_token++;
796 result->line = old->line;
797 result->col = old->col;
798 return result;
799 }
800
801 /* Lex a token into RESULT (external interface). Takes care of issues
802 like directive handling, token lookahead, multiple include
803 optimization and skipping. */
804 const cpp_token *
805 _cpp_lex_token (pfile)
806 cpp_reader *pfile;
807 {
808 cpp_token *result;
809
810 for (;;)
811 {
812 if (pfile->cur_token == pfile->cur_run->limit)
813 {
814 pfile->cur_run = next_tokenrun (pfile->cur_run);
815 pfile->cur_token = pfile->cur_run->base;
816 }
817
818 if (pfile->lookaheads)
819 {
820 pfile->lookaheads--;
821 result = pfile->cur_token++;
822 }
823 else
824 result = _cpp_lex_direct (pfile);
825
826 if (result->flags & BOL)
827 {
828 /* Is this a directive. If _cpp_handle_directive returns
829 false, it is an assembler #. */
830 if (result->type == CPP_HASH
831 && !pfile->state.parsing_args
832 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
833 continue;
834 if (pfile->cb.line_change && !pfile->state.skipping)
835 (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
836 }
837
838 /* We don't skip tokens in directives. */
839 if (pfile->state.in_directive)
840 break;
841
842 /* Outside a directive, invalidate controlling macros. At file
843 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
844 get here and MI optimisation works. */
845 pfile->mi_valid = false;
846
847 if (!pfile->state.skipping || result->type == CPP_EOF)
848 break;
849 }
850
851 return result;
852 }
853
/* Read the next effective character.  If it equals CHAR, set the
   type of the token being lexed to THEN_TYPE; otherwise un-read the
   character with BACKUP and set the type to ELSE_TYPE.  Expects
   `pfile', `result' and (for BACKUP) `buffer' to be in scope at the
   point of use.  Each argument is parenthesised so that an
   expression containing a low-precedence operator is still treated
   as a single operand.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do {                                                  \
    if (get_effective_char (pfile) == (CHAR))           \
      result->type = (THEN_TYPE);                       \
    else                                                \
      {                                                 \
        BACKUP ();                                      \
        result->type = (ELSE_TYPE);                     \
      }                                                 \
  } while (0)
864
865 /* Lex a token into pfile->cur_token, which is also incremented, to
866 get diagnostics pointing to the correct location.
867
868 Does not handle issues such as token lookahead, multiple-include
869 optimisation, directives, skipping etc. This function is only
870 suitable for use by _cpp_lex_token, and in special cases like
871 lex_expansion_token which doesn't care for any of these issues.
872
873 When meeting a newline, returns CPP_EOF if parsing a directive,
874 otherwise returns to the start of the token buffer if permissible.
875 Returns the location of the lexed token. */
876 cpp_token *
877 _cpp_lex_direct (pfile)
878 cpp_reader *pfile;
879 {
880 cppchar_t c;
881 cpp_buffer *buffer;
882 const unsigned char *comment_start;
883 cpp_token *result = pfile->cur_token++;
884
885 fresh_line:
886 buffer = pfile->buffer;
887 result->flags = buffer->saved_flags;
888 buffer->saved_flags = 0;
889 update_tokens_line:
890 result->line = pfile->line;
891
892 skipped_white:
893 c = *buffer->cur++;
894 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
895
896 trigraph:
897 switch (c)
898 {
899 case ' ': case '\t': case '\f': case '\v': case '\0':
900 result->flags |= PREV_WHITE;
901 if (skip_whitespace (pfile, c))
902 goto skipped_white;
903
904 /* EOF. */
905 buffer->cur--;
906 buffer->saved_flags = BOL;
907 if (!pfile->state.parsing_args && !pfile->state.in_directive)
908 {
909 if (buffer->cur != buffer->line_base)
910 {
911 /* Non-empty files should end in a newline. Don't warn
912 for command line and _Pragma buffers. */
913 if (!buffer->from_stage3)
914 cpp_pedwarn (pfile, "no newline at end of file");
915 handle_newline (pfile);
916 }
917
918 /* Don't pop the last buffer. */
919 if (buffer->prev)
920 {
921 unsigned char stop = buffer->return_at_eof;
922
923 _cpp_pop_buffer (pfile);
924 if (!stop)
925 goto fresh_line;
926 }
927 }
928 result->type = CPP_EOF;
929 break;
930
931 case '\n': case '\r':
932 handle_newline (pfile);
933 buffer->saved_flags = BOL;
934 if (! pfile->state.in_directive)
935 {
936 if (pfile->state.parsing_args == 2)
937 buffer->saved_flags |= PREV_WHITE;
938 if (!pfile->keep_tokens)
939 {
940 pfile->cur_run = &pfile->base_run;
941 result = pfile->base_run.base;
942 pfile->cur_token = result + 1;
943 }
944 goto fresh_line;
945 }
946 result->type = CPP_EOF;
947 break;
948
949 case '?':
950 case '\\':
951 /* These could start an escaped newline, or '?' a trigraph. Let
952 skip_escaped_newlines do all the work. */
953 {
954 unsigned int line = pfile->line;
955
956 c = skip_escaped_newlines (pfile);
957 if (line != pfile->line)
958 {
959 buffer->cur--;
960 /* We had at least one escaped newline of some sort.
961 Update the token's line and column. */
962 goto update_tokens_line;
963 }
964 }
965
966 /* We are either the original '?' or '\\', or a trigraph. */
967 if (c == '?')
968 result->type = CPP_QUERY;
969 else if (c == '\\')
970 goto random_char;
971 else
972 goto trigraph;
973 break;
974
975 case '0': case '1': case '2': case '3': case '4':
976 case '5': case '6': case '7': case '8': case '9':
977 result->type = CPP_NUMBER;
978 parse_number (pfile, &result->val.str, c, 0);
979 break;
980
981 case '$':
982 if (!CPP_OPTION (pfile, dollars_in_ident))
983 goto random_char;
984 /* Fall through... */
985
986 case '_':
987 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
988 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
989 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
990 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
991 case 'y': case 'z':
992 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
993 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
994 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
995 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
996 case 'Y': case 'Z':
997 result->type = CPP_NAME;
998 result->val.node = parse_identifier (pfile);
999
1000 /* 'L' may introduce wide characters or strings. */
1001 if (result->val.node == pfile->spec_nodes.n_L)
1002 {
1003 c = *buffer->cur;
1004 if (c == '\'' || c == '"')
1005 {
1006 buffer->cur++;
1007 result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
1008 parse_string (pfile, result, c);
1009 }
1010 }
1011 /* Convert named operators to their proper types. */
1012 else if (result->val.node->flags & NODE_OPERATOR)
1013 {
1014 result->flags |= NAMED_OP;
1015 result->type = result->val.node->value.operator;
1016 }
1017 break;
1018
1019 case '\'':
1020 case '"':
1021 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1022 parse_string (pfile, result, c);
1023 break;
1024
1025 case '/':
1026 /* A potential block or line comment. */
1027 comment_start = buffer->cur;
1028 c = get_effective_char (pfile);
1029
1030 if (c == '*')
1031 {
1032 if (skip_block_comment (pfile))
1033 cpp_error (pfile, "unterminated comment");
1034 }
1035 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1036 || CPP_IN_SYSTEM_HEADER (pfile)))
1037 {
1038 /* Warn about comments only if pedantically GNUC89, and not
1039 in system headers. */
1040 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1041 && ! buffer->warned_cplusplus_comments)
1042 {
1043 cpp_pedwarn (pfile,
1044 "C++ style comments are not allowed in ISO C89");
1045 cpp_pedwarn (pfile,
1046 "(this will be reported only once per input file)");
1047 buffer->warned_cplusplus_comments = 1;
1048 }
1049
1050 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1051 cpp_warning (pfile, "multi-line comment");
1052 }
1053 else if (c == '=')
1054 {
1055 result->type = CPP_DIV_EQ;
1056 break;
1057 }
1058 else
1059 {
1060 BACKUP ();
1061 result->type = CPP_DIV;
1062 break;
1063 }
1064
1065 if (!pfile->state.save_comments)
1066 {
1067 result->flags |= PREV_WHITE;
1068 goto update_tokens_line;
1069 }
1070
1071 /* Save the comment as a token in its own right. */
1072 save_comment (pfile, result, comment_start);
1073 break;
1074
1075 case '<':
1076 if (pfile->state.angled_headers)
1077 {
1078 result->type = CPP_HEADER_NAME;
1079 parse_string (pfile, result, '>');
1080 break;
1081 }
1082
1083 c = get_effective_char (pfile);
1084 if (c == '=')
1085 result->type = CPP_LESS_EQ;
1086 else if (c == '<')
1087 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1088 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1089 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1090 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1091 {
1092 result->type = CPP_OPEN_SQUARE;
1093 result->flags |= DIGRAPH;
1094 }
1095 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1096 {
1097 result->type = CPP_OPEN_BRACE;
1098 result->flags |= DIGRAPH;
1099 }
1100 else
1101 {
1102 BACKUP ();
1103 result->type = CPP_LESS;
1104 }
1105 break;
1106
1107 case '>':
1108 c = get_effective_char (pfile);
1109 if (c == '=')
1110 result->type = CPP_GREATER_EQ;
1111 else if (c == '>')
1112 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1113 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1114 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1115 else
1116 {
1117 BACKUP ();
1118 result->type = CPP_GREATER;
1119 }
1120 break;
1121
1122 case '%':
1123 c = get_effective_char (pfile);
1124 if (c == '=')
1125 result->type = CPP_MOD_EQ;
1126 else if (CPP_OPTION (pfile, digraphs) && c == ':')
1127 {
1128 result->flags |= DIGRAPH;
1129 result->type = CPP_HASH;
1130 if (get_effective_char (pfile) == '%')
1131 {
1132 const unsigned char *pos = buffer->cur;
1133
1134 if (get_effective_char (pfile) == ':')
1135 result->type = CPP_PASTE;
1136 else
1137 buffer->cur = pos - 1;
1138 }
1139 else
1140 BACKUP ();
1141 }
1142 else if (CPP_OPTION (pfile, digraphs) && c == '>')
1143 {
1144 result->flags |= DIGRAPH;
1145 result->type = CPP_CLOSE_BRACE;
1146 }
1147 else
1148 {
1149 BACKUP ();
1150 result->type = CPP_MOD;
1151 }
1152 break;
1153
1154 case '.':
1155 result->type = CPP_DOT;
1156 c = get_effective_char (pfile);
1157 if (c == '.')
1158 {
1159 const unsigned char *pos = buffer->cur;
1160
1161 if (get_effective_char (pfile) == '.')
1162 result->type = CPP_ELLIPSIS;
1163 else
1164 buffer->cur = pos - 1;
1165 }
1166 /* All known character sets have 0...9 contiguous. */
1167 else if (ISDIGIT (c))
1168 {
1169 result->type = CPP_NUMBER;
1170 parse_number (pfile, &result->val.str, c, 1);
1171 }
1172 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
1173 result->type = CPP_DOT_STAR;
1174 else
1175 BACKUP ();
1176 break;
1177
1178 case '+':
1179 c = get_effective_char (pfile);
1180 if (c == '+')
1181 result->type = CPP_PLUS_PLUS;
1182 else if (c == '=')
1183 result->type = CPP_PLUS_EQ;
1184 else
1185 {
1186 BACKUP ();
1187 result->type = CPP_PLUS;
1188 }
1189 break;
1190
1191 case '-':
1192 c = get_effective_char (pfile);
1193 if (c == '>')
1194 {
1195 result->type = CPP_DEREF;
1196 if (CPP_OPTION (pfile, cplusplus))
1197 {
1198 if (get_effective_char (pfile) == '*')
1199 result->type = CPP_DEREF_STAR;
1200 else
1201 BACKUP ();
1202 }
1203 }
1204 else if (c == '-')
1205 result->type = CPP_MINUS_MINUS;
1206 else if (c == '=')
1207 result->type = CPP_MINUS_EQ;
1208 else
1209 {
1210 BACKUP ();
1211 result->type = CPP_MINUS;
1212 }
1213 break;
1214
1215 case '&':
1216 c = get_effective_char (pfile);
1217 if (c == '&')
1218 result->type = CPP_AND_AND;
1219 else if (c == '=')
1220 result->type = CPP_AND_EQ;
1221 else
1222 {
1223 BACKUP ();
1224 result->type = CPP_AND;
1225 }
1226 break;
1227
1228 case '|':
1229 c = get_effective_char (pfile);
1230 if (c == '|')
1231 result->type = CPP_OR_OR;
1232 else if (c == '=')
1233 result->type = CPP_OR_EQ;
1234 else
1235 {
1236 BACKUP ();
1237 result->type = CPP_OR;
1238 }
1239 break;
1240
1241 case ':':
1242 c = get_effective_char (pfile);
1243 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1244 result->type = CPP_SCOPE;
1245 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1246 {
1247 result->flags |= DIGRAPH;
1248 result->type = CPP_CLOSE_SQUARE;
1249 }
1250 else
1251 {
1252 BACKUP ();
1253 result->type = CPP_COLON;
1254 }
1255 break;
1256
1257 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1258 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1259 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1260 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1261 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1262
1263 case '~': result->type = CPP_COMPL; break;
1264 case ',': result->type = CPP_COMMA; break;
1265 case '(': result->type = CPP_OPEN_PAREN; break;
1266 case ')': result->type = CPP_CLOSE_PAREN; break;
1267 case '[': result->type = CPP_OPEN_SQUARE; break;
1268 case ']': result->type = CPP_CLOSE_SQUARE; break;
1269 case '{': result->type = CPP_OPEN_BRACE; break;
1270 case '}': result->type = CPP_CLOSE_BRACE; break;
1271 case ';': result->type = CPP_SEMICOLON; break;
1272
1273 /* @ is a punctuator in Objective C. */
1274 case '@': result->type = CPP_ATSIGN; break;
1275
1276 random_char:
1277 default:
1278 result->type = CPP_OTHER;
1279 result->val.c = c;
1280 break;
1281 }
1282
1283 return result;
1284 }
1285
1286 /* An upper bound on the number of bytes needed to spell a token,
1287 including preceding whitespace. */
1288 unsigned int
1289 cpp_token_len (token)
1290 const cpp_token *token;
1291 {
1292 unsigned int len;
1293
1294 switch (TOKEN_SPELL (token))
1295 {
1296 default: len = 0; break;
1297 case SPELL_NUMBER:
1298 case SPELL_STRING: len = token->val.str.len; break;
1299 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1300 }
1301 /* 1 for whitespace, 4 for comment delimiters. */
1302 return len + 5;
1303 }
1304
1305 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1306 already contain the enough space to hold the token's spelling.
1307 Returns a pointer to the character after the last character
1308 written. */
1309 unsigned char *
1310 cpp_spell_token (pfile, token, buffer)
1311 cpp_reader *pfile; /* Would be nice to be rid of this... */
1312 const cpp_token *token;
1313 unsigned char *buffer;
1314 {
1315 switch (TOKEN_SPELL (token))
1316 {
1317 case SPELL_OPERATOR:
1318 {
1319 const unsigned char *spelling;
1320 unsigned char c;
1321
1322 if (token->flags & DIGRAPH)
1323 spelling
1324 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1325 else if (token->flags & NAMED_OP)
1326 goto spell_ident;
1327 else
1328 spelling = TOKEN_NAME (token);
1329
1330 while ((c = *spelling++) != '\0')
1331 *buffer++ = c;
1332 }
1333 break;
1334
1335 case SPELL_CHAR:
1336 *buffer++ = token->val.c;
1337 break;
1338
1339 spell_ident:
1340 case SPELL_IDENT:
1341 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1342 buffer += NODE_LEN (token->val.node);
1343 break;
1344
1345 case SPELL_NUMBER:
1346 memcpy (buffer, token->val.str.text, token->val.str.len);
1347 buffer += token->val.str.len;
1348 break;
1349
1350 case SPELL_STRING:
1351 {
1352 int left, right, tag;
1353 switch (token->type)
1354 {
1355 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1356 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1357 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1358 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1359 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1360 default:
1361 cpp_ice (pfile, "unknown string token %s\n", TOKEN_NAME (token));
1362 return buffer;
1363 }
1364 if (tag) *buffer++ = tag;
1365 *buffer++ = left;
1366 memcpy (buffer, token->val.str.text, token->val.str.len);
1367 buffer += token->val.str.len;
1368 *buffer++ = right;
1369 }
1370 break;
1371
1372 case SPELL_NONE:
1373 cpp_ice (pfile, "Unspellable token %s", TOKEN_NAME (token));
1374 break;
1375 }
1376
1377 return buffer;
1378 }
1379
1380 /* Returns a token as a null-terminated string. The string is
1381 temporary, and automatically freed later. Useful for diagnostics. */
1382 unsigned char *
1383 cpp_token_as_text (pfile, token)
1384 cpp_reader *pfile;
1385 const cpp_token *token;
1386 {
1387 unsigned int len = cpp_token_len (token);
1388 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1389
1390 end = cpp_spell_token (pfile, token, start);
1391 end[0] = '\0';
1392
1393 return start;
1394 }
1395
1396 /* Used by C front ends. Should really move to using cpp_token_as_text. */
1397 const char *
1398 cpp_type2name (type)
1399 enum cpp_ttype type;
1400 {
1401 return (const char *) token_spellings[type].name;
1402 }
1403
1404 /* Writes the spelling of token to FP, without any preceding space.
1405 Separated from cpp_spell_token for efficiency - to avoid stdio
1406 double-buffering. */
1407 void
1408 cpp_output_token (token, fp)
1409 const cpp_token *token;
1410 FILE *fp;
1411 {
1412 switch (TOKEN_SPELL (token))
1413 {
1414 case SPELL_OPERATOR:
1415 {
1416 const unsigned char *spelling;
1417 int c;
1418
1419 if (token->flags & DIGRAPH)
1420 spelling
1421 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1422 else if (token->flags & NAMED_OP)
1423 goto spell_ident;
1424 else
1425 spelling = TOKEN_NAME (token);
1426
1427 c = *spelling;
1428 do
1429 putc (c, fp);
1430 while ((c = *++spelling) != '\0');
1431 }
1432 break;
1433
1434 case SPELL_CHAR:
1435 putc (token->val.c, fp);
1436 break;
1437
1438 spell_ident:
1439 case SPELL_IDENT:
1440 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1441 break;
1442
1443 case SPELL_NUMBER:
1444 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1445 break;
1446
1447 case SPELL_STRING:
1448 {
1449 int left, right, tag;
1450 switch (token->type)
1451 {
1452 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1453 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1454 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1455 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1456 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1457 default:
1458 fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1459 return;
1460 }
1461 if (tag) putc (tag, fp);
1462 putc (left, fp);
1463 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1464 putc (right, fp);
1465 }
1466 break;
1467
1468 case SPELL_NONE:
1469 /* An error, most probably. */
1470 break;
1471 }
1472 }
1473
1474 /* Compare two tokens. */
1475 int
1476 _cpp_equiv_tokens (a, b)
1477 const cpp_token *a, *b;
1478 {
1479 if (a->type == b->type && a->flags == b->flags)
1480 switch (TOKEN_SPELL (a))
1481 {
1482 default: /* Keep compiler happy. */
1483 case SPELL_OPERATOR:
1484 return 1;
1485 case SPELL_CHAR:
1486 return a->val.c == b->val.c; /* Character. */
1487 case SPELL_NONE:
1488 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1489 case SPELL_IDENT:
1490 return a->val.node == b->val.node;
1491 case SPELL_NUMBER:
1492 case SPELL_STRING:
1493 return (a->val.str.len == b->val.str.len
1494 && !memcmp (a->val.str.text, b->val.str.text,
1495 a->val.str.len));
1496 }
1497
1498 return 0;
1499 }
1500
1501 /* Returns nonzero if a space should be inserted to avoid an
1502 accidental token paste for output. For simplicity, it is
1503 conservative, and occasionally advises a space where one is not
1504 needed, e.g. "." and ".2". */
1505
1506 int
1507 cpp_avoid_paste (pfile, token1, token2)
1508 cpp_reader *pfile;
1509 const cpp_token *token1, *token2;
1510 {
1511 enum cpp_ttype a = token1->type, b = token2->type;
1512 cppchar_t c;
1513
1514 if (token1->flags & NAMED_OP)
1515 a = CPP_NAME;
1516 if (token2->flags & NAMED_OP)
1517 b = CPP_NAME;
1518
1519 c = EOF;
1520 if (token2->flags & DIGRAPH)
1521 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1522 else if (token_spellings[b].category == SPELL_OPERATOR)
1523 c = token_spellings[b].name[0];
1524
1525 /* Quickly get everything that can paste with an '='. */
1526 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1527 return 1;
1528
1529 switch (a)
1530 {
1531 case CPP_GREATER: return c == '>' || c == '?';
1532 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1533 case CPP_PLUS: return c == '+';
1534 case CPP_MINUS: return c == '-' || c == '>';
1535 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1536 case CPP_MOD: return c == ':' || c == '>';
1537 case CPP_AND: return c == '&';
1538 case CPP_OR: return c == '|';
1539 case CPP_COLON: return c == ':' || c == '>';
1540 case CPP_DEREF: return c == '*';
1541 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1542 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1543 case CPP_NAME: return ((b == CPP_NUMBER
1544 && name_p (pfile, &token2->val.str))
1545 || b == CPP_NAME
1546 || b == CPP_CHAR || b == CPP_STRING); /* L */
1547 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1548 || c == '.' || c == '+' || c == '-');
1549 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1550 && token1->val.c == '@'
1551 && (b == CPP_NAME || b == CPP_STRING));
1552 default: break;
1553 }
1554
1555 return 0;
1556 }
1557
1558 /* Output all the remaining tokens on the current line, and a newline
1559 character, to FP. Leading whitespace is removed. If there are
1560 macros, special token padding is not performed. */
1561 void
1562 cpp_output_line (pfile, fp)
1563 cpp_reader *pfile;
1564 FILE *fp;
1565 {
1566 const cpp_token *token;
1567
1568 token = cpp_get_token (pfile);
1569 while (token->type != CPP_EOF)
1570 {
1571 cpp_output_token (token, fp);
1572 token = cpp_get_token (pfile);
1573 if (token->flags & PREV_WHITE)
1574 putc (' ', fp);
1575 }
1576
1577 putc ('\n', fp);
1578 }
1579
/* Return the numeric value of the hexadecimal digit C.  Aborts if C
   is not a hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (!hex_p (c))
    abort ();

  return hex_value (c);
}
1590
1591 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1592 failure if cpplib is not parsing C++ or C99. Such failure is
1593 silent, and no variables are updated. Otherwise returns 0, and
1594 warns if -Wtraditional.
1595
1596 [lex.charset]: The character designated by the universal character
1597 name \UNNNNNNNN is that character whose character short name in
1598 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1599 universal character name \uNNNN is that character whose character
1600 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1601 for a universal character name is less than 0x20 or in the range
1602 0x7F-0x9F (inclusive), or if the universal character name
1603 designates a character in the basic source character set, then the
1604 program is ill-formed.
1605
1606 We assume that wchar_t is Unicode, so we don't need to do any
1607 mapping. Is this ever wrong?
1608
1609 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1610 LIMIT is the end of the string or charconst. PSTR is updated to
1611 point after the UCS on return, and the UCS is written into PC. */
1612
1613 static int
1614 maybe_read_ucs (pfile, pstr, limit, pc)
1615 cpp_reader *pfile;
1616 const unsigned char **pstr;
1617 const unsigned char *limit;
1618 unsigned int *pc;
1619 {
1620 const unsigned char *p = *pstr;
1621 unsigned int code = 0;
1622 unsigned int c = *pc, length;
1623
1624 /* Only attempt to interpret a UCS for C++ and C99. */
1625 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1626 return 1;
1627
1628 if (CPP_WTRADITIONAL (pfile))
1629 cpp_warning (pfile, "the meaning of '\\%c' varies with -traditional", c);
1630
1631 length = (c == 'u' ? 4: 8);
1632
1633 if ((size_t) (limit - p) < length)
1634 {
1635 cpp_error (pfile, "incomplete universal-character-name");
1636 /* Skip to the end to avoid more diagnostics. */
1637 p = limit;
1638 }
1639 else
1640 {
1641 for (; length; length--, p++)
1642 {
1643 c = *p;
1644 if (ISXDIGIT (c))
1645 code = (code << 4) + hex_digit_value (c);
1646 else
1647 {
1648 cpp_error (pfile,
1649 "non-hex digit '%c' in universal-character-name", c);
1650 /* We shouldn't skip in case there are multibyte chars. */
1651 break;
1652 }
1653 }
1654 }
1655
1656 #ifdef TARGET_EBCDIC
1657 cpp_error (pfile, "universal-character-name on EBCDIC target");
1658 code = 0x3f; /* EBCDIC invalid character */
1659 #else
1660 /* True extended characters are OK. */
1661 if (code >= 0xa0
1662 && !(code & 0x80000000)
1663 && !(code >= 0xD800 && code <= 0xDFFF))
1664 ;
1665 /* The standard permits $, @ and ` to be specified as UCNs. We use
1666 hex escapes so that this also works with EBCDIC hosts. */
1667 else if (code == 0x24 || code == 0x40 || code == 0x60)
1668 ;
1669 /* Don't give another error if one occurred above. */
1670 else if (length == 0)
1671 cpp_error (pfile, "universal-character-name out of range");
1672 #endif
1673
1674 *pstr = p;
1675 *pc = code;
1676 return 0;
1677 }
1678
1679 /* Interpret an escape sequence, and return its value. PSTR points to
1680 the input pointer, which is just after the backslash. LIMIT is how
1681 much text we have. MASK is a bitmask for the precision for the
1682 destination type (char or wchar_t). TRADITIONAL, if true, does not
1683 interpret escapes that did not exist in traditional C.
1684
1685 Handles all relevant diagnostics. */
1686
1687 unsigned int
1688 cpp_parse_escape (pfile, pstr, limit, mask, traditional)
1689 cpp_reader *pfile;
1690 const unsigned char **pstr;
1691 const unsigned char *limit;
1692 unsigned HOST_WIDE_INT mask;
1693 int traditional;
1694 {
1695 int unknown = 0;
1696 const unsigned char *str = *pstr;
1697 unsigned int c = *str++;
1698
1699 switch (c)
1700 {
1701 case '\\': case '\'': case '"': case '?': break;
1702 case 'b': c = TARGET_BS; break;
1703 case 'f': c = TARGET_FF; break;
1704 case 'n': c = TARGET_NEWLINE; break;
1705 case 'r': c = TARGET_CR; break;
1706 case 't': c = TARGET_TAB; break;
1707 case 'v': c = TARGET_VT; break;
1708
1709 case '(': case '{': case '[': case '%':
1710 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1711 '\%' is used to prevent SCCS from getting confused. */
1712 unknown = CPP_PEDANTIC (pfile);
1713 break;
1714
1715 case 'a':
1716 if (CPP_WTRADITIONAL (pfile))
1717 cpp_warning (pfile, "the meaning of '\\a' varies with -traditional");
1718 if (!traditional)
1719 c = TARGET_BELL;
1720 break;
1721
1722 case 'e': case 'E':
1723 if (CPP_PEDANTIC (pfile))
1724 cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1725 c = TARGET_ESC;
1726 break;
1727
1728 case 'u': case 'U':
1729 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1730 break;
1731
1732 case 'x':
1733 if (CPP_WTRADITIONAL (pfile))
1734 cpp_warning (pfile, "the meaning of '\\x' varies with -traditional");
1735
1736 if (!traditional)
1737 {
1738 unsigned int i = 0, overflow = 0;
1739 int digits_found = 0;
1740
1741 while (str < limit)
1742 {
1743 c = *str;
1744 if (! ISXDIGIT (c))
1745 break;
1746 str++;
1747 overflow |= i ^ (i << 4 >> 4);
1748 i = (i << 4) + hex_digit_value (c);
1749 digits_found = 1;
1750 }
1751
1752 if (!digits_found)
1753 cpp_error (pfile, "\\x used with no following hex digits");
1754
1755 if (overflow | (i != (i & mask)))
1756 {
1757 cpp_pedwarn (pfile, "hex escape sequence out of range");
1758 i &= mask;
1759 }
1760 c = i;
1761 }
1762 break;
1763
1764 case '0': case '1': case '2': case '3':
1765 case '4': case '5': case '6': case '7':
1766 {
1767 unsigned int i = c - '0';
1768 int count = 0;
1769
1770 while (str < limit && ++count < 3)
1771 {
1772 c = *str;
1773 if (c < '0' || c > '7')
1774 break;
1775 str++;
1776 i = (i << 3) + c - '0';
1777 }
1778
1779 if (i != (i & mask))
1780 {
1781 cpp_pedwarn (pfile, "octal escape sequence out of range");
1782 i &= mask;
1783 }
1784 c = i;
1785 }
1786 break;
1787
1788 default:
1789 unknown = 1;
1790 break;
1791 }
1792
1793 if (unknown)
1794 {
1795 if (ISGRAPH (c))
1796 cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1797 else
1798 cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1799 }
1800
1801 if (c > mask)
1802 cpp_pedwarn (pfile, "escape sequence out of range for character");
1803
1804 *pstr = str;
1805 return c;
1806 }
1807
/* Default the maximum character widths to CHAR_TYPE_SIZE and
   WCHAR_TYPE_SIZE when no definition has been provided already.  */
#if !defined (MAX_CHAR_TYPE_SIZE)
# define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
#endif

#if !defined (MAX_WCHAR_TYPE_SIZE)
# define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
#endif
1815
1816 /* Interpret a (possibly wide) character constant in TOKEN.
1817 WARN_MULTI warns about multi-character charconsts, if not
1818 TRADITIONAL. TRADITIONAL also indicates not to interpret escapes
1819 that did not exist in traditional C. PCHARS_SEEN points to a
1820 variable that is filled in with the number of characters seen. */
1821 HOST_WIDE_INT
1822 cpp_interpret_charconst (pfile, token, warn_multi, traditional, pchars_seen)
1823 cpp_reader *pfile;
1824 const cpp_token *token;
1825 int warn_multi;
1826 int traditional;
1827 unsigned int *pchars_seen;
1828 {
1829 const unsigned char *str = token->val.str.text;
1830 const unsigned char *limit = str + token->val.str.len;
1831 unsigned int chars_seen = 0;
1832 unsigned int width, max_chars, c;
1833 unsigned HOST_WIDE_INT mask;
1834 HOST_WIDE_INT result = 0;
1835
1836 #ifdef MULTIBYTE_CHARS
1837 (void) local_mbtowc (NULL, NULL, 0);
1838 #endif
1839
1840 /* Width in bits. */
1841 if (token->type == CPP_CHAR)
1842 width = MAX_CHAR_TYPE_SIZE;
1843 else
1844 width = MAX_WCHAR_TYPE_SIZE;
1845
1846 if (width < HOST_BITS_PER_WIDE_INT)
1847 mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
1848 else
1849 mask = ~0;
1850 max_chars = HOST_BITS_PER_WIDE_INT / width;
1851
1852 while (str < limit)
1853 {
1854 #ifdef MULTIBYTE_CHARS
1855 wchar_t wc;
1856 int char_len;
1857
1858 char_len = local_mbtowc (&wc, str, limit - str);
1859 if (char_len == -1)
1860 {
1861 cpp_warning (pfile, "ignoring invalid multibyte character");
1862 c = *str++;
1863 }
1864 else
1865 {
1866 str += char_len;
1867 c = wc;
1868 }
1869 #else
1870 c = *str++;
1871 #endif
1872
1873 if (c == '\\')
1874 c = cpp_parse_escape (pfile, &str, limit, mask, traditional);
1875
1876 #ifdef MAP_CHARACTER
1877 if (ISPRINT (c))
1878 c = MAP_CHARACTER (c);
1879 #endif
1880
1881 /* Merge character into result; ignore excess chars. */
1882 if (++chars_seen <= max_chars)
1883 {
1884 if (width < HOST_BITS_PER_WIDE_INT)
1885 result = (result << width) | (c & mask);
1886 else
1887 result = c;
1888 }
1889 }
1890
1891 if (chars_seen == 0)
1892 cpp_error (pfile, "empty character constant");
1893 else if (chars_seen > max_chars)
1894 {
1895 chars_seen = max_chars;
1896 cpp_warning (pfile, "character constant too long");
1897 }
1898 else if (chars_seen > 1 && !traditional && warn_multi)
1899 cpp_warning (pfile, "multi-character character constant");
1900
1901 /* If char type is signed, sign-extend the constant. The
1902 __CHAR_UNSIGNED__ macro is set by the driver if appropriate. */
1903 if (token->type == CPP_CHAR && chars_seen)
1904 {
1905 unsigned int nbits = chars_seen * width;
1906
1907 mask = (unsigned HOST_WIDE_INT) ~0 >> (HOST_BITS_PER_WIDE_INT - nbits);
1908 if (pfile->spec_nodes.n__CHAR_UNSIGNED__->type == NT_MACRO
1909 || ((result >> (nbits - 1)) & 1) == 0)
1910 result &= mask;
1911 else
1912 result |= ~mask;
1913 }
1914
1915 *pchars_seen = chars_seen;
1916 return result;
1917 }
1918
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the room left in BUFF.
   Every macro argument is parenthesized so that an argument
   containing operators expands correctly.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif

/* Never instantiated.  The offset of U within this struct gives the
   alignment the compiler applies to a union of a double and a
   pointer, which DEFAULT_ALIGNMENT uses as the buffer alignment.  */
struct dummy
{
  char c;
  union
  {
    double d;
    int *p;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))
/* Round SIZE up to a multiple of ALIGN.  The mask arithmetic is only
   correct when ALIGN is a power of two.  */
#define CPP_ALIGN(size, align) (((size) + ((align) - 1)) & ~((align) - 1))
1946
1947 /* Create a new allocation buffer. Place the control block at the end
1948 of the buffer, so that buffer overflows will cause immediate chaos. */
1949 static _cpp_buff *
1950 new_buff (len)
1951 size_t len;
1952 {
1953 _cpp_buff *result;
1954 unsigned char *base;
1955
1956 if (len < MIN_BUFF_SIZE)
1957 len = MIN_BUFF_SIZE;
1958 len = CPP_ALIGN (len, DEFAULT_ALIGNMENT);
1959
1960 base = xmalloc (len + sizeof (_cpp_buff));
1961 result = (_cpp_buff *) (base + len);
1962 result->base = base;
1963 result->cur = base;
1964 result->limit = base + len;
1965 result->next = NULL;
1966 return result;
1967 }
1968
1969 /* Place a chain of unwanted allocation buffers on the free list. */
1970 void
1971 _cpp_release_buff (pfile, buff)
1972 cpp_reader *pfile;
1973 _cpp_buff *buff;
1974 {
1975 _cpp_buff *end = buff;
1976
1977 while (end->next)
1978 end = end->next;
1979 end->next = pfile->free_buffs;
1980 pfile->free_buffs = buff;
1981 }
1982
1983 /* Return a free buffer of size at least MIN_SIZE. */
1984 _cpp_buff *
1985 _cpp_get_buff (pfile, min_size)
1986 cpp_reader *pfile;
1987 size_t min_size;
1988 {
1989 _cpp_buff *result, **p;
1990
1991 for (p = &pfile->free_buffs;; p = &(*p)->next)
1992 {
1993 size_t size;
1994
1995 if (*p == NULL)
1996 return new_buff (min_size);
1997 result = *p;
1998 size = result->limit - result->base;
1999 /* Return a buffer that's big enough, but don't waste one that's
2000 way too big. */
2001 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2002 break;
2003 }
2004
2005 *p = result->next;
2006 result->next = NULL;
2007 result->cur = result->base;
2008 return result;
2009 }
2010
2011 /* Creates a new buffer with enough space to hold the uncommitted
2012 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2013 the excess bytes to the new buffer. Chains the new buffer after
2014 BUFF, and returns the new buffer. */
2015 _cpp_buff *
2016 _cpp_append_extend_buff (pfile, buff, min_extra)
2017 cpp_reader *pfile;
2018 _cpp_buff *buff;
2019 size_t min_extra;
2020 {
2021 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2022 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2023
2024 buff->next = new_buff;
2025 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2026 return new_buff;
2027 }
2028
2029 /* Creates a new buffer with enough space to hold the uncommitted
2030 remaining bytes of the buffer pointed to by BUFF, and at least
2031 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2032 Chains the new buffer before the buffer pointed to by BUFF, and
2033 updates the pointer to point to the new buffer. */
2034 void
2035 _cpp_extend_buff (pfile, pbuff, min_extra)
2036 cpp_reader *pfile;
2037 _cpp_buff **pbuff;
2038 size_t min_extra;
2039 {
2040 _cpp_buff *new_buff, *old_buff = *pbuff;
2041 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2042
2043 new_buff = _cpp_get_buff (pfile, size);
2044 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2045 new_buff->next = old_buff;
2046 *pbuff = new_buff;
2047 }
2048
2049 /* Free a chain of buffers starting at BUFF. */
2050 void
2051 _cpp_free_buff (buff)
2052 _cpp_buff *buff;
2053 {
2054 _cpp_buff *next;
2055
2056 for (; buff; buff = next)
2057 {
2058 next = buff->next;
2059 free (buff->base);
2060 }
2061 }
2062
2063 /* Allocate permanent, unaligned storage of length LEN. */
2064 unsigned char *
2065 _cpp_unaligned_alloc (pfile, len)
2066 cpp_reader *pfile;
2067 size_t len;
2068 {
2069 _cpp_buff *buff = pfile->u_buff;
2070 unsigned char *result = buff->cur;
2071
2072 if (len > (size_t) (buff->limit - result))
2073 {
2074 buff = _cpp_get_buff (pfile, len);
2075 buff->next = pfile->u_buff;
2076 pfile->u_buff = buff;
2077 result = buff->cur;
2078 }
2079
2080 buff->cur = result + len;
2081 return result;
2082 }
2083
2084 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2085 That buffer is used for growing allocations when saving macro
2086 replacement lists in a #define, and when parsing an answer to an
2087 assertion in #assert, #unassert or #if (and therefore possibly
2088 whilst expanding macros). It therefore must not be used by any
2089 code that they might call: specifically the lexer and the guts of
2090 the macro expander.
2091
2092 All existing other uses clearly fit this restriction: storing
2093 registered pragmas during initialization. */
2094 unsigned char *
2095 _cpp_aligned_alloc (pfile, len)
2096 cpp_reader *pfile;
2097 size_t len;
2098 {
2099 _cpp_buff *buff = pfile->a_buff;
2100 unsigned char *result = buff->cur;
2101
2102 if (len > (size_t) (buff->limit - result))
2103 {
2104 buff = _cpp_get_buff (pfile, len);
2105 buff->next = pfile->a_buff;
2106 pfile->a_buff = buff;
2107 result = buff->cur;
2108 }
2109
2110 buff->cur = result + len;
2111 return result;
2112 }