libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; if not, write to the Free Software
  20 Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
   /* The spelling category of a token type, stored per entry of the
      token_spellings table.  SPELL_OPERATOR is the category the OP table
      macro assigns; the other values are selected by name through the TK
      table macro.  NOTE(review): what each category means for spelling a
      token is decided by the code that consumes TOKEN_SPELL, which is
      not part of the text shown here -- confirm there.  */
   27 enum spell_type
   28 {
   29   SPELL_OPERATOR = 0,
   30   SPELL_IDENT,
   31   SPELL_LITERAL,
   32   SPELL_NONE
   33 };
   34
      /* One row of the token_spellings table: the spelling category of a
         token type together with a spelling (OP entries) or the
         stringified enumerator name (TK entries).  */
   35 struct token_spelling
   36 {
   37   enum spell_type category;
   38   const unsigned char *name;
   39 };
  40
   /* Spellings of the six digraphs.  NOTE(review): the order presumably
      has to match the way the table is indexed by its user, which lies
      outside the text shown here -- confirm before reordering.  */
   41 static const unsigned char *const digraph_spellings[] =
   42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
   43
      /* Expand TTYPE_TABLE into one token_spelling initializer per token
         type: OP (e, s) yields the SPELL_OPERATOR category with spelling
         S, while TK (e, s) yields category SPELL_<s> with the enumerator
         name E turned into a string.  Both helper macros are local to the
         table definition and are undefined again right after it.  */
   44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
   45 #define TK(e, s) { SPELL_ ## s,    UC #e },
   46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
   47 #undef OP
   48 #undef TK
   49
      /* Look up the spelling category, respectively the name, recorded
         in token_spellings for the type of TOKEN.  */
   50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
   51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
   /* Forward declarations of file-local (static) helpers, so that they
      can be used before their definitions.  */
   53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
   54 static int skip_line_comment (cpp_reader *);
   55 static void skip_whitespace (cpp_reader *, cppchar_t);
   56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
   57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
   58 static void store_comment (cpp_reader *, cpp_token *);
   59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
   60                             unsigned int, enum cpp_ttype);
   61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
   62 static int name_p (cpp_reader *, const cpp_string *);
   63 static tokenrun *next_tokenrun (tokenrun *);
   64
   65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
   98 /* Clean the next logical line of PFILE's current buffer in place, so
   99    that it contains no escaped newlines and, when trigraph
         conversion is enabled, no trigraphs.  This is a time-critical
         inner loop.

         On entry buffer->next_line points at the text to clean.  On
         return buffer->cur and buffer->line_base point at the start of
         the line, a '\n' has been stored at the end of the cleaned text,
         and buffer->next_line points just past the input that was
         consumed.  A line note is recorded for every escaped newline and
         every trigraph met on the way, followed by one sentinel note.
         When buffer->from_stage3 is set the text is only scanned for its
         line terminator and is not transformed.  */
  100 void
  101 _cpp_clean_line (cpp_reader *pfile)
  102 {
  103   cpp_buffer *buffer;
  104   const uchar *s;
  105   uchar c, *d, *p;
  106
  107   buffer = pfile->buffer;
  108   buffer->cur_note = buffer->notes_used = 0;
  109   buffer->cur = buffer->line_base = buffer->next_line;
  110   buffer->need_line = false;
        /* Every scanning loop below advances S before reading through it,
           so start one byte in front of the line.  */
  111   s = buffer->next_line - 1;
  112
  113   if (!buffer->from_stage3)
  114     {
  115       const uchar *pbackslash = NULL;
  116
  117       /* Short circuit for the common case of an un-escaped line with
  118          no trigraphs.  The primary win here is by not writing any
  119          data back to memory until we have to.  */
  120       for (;;)
  121         {
  122           c = *++s;
  123           if (__builtin_expect (c == '\n', false)
  124               || __builtin_expect (c == '\r', false))
  125             {
  126               d = (uchar *) s;
  127
  128               if (__builtin_expect (s == buffer->rlimit, false))
  129                 goto done;
  130
  131               /* DOS line ending? */
  132               if (__builtin_expect (c == '\r', false)
  133                   && s[1] == '\n')
  134                 {
  135                   s++;
  136                   if (s == buffer->rlimit)
  137                     goto done;
  138                 }
  139
                    /* No backslash seen on this line, so the newline cannot
                       be escaped.  */
  140               if (__builtin_expect (pbackslash == NULL, true))
  141                 goto done;
  142
  143               /* Check for escaped newline.  */
  144               p = d;
  145               while (is_nvspace (p[-1]))
  146                 p--;
  147               if (p - 1 != pbackslash)
  148                 goto done;
  149
  150               /* Have an escaped newline; process it and proceed to
  151                  the slow path.  */
                    /* The note type is ' ' when whitespace separated the
                       backslash from the newline and '\\' otherwise.  */
  152               add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
                    /* Leave D two bytes before the splice point, so that the
                       slow path's pre-increment stores the next character
                       over the backslash at P - 1.  buffer->next_line is
                       reused from here on as the lower bound of the
                       backward scan for a backslash in the slow path.  */
  153               d = p - 2;
  154               buffer->next_line = p - 1;
  155               break;
  156             }
  157           if (__builtin_expect (c == '\\', false))
  158             pbackslash = s;
  159           else if (__builtin_expect (c == '?', false)
  160                    && __builtin_expect (s[1] == '?', false)
  161                    && _cpp_trigraph_map[s[2]])
  162             {
  163               /* Have a trigraph.  We may or may not have to convert
  164                  it.  Add a line note regardless, for -Wtrigraphs.  */
  165               add_line_note (buffer, s, s[2]);
  166               if (CPP_OPTION (pfile, trigraphs))
  167                 {
  168                   /* We do, and that means we have to switch to the
  169                      slow path.  */
  170                   d = (uchar *) s;
  171                   *d = _cpp_trigraph_map[s[2]];
  172                   s += 2;
  173                   break;
  174                 }
  175             }
  176         }
  177
  178
            /* Slow path: copy the input byte by byte from S down to D,
               splicing out escaped newlines and, if enabled, replacing
               trigraphs as they are met.  */
  179       for (;;)
  180         {
  181           c = *++s;
  182           *++d = c;
  183
  184           if (c == '\n' || c == '\r')
  185             {
  186                   /* Handle DOS line endings.  */
  187               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
  188                 s++;
  189               if (s == buffer->rlimit)
  190                 break;
  191
  192               /* Escaped?  */
  193               p = d;
  194               while (p != buffer->next_line && is_nvspace (p[-1]))
  195                 p--;
  196               if (p == buffer->next_line || p[-1] != '\\')
  197                 break;
  198
  199               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
  200               d = p - 2;
  201               buffer->next_line = p - 1;
  202             }
  203           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
  204             {
  205               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
  206               add_line_note (buffer, d, s[2]);
  207               if (CPP_OPTION (pfile, trigraphs))
  208                 {
  209                   *d = _cpp_trigraph_map[s[2]];
  210                   s += 2;
  211                 }
  212             }
  213         }
  214     }
  215   else
  216     {
            /* Stage-3 text is not cleaned; just find the end of the
               line.  */
  217       do
  218         s++;
  219       while (*s != '\n' && *s != '\r');
  220       d = (uchar *) s;
  221
  222       /* Handle DOS line endings.  */
  223       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
  224         s++;
  225     }
  226
  227  done:
        /* Terminate the cleaned line with a plain newline, whatever line
           ending the input used.  */
  228   *d = '\n';
  229   /* A sentinel note that should never be processed.  */
  230   add_line_note (buffer, d + 1, '\n');
  231   buffer->next_line = s + 1;
  232 }
 233
 234 /* Return true if the trigraph indicated by NOTE should be warned
 235    about in a comment.  */
 236 static bool
 237 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 238 {
 239   const uchar *p;
 240
 241   /* Within comments we don't warn about trigraphs, unless the
 242      trigraph forms an escaped newline, as that may change
 243      behavior.  */
 244   if (note->type != '/')
 245     return false;
 246
 247   /* If -trigraphs, then this was an escaped newline iff the next note
 248      is coincident.  */
 249   if (CPP_OPTION (pfile, trigraphs))
 250     return note[1].pos == note->pos;
 251
 252   /* Otherwise, see if this forms an escaped newline.  */
 253   p = note->pos + 3;
 254   while (is_nvspace (*p))
 255     p++;
 256
 257   /* There might have been escaped newlines between the trigraph and the
 258      newline we found.  Hence the position test.  */
 259   return (*p == '\n' && p < note[1].pos);
 260 }
 261
 262 /* Process the notes created by add_line_note as far as the current
 263    location.  */
 264 void
 265 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 266 {
 267   cpp_buffer *buffer = pfile->buffer;
 268
 269   for (;;)
 270     {
 271       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 272       unsigned int col;
 273
 274       if (note->pos > buffer->cur)
 275         break;
 276
 277       buffer->cur_note++;
 278       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 279
 280       if (note->type == '\\' || note->type == ' ')
 281         {
 282           if (note->type == ' ' && !in_comment)
 283             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 284                                  "backslash and newline separated by space");
 285
 286           if (buffer->next_line > buffer->rlimit)
 287             {
 288               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 289                                    "backslash-newline at end of file");
 290               /* Prevent "no newline at end of file" warning.  */
 291               buffer->next_line = buffer->rlimit;
 292             }
 293
 294           buffer->line_base = note->pos;
 295           CPP_INCREMENT_LINE (pfile, 0);
 296         }
 297       else if (_cpp_trigraph_map[note->type])
 298         {
 299           if (CPP_OPTION (pfile, warn_trigraphs)
 300               && (!in_comment || warn_in_comment (pfile, note)))
 301             {
 302               if (CPP_OPTION (pfile, trigraphs))
 303                 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 304                                      "trigraph ??%c converted to %c",
 305                                      note->type,
 306                                      (int) _cpp_trigraph_map[note->type]);
 307               else
 308                 {
 309                   cpp_error_with_line
 310                     (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 311                      "trigraph ??%c ignored, use -trigraphs to enable",
 312                      note->type);
 313                 }
 314             }
 315         }
 316       else
 317         abort ();
 318     }
 319 }
 320
 321 /* Skip a C-style block comment.  We find the end of the comment by
 322    seeing if an asterisk is before every '/' we encounter.  Returns
 323    nonzero if comment terminated by EOF, zero otherwise.
 324
 325    Buffer->cur points to the initial asterisk of the comment.  */
 326 bool
 327 _cpp_skip_block_comment (cpp_reader *pfile)
 328 {
 329   cpp_buffer *buffer = pfile->buffer;
 330   const uchar *cur = buffer->cur;
 331   uchar c;
 332
 333   cur++;
 334   if (*cur == '/')
 335     cur++;
 336
 337   for (;;)
 338     {
 339       /* People like decorating comments with '*', so check for '/'
 340          instead for efficiency.  */
 341       c = *cur++;
 342
 343       if (c == '/')
 344         {
 345           if (cur[-2] == '*')
 346             break;
 347
 348           /* Warn about potential nested comments, but not if the '/'
 349              comes immediately before the true comment delimiter.
 350              Don't bother to get it right across escaped newlines.  */
 351           if (CPP_OPTION (pfile, warn_comments)
 352               && cur[0] == '*' && cur[1] != '/')
 353             {
 354               buffer->cur = cur;
 355               cpp_error_with_line (pfile, CPP_DL_WARNING,
 356                                    pfile->line_table->highest_line, CPP_BUF_COL (buffer),
 357                                    "\"/*\" within comment");
 358             }
 359         }
 360       else if (c == '\n')
 361         {
 362           unsigned int cols;
 363           buffer->cur = cur - 1;
 364           _cpp_process_line_notes (pfile, true);
 365           if (buffer->next_line >= buffer->rlimit)
 366             return true;
 367           _cpp_clean_line (pfile);
 368
 369           cols = buffer->next_line - buffer->line_base;
 370           CPP_INCREMENT_LINE (pfile, cols);
 371
 372           cur = buffer->cur;
 373         }
 374     }
 375
 376   buffer->cur = cur;
 377   _cpp_process_line_notes (pfile, true);
 378   return false;
 379 }
 380
 381 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 382    terminating newline.  Handles escaped newlines.  Returns nonzero
 383    if a multiline comment.  */
 384 static int
 385 skip_line_comment (cpp_reader *pfile)
 386 {
 387   cpp_buffer *buffer = pfile->buffer;
 388   source_location orig_line = pfile->line_table->highest_line;
 389
 390   while (*buffer->cur != '\n')
 391     buffer->cur++;
 392
 393   _cpp_process_line_notes (pfile, true);
 394   return orig_line != pfile->line_table->highest_line;
 395 }
 396
 397 /* Skips whitespace, saving the next non-whitespace character.  */
 398 static void
 399 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 400 {
 401   cpp_buffer *buffer = pfile->buffer;
 402   bool saw_NUL = false;
 403
 404   do
 405     {
 406       /* Horizontal space always OK.  */
 407       if (c == ' ' || c == '\t')
 408         ;
 409       /* Just \f \v or \0 left.  */
 410       else if (c == '\0')
 411         saw_NUL = true;
 412       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 413         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 414                              CPP_BUF_COL (buffer),
 415                              "%s in preprocessing directive",
 416                              c == '\f' ? "form feed" : "vertical tab");
 417
 418       c = *buffer->cur++;
 419     }
 420   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 421   while (is_nvspace (c));
 422
 423   if (saw_NUL)
 424     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 425
 426   buffer->cur--;
 427 }
 428
 429 /* See if the characters of a number token are valid in a name (no
 430    '.', '+' or '-').  */
 431 static int
 432 name_p (cpp_reader *pfile, const cpp_string *string)
 433 {
 434   unsigned int i;
 435
 436   for (i = 0; i < string->len; i++)
 437     if (!is_idchar (string->text[i]))
 438       return 0;
 439
 440   return 1;
 441 }
 442
 443 /* After parsing an identifier or other sequence, produce a warning about
 444    sequences not in NFC/NFKC.  */
 445 static void
 446 warn_about_normalization (cpp_reader *pfile,
 447                           const cpp_token *token,
 448                           const struct normalize_state *s)
 449 {
 450   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 451       && !pfile->state.skipping)
 452     {
 453       /* Make sure that the token is printed using UCNs, even
 454          if we'd otherwise happily print UTF-8.  */
 455       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 456       size_t sz;
 457
 458       sz = cpp_spell_token (pfile, token, buf, false) - buf;
 459       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 460         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 461                              "`%.*s' is not in NFKC", (int) sz, buf);
 462       else
 463         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 464                              "`%.*s' is not in NFC", (int) sz, buf);
 465     }
 466 }
 467
 468 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
 469    an identifier.  FIRST is TRUE if this starts an identifier.  */
 470 static bool
 471 forms_identifier_p (cpp_reader *pfile, int first,
 472                     struct normalize_state *state)
 473 {
 474   cpp_buffer *buffer = pfile->buffer;
 475
 476   if (*buffer->cur == '$')
 477     {
 478       if (!CPP_OPTION (pfile, dollars_in_ident))
 479         return false;
 480
 481       buffer->cur++;
 482       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 483         {
 484           CPP_OPTION (pfile, warn_dollars) = 0;
 485           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 486         }
 487
 488       return true;
 489     }
 490
 491   /* Is this a syntactically valid UCN?  */
 492   if (CPP_OPTION (pfile, extended_identifiers)
 493       && *buffer->cur == '\\'
 494       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
 495     {
 496       buffer->cur += 2;
 497       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 498                           state))
 499         return true;
 500       buffer->cur -= 2;
 501     }
 502
 503   return false;
 504 }
 505
 506 /* Lex an identifier starting at BUFFER->CUR - 1.  */
 507 static cpp_hashnode *
 508 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
 509                 struct normalize_state *nst)
 510 {
 511   cpp_hashnode *result;
 512   const uchar *cur;
 513   unsigned int len;
 514   unsigned int hash = HT_HASHSTEP (0, *base);
 515
 516   cur = pfile->buffer->cur;
 517   if (! starts_ucn)
 518     while (ISIDNUM (*cur))
 519       {
 520         hash = HT_HASHSTEP (hash, *cur);
 521         cur++;
 522       }
 523   pfile->buffer->cur = cur;
 524   if (starts_ucn || forms_identifier_p (pfile, false, nst))
 525     {
 526       /* Slower version for identifiers containing UCNs (or $).  */
 527       do {
 528         while (ISIDNUM (*pfile->buffer->cur))
 529           {
 530             pfile->buffer->cur++;
 531             NORMALIZE_STATE_UPDATE_IDNUM (nst);
 532           }
 533       } while (forms_identifier_p (pfile, false, nst));
 534       result = _cpp_interpret_identifier (pfile, base,
 535                                           pfile->buffer->cur - base);
 536     }
 537   else
 538     {
 539       len = cur - base;
 540       hash = HT_HASHFINISH (hash, len);
 541
 542       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
 543                                                   base, len, hash, HT_ALLOC));
 544     }
 545
 546   /* Rarely, identifiers require diagnostics when lexed.  */
 547   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 548                         && !pfile->state.skipping, 0))
 549     {
 550       /* It is allowed to poison the same identifier twice.  */
 551       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 552         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 553                    NODE_NAME (result));
 554
 555       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 556          replacement list of a variadic macro.  */
 557       if (result == pfile->spec_nodes.n__VA_ARGS__
 558           && !pfile->state.va_args_ok)
 559         cpp_error (pfile, CPP_DL_PEDWARN,
 560                    "__VA_ARGS__ can only appear in the expansion"
 561                    " of a C99 variadic macro");
 562     }
 563
 564   return result;
 565 }
 566
 567 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
 568 static void
 569 lex_number (cpp_reader *pfile, cpp_string *number,
 570             struct normalize_state *nst)
 571 {
 572   const uchar *cur;
 573   const uchar *base;
 574   uchar *dest;
 575
 576   base = pfile->buffer->cur - 1;
 577   do
 578     {
 579       cur = pfile->buffer->cur;
 580
 581       /* N.B. ISIDNUM does not include $.  */
 582       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
 583         {
 584           cur++;
 585           NORMALIZE_STATE_UPDATE_IDNUM (nst);
 586         }
 587
 588       pfile->buffer->cur = cur;
 589     }
 590   while (forms_identifier_p (pfile, false, nst));
 591
 592   number->len = cur - base;
 593   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
 594   memcpy (dest, base, number->len);
 595   dest[number->len] = '\0';
 596   number->text = dest;
 597 }
 598
 599 /* Create a token of type TYPE with a literal spelling.  */
 600 static void
 601 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
 602                 unsigned int len, enum cpp_ttype type)
 603 {
 604   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
 605
 606   memcpy (dest, base, len);
 607   dest[len] = '\0';
 608   token->type = type;
 609   token->val.str.len = len;
 610   token->val.str.text = dest;
 611 }
 612
 613 /* Lexes a string, character constant, or angle-bracketed header file
 614    name.  The stored string contains the spelling, including opening
 615    quote and leading any leading 'L', 'u' or 'U'.  It returns the type
 616    of the literal, or CPP_OTHER if it was not properly terminated.
 617
 618    The spelling is NUL-terminated, but it is not guaranteed that this
 619    is the first NUL since embedded NULs are preserved.  */
 620 static void
 621 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 622 {
 623   bool saw_NUL = false;
 624   const uchar *cur;
 625   cppchar_t terminator;
 626   enum cpp_ttype type;
 627
 628   cur = base;
 629   terminator = *cur++;
 630   if (terminator == 'L' || terminator == 'u' || terminator == 'U')
 631     terminator = *cur++;
 632   if (terminator == '\"')
 633     type = (*base == 'L' ? CPP_WSTRING :
 634             *base == 'U' ? CPP_STRING32 :
 635             *base == 'u' ? CPP_STRING16 : CPP_STRING);
 636   else if (terminator == '\'')
 637     type = (*base == 'L' ? CPP_WCHAR :
 638             *base == 'U' ? CPP_CHAR32 :
 639             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
 640   else
 641     terminator = '>', type = CPP_HEADER_NAME;
 642
 643   for (;;)
 644     {
 645       cppchar_t c = *cur++;
 646
 647       /* In #include-style directives, terminators are not escapable.  */
 648       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
 649         cur++;
 650       else if (c == terminator)
 651         break;
 652       else if (c == '\n')
 653         {
 654           cur--;
 655           type = CPP_OTHER;
 656           break;
 657         }
 658       else if (c == '\0')
 659         saw_NUL = true;
 660     }
 661
 662   if (saw_NUL && !pfile->state.skipping)
 663     cpp_error (pfile, CPP_DL_WARNING,
 664                "null character(s) preserved in literal");
 665
 666   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
 667     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
 668                (int) terminator);
 669
 670   pfile->buffer->cur = cur;
 671   create_literal (pfile, token, base, cur - base, type);
 672 }
 673
 674 /* Return the comment table. The client may not make any assumption
 675    about the ordering of the table.  */
 676 cpp_comment_table *
 677 cpp_get_comments (cpp_reader *pfile)
 678 {
 679   return &pfile->comments;
 680 }
 681
 682 /* Append a comment to the end of the comment table. */
 683 static void
 684 store_comment (cpp_reader *pfile, cpp_token *token)
 685 {
 686   int len;
 687
 688   if (pfile->comments.allocated == 0)
 689     {
 690       pfile->comments.allocated = 256;
 691       pfile->comments.entries = (cpp_comment *) xmalloc
 692         (pfile->comments.allocated * sizeof (cpp_comment));
 693     }
 694
 695   if (pfile->comments.count == pfile->comments.allocated)
 696     {
 697       pfile->comments.allocated *= 2;
 698       pfile->comments.entries = (cpp_comment *) xrealloc
 699         (pfile->comments.entries,
 700          pfile->comments.allocated * sizeof (cpp_comment));
 701     }
 702
 703   len = token->val.str.len;
 704
 705   /* Copy comment. Note, token may not be NULL terminated. */
 706   pfile->comments.entries[pfile->comments.count].comment =
 707     (char *) xmalloc (sizeof (char) * (len + 1));
 708   memcpy (pfile->comments.entries[pfile->comments.count].comment,
 709           token->val.str.text, len);
 710   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
 711
 712   /* Set source location. */
 713   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
 714
 715   /* Increment the count of entries in the comment table. */
 716   pfile->comments.count++;
 717 }
 718
 719 /* The stored comment includes the comment start and any terminator.  */
 720 static void
 721 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
 722               cppchar_t type)
 723 {
 724   unsigned char *buffer;
 725   unsigned int len, clen;
 726
 727   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
 728
 729   /* C++ comments probably (not definitely) have moved past a new
 730      line, which we don't want to save in the comment.  */
 731   if (is_vspace (pfile->buffer->cur[-1]))
 732     len--;
 733
 734   /* If we are currently in a directive, then we need to store all
 735      C++ comments as C comments internally, and so we need to
 736      allocate a little extra space in that case.
 737
 738      Note that the only time we encounter a directive here is
 739      when we are saving comments in a "#define".  */
 740   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
 741
 742   buffer = _cpp_unaligned_alloc (pfile, clen);
 743
 744   token->type = CPP_COMMENT;
 745   token->val.str.len = clen;
 746   token->val.str.text = buffer;
 747
 748   buffer[0] = '/';
 749   memcpy (buffer + 1, from, len - 1);
 750
 751   /* Finish conversion to a C comment, if necessary.  */
 752   if (pfile->state.in_directive && type == '/')
 753     {
 754       buffer[1] = '*';
 755       buffer[clen - 2] = '*';
 756       buffer[clen - 1] = '/';
 757     }
 758
 759   /* Finally store this comment for use by clients of libcpp. */
 760   store_comment (pfile, token);
 761 }
 762
 763 /* Allocate COUNT tokens for RUN.  */
 764 void
 765 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
 766 {
 767   run->base = XNEWVEC (cpp_token, count);
 768   run->limit = run->base + count;
 769   run->next = NULL;
 770 }
 771
 772 /* Returns the next tokenrun, or creates one if there is none.  */
 773 static tokenrun *
 774 next_tokenrun (tokenrun *run)
 775 {
 776   if (run->next == NULL)
 777     {
 778       run->next = XNEW (tokenrun);
 779       run->next->prev = run;
 780       _cpp_init_tokenrun (run->next, 250);
 781     }
 782
 783   return run->next;
 784 }
 785
 786 /* Look ahead in the input stream.  */
 787 const cpp_token *
 788 cpp_peek_token (cpp_reader *pfile, int index)
 789 {
 790   cpp_context *context = pfile->context;
 791   const cpp_token *peektok;
 792   int count;
 793
 794   /* First, scan through any pending cpp_context objects.  */
 795   while (context->prev)
 796     {
 797       ptrdiff_t sz = (context->direct_p
 798                       ? LAST (context).token - FIRST (context).token
 799                       : LAST (context).ptoken - FIRST (context).ptoken);
 800
 801       if (index < (int) sz)
 802         return (context->direct_p
 803                 ? FIRST (context).token + index
 804                 : *(FIRST (context).ptoken + index));
 805
 806       index -= (int) sz;
 807       context = context->prev;
 808     }
 809
 810   /* We will have to read some new tokens after all (and do so
 811      without invalidating preceding tokens).  */
 812   count = index;
 813   pfile->keep_tokens++;
 814
 815   do
 816     {
 817       peektok = _cpp_lex_token (pfile);
 818       if (peektok->type == CPP_EOF)
 819         return peektok;
 820     }
 821   while (index--);
 822
 823   _cpp_backup_tokens_direct (pfile, count + 1);
 824   pfile->keep_tokens--;
 825
 826   return peektok;
 827 }
 828
 829 /* Allocate a single token that is invalidated at the same time as the
 830    rest of the tokens on the line.  Has its line and col set to the
 831    same as the last lexed token, so that diagnostics appear in the
 832    right place.  */
 833 cpp_token *
 834 _cpp_temp_token (cpp_reader *pfile)
 835 {
 836   cpp_token *old, *result;
 837   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
 838   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
 839
 840   old = pfile->cur_token - 1;
 841   /* Any pre-existing lookaheads must not be clobbered.  */
 842   if (la)
 843     {
 844       if (sz <= la)
 845         {
 846           tokenrun *next = next_tokenrun (pfile->cur_run);
 847
 848           if (sz < la)
 849             memmove (next->base + 1, next->base,
 850                      (la - sz) * sizeof (cpp_token));
 851
 852           next->base[0] = pfile->cur_run->limit[-1];
 853         }
 854
 855       if (sz > 1)
 856         memmove (pfile->cur_token + 1, pfile->cur_token,
 857                  MIN (la, sz - 1) * sizeof (cpp_token));
 858     }
 859
 860   if (!sz && pfile->cur_token == pfile->cur_run->limit)
 861     {
 862       pfile->cur_run = next_tokenrun (pfile->cur_run);
 863       pfile->cur_token = pfile->cur_run->base;
 864     }
 865
 866   result = pfile->cur_token++;
 867   result->src_loc = old->src_loc;
 868   return result;
 869 }
 870
 871 /* Lex the next token and return it (external interface).  Takes care
 872    of issues like directive handling, token lookahead, multiple include
 873    optimization and skipping.
        The result points either into the current token run or at
        pfile->directive_result (when a directive or a deferred pragma
        supplied the token).  While pfile->state.skipping is set and we
        are outside a directive, every token other than CPP_EOF is
        dropped and lexing continues.  */
 874 const cpp_token *
 875 _cpp_lex_token (cpp_reader *pfile)
 876 {
 877   cpp_token *result;
 878
 879   for (;;)
 880     {
           /* Step to the next token run once the current one is used up.  */
 881       if (pfile->cur_token == pfile->cur_run->limit)
 882         {
 883           pfile->cur_run = next_tokenrun (pfile->cur_run);
 884           pfile->cur_token = pfile->cur_run->base;
 885         }
 886       /* We assume that the current token is somewhere in the current
 887          run.  */
 888       if (pfile->cur_token < pfile->cur_run->base
 889           || pfile->cur_token >= pfile->cur_run->limit)
 890         abort ();
 891
           /* Pending lookahead tokens are taken straight from the run
              without calling the lexer again.  */
 892       if (pfile->lookaheads)
 893         {
 894           pfile->lookaheads--;
 895           result = pfile->cur_token++;
 896         }
 897       else
 898         result = _cpp_lex_direct (pfile);
 899
 900       if (result->flags & BOL)
 901         {
 902           /* Is this a directive?  If _cpp_handle_directive returns
 903              false, it is an assembler #.  */
 904           if (result->type == CPP_HASH
 905               /* 6.10.3 p 11: Directives in a list of macro arguments
 906                  gives undefined behavior.  This implementation
 907                  handles the directive as normal.  */
 908               && pfile->state.parsing_args != 1)
 909             {
 910               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
 911                 {
                       /* A CPP_PADDING result means the directive left no
                          token to hand back; lex the next one.  */
 912                   if (pfile->directive_result.type == CPP_PADDING)
 913                     continue;
 914                   result = &pfile->directive_result;
 915                 }
 916             }
 917           else if (pfile->state.in_deferred_pragma)
 918             result = &pfile->directive_result;
 919
               /* Notify the client of the new line unless we are skipping.  */
 920           if (pfile->cb.line_change && !pfile->state.skipping)
 921             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
 922         }
 923
 924       /* We don't skip tokens in directives.  */
 925       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
 926         break;
 927
 928       /* Outside a directive, invalidate controlling macros.  At file
 929          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
 930          get here and MI optimization works.  */
 931       pfile->mi_valid = false;
 932
           /* CPP_EOF is returned even while skipping.  */
 933       if (!pfile->state.skipping || result->type == CPP_EOF)
 934         break;
 935     }
 936
 937   return result;
 938 }
 939
 940 /* Returns true if a fresh line has been loaded.  */
 941 bool
 942 _cpp_get_fresh_line (cpp_reader *pfile)
 943 {
 944   int return_at_eof;
 945
 946   /* We can't get a new line until we leave the current directive.  */
 947   if (pfile->state.in_directive)
 948     return false;
 949
 950   for (;;)
 951     {
 952       cpp_buffer *buffer = pfile->buffer;
 953
 954       if (!buffer->need_line)
 955         return true;
 956
 957       if (buffer->next_line < buffer->rlimit)
 958         {
 959           _cpp_clean_line (pfile);
 960           return true;
 961         }
 962
 963       /* First, get out of parsing arguments state.  */
 964       if (pfile->state.parsing_args)
 965         return false;
 966
 967       /* End of buffer.  Non-empty files should end in a newline.  */
 968       if (buffer->buf != buffer->rlimit
 969           && buffer->next_line > buffer->rlimit
 970           && !buffer->from_stage3)
 971         {
 972           /* Clip to buffer size.  */
 973           buffer->next_line = buffer->rlimit;
 974         }
 975
 976       return_at_eof = buffer->return_at_eof;
 977       _cpp_pop_buffer (pfile);
 978       if (pfile->buffer == NULL || return_at_eof)
 979         return false;
 980     }
 981 }
 982
     /* Helper for the operator cases of the lexer.  Stores ELSE_TYPE in
        result->type; if the next unread character (*buffer->cur) equals
        CHAR, that character is consumed and THEN_TYPE is stored instead.
        Expects variables named `result' and `buffer' to be in scope at
        the point of use.  */
 983 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
 984   do                                                    \
 985     {                                                   \
 986       result->type = ELSE_TYPE;                         \
 987       if (*buffer->cur == CHAR)                         \
 988         buffer->cur++, result->type = THEN_TYPE;        \
 989     }                                                   \
 990   while (0)
 991
 992 /* Lex a token into pfile->cur_token, which is also incremented, to
 993    get diagnostics pointing to the correct location.
 994
 995    Does not handle issues such as token lookahead, multiple-include
 996    optimization, directives, skipping etc.  This function is only
 997    suitable for use by _cpp_lex_token, and in special cases like
 998    lex_expansion_token which doesn't care for any of these issues.
 999
1000    When meeting a newline, returns CPP_EOF if parsing a directive,
1001    otherwise returns to the start of the token buffer if permissible.
1002    Returns the location of the lexed token.  */
1003 cpp_token *
1004 _cpp_lex_direct (cpp_reader *pfile)
1005 {
1006   cppchar_t c;
1007   cpp_buffer *buffer;
1008   const unsigned char *comment_start;
1009   cpp_token *result = pfile->cur_token++;
1010
       /* Entered at the start and again after every newline: clear the
          flags and, if the buffer needs one, load the next line.  */
1011  fresh_line:
1012   result->flags = 0;
1013   buffer = pfile->buffer;
1014   if (buffer->need_line)
1015     {
           /* The end of a deferred pragma's line is reported as its own
              token rather than by fetching a new line.  */
1016       if (pfile->state.in_deferred_pragma)
1017         {
1018           result->type = CPP_PRAGMA_EOL;
1019           pfile->state.in_deferred_pragma = false;
1020           if (!pfile->state.pragma_allow_expansion)
1021             pfile->state.prevent_expansion--;
1022           return result;
1023         }
1024       if (!_cpp_get_fresh_line (pfile))
1025         {
1026           result->type = CPP_EOF;
1027           if (!pfile->state.in_directive)
1028             {
1029               /* Tell the compiler the line number of the EOF token.  */
1030               result->src_loc = pfile->line_table->highest_line;
1031               result->flags = BOL;
1032             }
1033           return result;
1034         }
           /* When tokens need not be kept, restart at the beginning of
              the base run so its storage is reused for the new line.  */
1035       if (!pfile->keep_tokens)
1036         {
1037           pfile->cur_run = &pfile->base_run;
1038           result = pfile->base_run.base;
1039           pfile->cur_token = result + 1;
1040         }
1041       result->flags = BOL;
1042       if (pfile->state.parsing_args == 2)
1043         result->flags |= PREV_WHITE;
1044     }
       /* _cpp_get_fresh_line may have popped buffers, so reload.  */
1045   buffer = pfile->buffer;
       /* Re-entered after a discarded comment so that the token's
          location is taken from the current highest line.  */
1046  update_tokens_line:
1047   result->src_loc = pfile->line_table->highest_line;
1048
       /* Re-entered after horizontal whitespace has been skipped.  */
1049  skipped_white:
1050   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
1051       && !pfile->overlaid_buffer)
1052     {
1053       _cpp_process_line_notes (pfile, false);
1054       result->src_loc = pfile->line_table->highest_line;
1055     }
1056   c = *buffer->cur++;
1057
1058   LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
1059                                CPP_BUF_COLUMN (buffer, buffer->cur));
1060
       /* Dispatch on the first character of the token.  */
1061   switch (c)
1062     {
1063     case ' ': case '\t': case '\f': case '\v': case '\0':
1064       result->flags |= PREV_WHITE;
1065       skip_whitespace (pfile, c);
1066       goto skipped_white;
1067
1068     case '\n':
           /* End of line: request a new line and start over.  */
1069       if (buffer->cur < buffer->rlimit)
1070         CPP_INCREMENT_LINE (pfile, 0);
1071       buffer->need_line = true;
1072       goto fresh_line;
1073
1074     case '0': case '1': case '2': case '3': case '4':
1075     case '5': case '6': case '7': case '8': case '9':
1076       {
1077         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1078         result->type = CPP_NUMBER;
1079         lex_number (pfile, &result->val.str, &nst);
1080         warn_about_normalization (pfile, result, &nst);
1081         break;
1082       }
1083
1084     case 'L':
1085     case 'u':
1086     case 'U':
1087       /* 'L', 'u' or 'U' may introduce wide characters or strings.  */
1088       if (c == 'L' || CPP_OPTION (pfile, uliterals))
1089         {
1090           if (*buffer->cur == '\'' || *buffer->cur == '"')
1091             {
1092               lex_string (pfile, result, buffer->cur - 1);
1093               break;
1094             }
1095         }
1096       /* Fall through.  */
1097
1098     case '_':
1099     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1100     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1101     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1102     case 's': case 't':           case 'v': case 'w': case 'x':
1103     case 'y': case 'z':
1104     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1105     case 'G': case 'H': case 'I': case 'J': case 'K':
1106     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1107     case 'S': case 'T':           case 'V': case 'W': case 'X':
1108     case 'Y': case 'Z':
1109       result->type = CPP_NAME;
1110       {
1111         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1112         result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
1113                                            &nst);
1114         warn_about_normalization (pfile, result, &nst);
1115       }
1116
1117       /* Convert named operators to their proper types.  */
1118       if (result->val.node->flags & NODE_OPERATOR)
1119         {
1120           result->flags |= NAMED_OP;
1121           result->type = (enum cpp_ttype) result->val.node->directive_index;
1122         }
1123       break;
1124
1125     case '\'':
1126     case '"':
1127       lex_string (pfile, result, buffer->cur - 1);
1128       break;
1129
1130     case '/':
1131       /* A potential block or line comment.  */
1132       comment_start = buffer->cur;
1133       c = *buffer->cur;
1134
1135       if (c == '*')
1136         {
1137           if (_cpp_skip_block_comment (pfile))
1138             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
1139         }
1140       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1141                             || cpp_in_system_header (pfile)))
1142         {
1143           /* Warn about comments only if pedantically GNUC89, and not
1144              in system headers.  */
1145           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1146               && ! buffer->warned_cplusplus_comments)
1147             {
1148               cpp_error (pfile, CPP_DL_PEDWARN,
1149                          "C++ style comments are not allowed in ISO C90");
1150               cpp_error (pfile, CPP_DL_PEDWARN,
1151                          "(this will be reported only once per input file)");
1152               buffer->warned_cplusplus_comments = 1;
1153             }
1154
1155           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1156             cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
1157         }
1158       else if (c == '=')
1159         {
1160           buffer->cur++;
1161           result->type = CPP_DIV_EQ;
1162           break;
1163         }
1164       else
1165         {
1166           result->type = CPP_DIV;
1167           break;
1168         }
1169
           /* Only the two comment branches reach this point.  Unless
              comments are being saved, the comment counts as whitespace
              and lexing resumes after it.  */
1170       if (!pfile->state.save_comments)
1171         {
1172           result->flags |= PREV_WHITE;
1173           goto update_tokens_line;
1174         }
1175
1176       /* Save the comment as a token in its own right.  */
1177       save_comment (pfile, result, comment_start, c);
1178       break;
1179
1180     case '<':
           /* With angled_headers set, '<' is handed to lex_string
              instead of being lexed as an operator.  */
1181       if (pfile->state.angled_headers)
1182         {
1183           lex_string (pfile, result, buffer->cur - 1);
1184           break;
1185         }
1186
1187       result->type = CPP_LESS;
1188       if (*buffer->cur == '=')
1189         buffer->cur++, result->type = CPP_LESS_EQ;
1190       else if (*buffer->cur == '<')
1191         {
1192           buffer->cur++;
1193           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1194         }
1195       else if (CPP_OPTION (pfile, digraphs))
1196         {
1197           if (*buffer->cur == ':')
1198             {
1199               buffer->cur++;
1200               result->flags |= DIGRAPH;
1201               result->type = CPP_OPEN_SQUARE;
1202             }
1203           else if (*buffer->cur == '%')
1204             {
1205               buffer->cur++;
1206               result->flags |= DIGRAPH;
1207               result->type = CPP_OPEN_BRACE;
1208             }
1209         }
1210       break;
1211
1212     case '>':
1213       result->type = CPP_GREATER;
1214       if (*buffer->cur == '=')
1215         buffer->cur++, result->type = CPP_GREATER_EQ;
1216       else if (*buffer->cur == '>')
1217         {
1218           buffer->cur++;
1219           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1220         }
1221       break;
1222
1223     case '%':
1224       result->type = CPP_MOD;
1225       if (*buffer->cur == '=')
1226         buffer->cur++, result->type = CPP_MOD_EQ;
1227       else if (CPP_OPTION (pfile, digraphs))
1228         {
1229           if (*buffer->cur == ':')
1230             {
1231               buffer->cur++;
1232               result->flags |= DIGRAPH;
1233               result->type = CPP_HASH;
                   /* "%:%:" is the digraph spelling of "##".  */
1234               if (*buffer->cur == '%' && buffer->cur[1] == ':')
1235                 buffer->cur += 2, result->type = CPP_PASTE;
1236             }
1237           else if (*buffer->cur == '>')
1238             {
1239               buffer->cur++;
1240               result->flags |= DIGRAPH;
1241               result->type = CPP_CLOSE_BRACE;
1242             }
1243         }
1244       break;
1245
1246     case '.':
1247       result->type = CPP_DOT;
1248       if (ISDIGIT (*buffer->cur))
1249         {
1250           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1251           result->type = CPP_NUMBER;
1252           lex_number (pfile, &result->val.str, &nst);
1253           warn_about_normalization (pfile, result, &nst);
1254         }
1255       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
1256         buffer->cur += 2, result->type = CPP_ELLIPSIS;
1257       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1258         buffer->cur++, result->type = CPP_DOT_STAR;
1259       break;
1260
1261     case '+':
1262       result->type = CPP_PLUS;
1263       if (*buffer->cur == '+')
1264         buffer->cur++, result->type = CPP_PLUS_PLUS;
1265       else if (*buffer->cur == '=')
1266         buffer->cur++, result->type = CPP_PLUS_EQ;
1267       break;
1268
1269     case '-':
1270       result->type = CPP_MINUS;
1271       if (*buffer->cur == '>')
1272         {
1273           buffer->cur++;
1274           result->type = CPP_DEREF;
1275           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1276             buffer->cur++, result->type = CPP_DEREF_STAR;
1277         }
1278       else if (*buffer->cur == '-')
1279         buffer->cur++, result->type = CPP_MINUS_MINUS;
1280       else if (*buffer->cur == '=')
1281         buffer->cur++, result->type = CPP_MINUS_EQ;
1282       break;
1283
1284     case '&':
1285       result->type = CPP_AND;
1286       if (*buffer->cur == '&')
1287         buffer->cur++, result->type = CPP_AND_AND;
1288       else if (*buffer->cur == '=')
1289         buffer->cur++, result->type = CPP_AND_EQ;
1290       break;
1291
1292     case '|':
1293       result->type = CPP_OR;
1294       if (*buffer->cur == '|')
1295         buffer->cur++, result->type = CPP_OR_OR;
1296       else if (*buffer->cur == '=')
1297         buffer->cur++, result->type = CPP_OR_EQ;
1298       break;
1299
1300     case ':':
1301       result->type = CPP_COLON;
1302       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
1303         buffer->cur++, result->type = CPP_SCOPE;
1304       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
1305         {
1306           buffer->cur++;
1307           result->flags |= DIGRAPH;
1308           result->type = CPP_CLOSE_SQUARE;
1309         }
1310       break;
1311
1312     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1313     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1314     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1315     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1316     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1317
1318     case '?': result->type = CPP_QUERY; break;
1319     case '~': result->type = CPP_COMPL; break;
1320     case ',': result->type = CPP_COMMA; break;
1321     case '(': result->type = CPP_OPEN_PAREN; break;
1322     case ')': result->type = CPP_CLOSE_PAREN; break;
1323     case '[': result->type = CPP_OPEN_SQUARE; break;
1324     case ']': result->type = CPP_CLOSE_SQUARE; break;
1325     case '{': result->type = CPP_OPEN_BRACE; break;
1326     case '}': result->type = CPP_CLOSE_BRACE; break;
1327     case ';': result->type = CPP_SEMICOLON; break;
1328
1329       /* @ is a punctuator in Objective-C.  */
1330     case '@': result->type = CPP_ATSIGN; break;
1331
1332     case '$':
1333     case '\\':
1334       {
             /* Back up so forms_identifier_p examines the character
                itself; it decides whether an identifier starts here.  */
1335         const uchar *base = --buffer->cur;
1336         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1337
1338         if (forms_identifier_p (pfile, true, &nst))
1339           {
1340             result->type = CPP_NAME;
1341             result->val.node = lex_identifier (pfile, base, true, &nst);
1342             warn_about_normalization (pfile, result, &nst);
1343             break;
1344           }
1345         buffer->cur++;
1346       }
           /* Fall through: the character does not start an identifier.  */
1347
1348     default:
           /* Anything else becomes a one-character CPP_OTHER token.  */
1349       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
1350       break;
1351     }
1352
1353   return result;
1354 }
1355
1356 /* An upper bound on the number of bytes needed to spell TOKEN.
1357    Does not include preceding whitespace.  */
1358 unsigned int
1359 cpp_token_len (const cpp_token *token)
1360 {
1361   unsigned int len;
1362
1363   switch (TOKEN_SPELL (token))
1364     {
1365     default:            len = 6;                                break;
1366     case SPELL_LITERAL: len = token->val.str.len;               break;
1367     case SPELL_IDENT:   len = NODE_LEN (token->val.node) * 10;  break;
1368     }
1369
1370   return len;
1371 }
1372
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \U escape (a backslash, 'U' and eight lower-case hex digits) in
   BUFFER.  Exactly 10 bytes are written to BUFFER; no NUL is added.
   Returns the number of bytes consumed from NAME.  Aborts if a
   continuation byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int idx;

  /* The count of leading one bits in the first byte gives the length
     of the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload bits of the first byte sit below its length marker.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte contributes six more bits.  */
  for (idx = 1; idx < seq_len; idx++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
	abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (idx = 0; idx < 8; idx++)
    buffer[2 + idx] = hex_digits[(code_point >> (4 * (7 - idx))) & 0xF];

  return seq_len;
}
1406
1407
1408 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1409    already contain enough space to hold the token's spelling.
1410    Returns a pointer to the character after the last character written.
1411    FORSTRING is true if this is to be the spelling after translation
1412    phase 1 (this is different for UCNs).
        No terminating NUL is written.  For a token that has no spelling
        an internal error is reported through PFILE and BUFFER is
        returned unchanged.
1413    FIXME: Would be nice if we didn't need the PFILE argument.  */
1414 unsigned char *
1415 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1416                  unsigned char *buffer, bool forstring)
1417 {
1418   switch (TOKEN_SPELL (token))
1419     {
1420     case SPELL_OPERATOR:
1421       {
1422         const unsigned char *spelling;
1423         unsigned char c;
1424
             /* Digraphs have their own spelling table.  Named operators
                are spelled from their identifier node instead.  */
1425         if (token->flags & DIGRAPH)
1426           spelling
1427             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1428         else if (token->flags & NAMED_OP)
1429           goto spell_ident;
1430         else
1431           spelling = TOKEN_NAME (token);
1432
1433         while ((c = *spelling++) != '\0')
1434           *buffer++ = c;
1435       }
1436       break;
1437
1438     spell_ident:
1439     case SPELL_IDENT:
1440       if (forstring)
1441         {
               /* Copy the stored bytes of the name unchanged.  */
1442           memcpy (buffer, NODE_NAME (token->val.node),
1443                   NODE_LEN (token->val.node));
1444           buffer += NODE_LEN (token->val.node);
1445         }
1446       else
1447         {
1448           size_t i;
1449           const unsigned char * name = NODE_NAME (token->val.node);
1450
               /* Bytes with the top bit set start a UTF-8 sequence, which
                  is written as a 10-byte \U escape.  utf8_to_ucn returns
                  the sequence length; one is subtracted because the loop
                  increment accounts for the first byte.  */
1451           for (i = 0; i < NODE_LEN (token->val.node); i++)
1452             if (name[i] & ~0x7F)
1453               {
1454                 i += utf8_to_ucn (buffer, name + i) - 1;
1455                 buffer += 10;
1456               }
1457             else
1458               *buffer++ = NODE_NAME (token->val.node)[i];
1459         }
1460       break;
1461
1462     case SPELL_LITERAL:
1463       memcpy (buffer, token->val.str.text, token->val.str.len);
1464       buffer += token->val.str.len;
1465       break;
1466
1467     case SPELL_NONE:
1468       cpp_error (pfile, CPP_DL_ICE,
1469                  "unspellable token %s", TOKEN_NAME (token));
1470       break;
1471     }
1472
1473   return buffer;
1474 }
1475
1476 /* Returns TOKEN spelt as a null-terminated string.  The string is
1477    freed when the reader is destroyed.  Useful for diagnostics.  */
1478 unsigned char *
1479 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1480 {
1481   unsigned int len = cpp_token_len (token) + 1;
1482   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1483
1484   end = cpp_spell_token (pfile, token, start, false);
1485   end[0] = '\0';
1486
1487   return start;
1488 }
1489
1490 /* Used by C front ends, which really should move to using
1491    cpp_token_as_text.  */
1492 const char *
1493 cpp_type2name (enum cpp_ttype type)
1494 {
1495   return (const char *) token_spellings[type].name;
1496 }
1497
1498 /* Writes the spelling of token to FP, without any preceding space.
1499    Separated from cpp_spell_token for efficiency - to avoid stdio
1500    double-buffering.
        In identifiers, bytes with the top bit set are written as \U
        escapes.  A token that has no spelling produces no output.  */
1501 void
1502 cpp_output_token (const cpp_token *token, FILE *fp)
1503 {
1504   switch (TOKEN_SPELL (token))
1505     {
1506     case SPELL_OPERATOR:
1507       {
1508         const unsigned char *spelling;
1509         int c;
1510
             /* Digraphs have their own spelling table.  Named operators
                are written from their identifier node instead.  */
1511         if (token->flags & DIGRAPH)
1512           spelling
1513             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1514         else if (token->flags & NAMED_OP)
1515           goto spell_ident;
1516         else
1517           spelling = TOKEN_NAME (token);
1518
             /* The first character is written unconditionally, so the
                spelling is expected to be non-empty.  */
1519         c = *spelling;
1520         do
1521           putc (c, fp);
1522         while ((c = *++spelling) != '\0');
1523       }
1524       break;
1525
1526     spell_ident:
1527     case SPELL_IDENT:
1528       {
1529         size_t i;
1530         const unsigned char * name = NODE_NAME (token->val.node);
1531
             /* utf8_to_ucn returns the length of the UTF-8 sequence; one
                is subtracted because the loop increment accounts for its
                first byte.  */
1532         for (i = 0; i < NODE_LEN (token->val.node); i++)
1533           if (name[i] & ~0x7F)
1534             {
1535               unsigned char buffer[10];
1536               i += utf8_to_ucn (buffer, name + i) - 1;
1537               fwrite (buffer, 1, 10, fp);
1538             }
1539           else
1540             fputc (NODE_NAME (token->val.node)[i], fp);
1541       }
1542       break;
1543
1544     case SPELL_LITERAL:
1545       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1546       break;
1547
1548     case SPELL_NONE:
1549       /* An error, most probably.  */
1550       break;
1551     }
1552 }
1553
1554 /* Compare two tokens.  */
1555 int
1556 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1557 {
1558   if (a->type == b->type && a->flags == b->flags)
1559     switch (TOKEN_SPELL (a))
1560       {
1561       default:                  /* Keep compiler happy.  */
1562       case SPELL_OPERATOR:
1563         return 1;
1564       case SPELL_NONE:
1565         return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1566       case SPELL_IDENT:
1567         return a->val.node == b->val.node;
1568       case SPELL_LITERAL:
1569         return (a->val.str.len == b->val.str.len
1570                 && !memcmp (a->val.str.text, b->val.str.text,
1571                             a->val.str.len));
1572       }
1573
1574   return 0;
1575 }
1576
1577 /* Returns nonzero if a space should be inserted to avoid an
1578    accidental token paste for output.  For simplicity, it is
1579    conservative, and occasionally advises a space where one is not
1580    needed, e.g. "." and ".2".
        TOKEN1 is the token already written and TOKEN2 the one that
        follows it.  Named operators are treated as CPP_NAME on both
        sides.  */
1581 int
1582 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1583                  const cpp_token *token2)
1584 {
1585   enum cpp_ttype a = token1->type, b = token2->type;
1586   cppchar_t c;
1587
1588   if (token1->flags & NAMED_OP)
1589     a = CPP_NAME;
1590   if (token2->flags & NAMED_OP)
1591     b = CPP_NAME;
1592
       /* C is the first character of TOKEN2's spelling when TOKEN2 is a
          digraph or an operator, and EOF otherwise.  */
1593   c = EOF;
1594   if (token2->flags & DIGRAPH)
1595     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1596   else if (token_spellings[b].category == SPELL_OPERATOR)
1597     c = token_spellings[b].name[0];
1598
1599   /* Quickly get everything that can paste with an '='.  */
1600   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1601     return 1;
1602
       /* Otherwise decide from TOKEN1's type which following characters
          or token types could merge with it.  */
1603   switch (a)
1604     {
1605     case CPP_GREATER:   return c == '>';
1606     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
1607     case CPP_PLUS:      return c == '+';
1608     case CPP_MINUS:     return c == '-' || c == '>';
1609     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1610     case CPP_MOD:       return c == ':' || c == '>';
1611     case CPP_AND:       return c == '&';
1612     case CPP_OR:        return c == '|';
1613     case CPP_COLON:     return c == ':' || c == '>';
1614     case CPP_DEREF:     return c == '*';
1615     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1616     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1617     case CPP_NAME:      return ((b == CPP_NUMBER
1618                                  && name_p (pfile, &token2->val.str))
1619                                 || b == CPP_NAME
1620                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1621     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1622                                 || c == '.' || c == '+' || c == '-');
1623                                       /* UCNs */
1624     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
1625                                  && b == CPP_NAME)
1626                                 || (CPP_OPTION (pfile, objc)
1627                                     && token1->val.str.text[0] == '@'
1628                                     && (b == CPP_NAME || b == CPP_STRING)));
1629     default:            break;
1630     }
1631
1632   return 0;
1633 }
1634
1635 /* Output all the remaining tokens on the current line, and a newline
1636    character, to FP.  Leading whitespace is removed.  If there are
1637    macros, special token padding is not performed.  */
1638 void
1639 cpp_output_line (cpp_reader *pfile, FILE *fp)
1640 {
1641   const cpp_token *token;
1642
1643   token = cpp_get_token (pfile);
1644   while (token->type != CPP_EOF)
1645     {
1646       cpp_output_token (token, fp);
1647       token = cpp_get_token (pfile);
1648       if (token->flags & PREV_WHITE)
1649         putc (' ', fp);
1650     }
1651
1652   putc ('\n', fp);
1653 }
1654
1655 /* Return a string representation of all the remaining tokens on the
1656    current line.  The result is allocated using xmalloc and must be
1657    freed by the caller.  */
1658 unsigned char *
1659 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
1660 {
1661   const cpp_token *token;
1662   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
1663   unsigned int alloced = 120 + out;
1664   unsigned char *result = (unsigned char *) xmalloc (alloced);
1665
1666   /* If DIR_NAME is empty, there are no initial contents.  */
1667   if (dir_name)
1668     {
1669       sprintf ((char *) result, "#%s ", dir_name);
1670       out += 2;
1671     }
1672
1673   token = cpp_get_token (pfile);
1674   while (token->type != CPP_EOF)
1675     {
1676       unsigned char *last;
1677       /* Include room for a possible space and the terminating nul.  */
1678       unsigned int len = cpp_token_len (token) + 2;
1679
1680       if (out + len > alloced)
1681         {
1682           alloced *= 2;
1683           if (out + len > alloced)
1684             alloced = out + len;
1685           result = (unsigned char *) xrealloc (result, alloced);
1686         }
1687
1688       last = cpp_spell_token (pfile, token, &result[out], 0);
1689       out = last - result;
1690
1691       token = cpp_get_token (pfile);
1692       if (token->flags & PREV_WHITE)
1693         result[out++] = ' ';
1694     }
1695
1696   result[out] = '\0';
1697   return result;
1698 }
1699
1700 /* Memory buffers.  Changing these three constants can have a dramatic
1701    effect on performance.  The values here are reasonable defaults,
1702    but might be tuned.  If you adjust them, be sure to test across a
1703    range of uses of cpplib, including heavy nested function-like macro
1704    expansion.  Also check the change in peak memory usage (NJAMD is a
1705    good tool for this).
        MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
        BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
        _cpp_get_buff will reuse for a request of MIN_SIZE bytes.
        EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
        BUFF is extended: MIN_EXTRA plus twice the span from BUFF's cur
        to its limit.  */
1706 #define MIN_BUFF_SIZE 8000
1707 #define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
1708 #define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
1709         (MIN_EXTRA + ((BUFF)->limit - (BUFF)->cur) * 2)
1710
     /* Compile-time check that the upper bound never falls below the
        minimum buffer size.  */
1711 #if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
1712   #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
1713 #endif
1714
1715 /* Create a new allocation buffer.  Place the control block at the end
1716    of the buffer, so that buffer overflows will cause immediate chaos.  */
1717 static _cpp_buff *
1718 new_buff (size_t len)
1719 {
1720   _cpp_buff *result;
1721   unsigned char *base;
1722
1723   if (len < MIN_BUFF_SIZE)
1724     len = MIN_BUFF_SIZE;
1725   len = CPP_ALIGN (len);
1726
1727   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1728   result = (_cpp_buff *) (base + len);
1729   result->base = base;
1730   result->cur = base;
1731   result->limit = base + len;
1732   result->next = NULL;
1733   return result;
1734 }
1735
1736 /* Place a chain of unwanted allocation buffers on the free list.  */
1737 void
1738 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1739 {
1740   _cpp_buff *end = buff;
1741
1742   while (end->next)
1743     end = end->next;
1744   end->next = pfile->free_buffs;
1745   pfile->free_buffs = buff;
1746 }
1747
1748 /* Return a free buffer of size at least MIN_SIZE.  */
1749 _cpp_buff *
1750 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1751 {
1752   _cpp_buff *result, **p;
1753
1754   for (p = &pfile->free_buffs;; p = &(*p)->next)
1755     {
1756       size_t size;
1757
1758       if (*p == NULL)
1759         return new_buff (min_size);
1760       result = *p;
1761       size = result->limit - result->base;
1762       /* Return a buffer that's big enough, but don't waste one that's
1763          way too big.  */
1764       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1765         break;
1766     }
1767
1768   *p = result->next;
1769   result->next = NULL;
1770   result->cur = result->base;
1771   return result;
1772 }
1773
1774 /* Creates a new buffer with enough space to hold the uncommitted
1775    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1776    the excess bytes to the new buffer.  Chains the new buffer after
1777    BUFF, and returns the new buffer.  */
1778 _cpp_buff *
1779 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1780 {
1781   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1782   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1783
1784   buff->next = new_buff;
1785   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1786   return new_buff;
1787 }
1788
1789 /* Creates a new buffer with enough space to hold the uncommitted
1790    remaining bytes of the buffer pointed to by BUFF, and at least
1791    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1792    Chains the new buffer before the buffer pointed to by BUFF, and
1793    updates the pointer to point to the new buffer.  */
1794 void
1795 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1796 {
1797   _cpp_buff *new_buff, *old_buff = *pbuff;
1798   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1799
1800   new_buff = _cpp_get_buff (pfile, size);
1801   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1802   new_buff->next = old_buff;
1803   *pbuff = new_buff;
1804 }
1805
1806 /* Free a chain of buffers starting at BUFF.  */
1807 void
1808 _cpp_free_buff (_cpp_buff *buff)
1809 {
1810   _cpp_buff *next;
1811
1812   for (; buff; buff = next)
1813     {
1814       next = buff->next;
1815       free (buff->base);
1816     }
1817 }
1818
1819 /* Allocate permanent, unaligned storage of length LEN.  */
1820 unsigned char *
1821 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1822 {
1823   _cpp_buff *buff = pfile->u_buff;
1824   unsigned char *result = buff->cur;
1825
1826   if (len > (size_t) (buff->limit - result))
1827     {
1828       buff = _cpp_get_buff (pfile, len);
1829       buff->next = pfile->u_buff;
1830       pfile->u_buff = buff;
1831       result = buff->cur;
1832     }
1833
1834   buff->cur = result + len;
1835   return result;
1836 }
1837
1838 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1839    That buffer is used for growing allocations when saving macro
1840    replacement lists in a #define, and when parsing an answer to an
1841    assertion in #assert, #unassert or #if (and therefore possibly
1842    whilst expanding macros).  It therefore must not be used by any
1843    code that they might call: specifically the lexer and the guts of
1844    the macro expander.
1845
1846    All existing other uses clearly fit this restriction: storing
1847    registered pragmas during initialization.  */
1848 unsigned char *
1849 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1850 {
1851   _cpp_buff *buff = pfile->a_buff;
1852   unsigned char *result = buff->cur;
1853
1854   if (len > (size_t) (buff->limit - result))
1855     {
1856       buff = _cpp_get_buff (pfile, len);
1857       buff->next = pfile->a_buff;
1858       pfile->a_buff = buff;
1859       result = buff->cur;
1860     }
1861
1862   buff->cur = result + len;
1863   return result;
1864 }
1865
1866 /* Say which field of TOK is in use.  */
1867
1868 enum cpp_token_fld_kind
1869 cpp_token_val_index (cpp_token *tok)
1870 {
1871   switch (TOKEN_SPELL (tok))
1872     {
1873     case SPELL_IDENT:
1874       return CPP_TOKEN_FLD_NODE;
1875     case SPELL_LITERAL:
1876       return CPP_TOKEN_FLD_STR;
1877     case SPELL_NONE:
1878       if (tok->type == CPP_MACRO_ARG)
1879         return CPP_TOKEN_FLD_ARG_NO;
1880       else if (tok->type == CPP_PADDING)
1881         return CPP_TOKEN_FLD_SOURCE;
1882       else if (tok->type == CPP_PRAGMA)
1883         return CPP_TOKEN_FLD_PRAGMA;
1884       /* else fall through */
1885     default:
1886       return CPP_TOKEN_FLD_NONE;
1887     }
1888 }