libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010,
   3    2011, 2012 Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
/* Spelling category of a token type, as recorded for every token type
   in the token_spellings table.  */
enum spell_type
{
  SPELL_OPERATOR = 0,   /* Category given to entries built by OP.  */
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
  35
/* One entry of the token spelling table: the spelling category of a
   token type together with a name string for it.  */
struct token_spelling
{
  enum spell_type category;     /* How the token is spelled.  */
  const unsigned char *name;    /* Operator text or enumerator name.  */
};
  41
/* Spellings of the six digraph tokens.  NOTE(review): the indexing
   scheme is not visible in this part of the file -- confirm at the use
   site before reordering entries.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  44
/* Build the spelling table by expanding TTYPE_TABLE with temporary
   definitions of OP and TK: an OP entry is an operator spelled S, a TK
   entry gets category SPELL_<S> and the stringified enumerator E as
   its name.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Accessors for the spelling category and name recorded for the type
   of TOKEN.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
/* Forward declarations of the file-local helpers.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
/* Configure gives us an ifdef test.  Default the macro to 0 when it is
   absent so that the code below can use it in ordinary C conditions
   such as `if (WORDS_BIGENDIAN)'.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif
 120
/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size through the __word__ mode attribute; other
   compilers fall back to unsigned long.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif
 130
/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array bound
   evaluates to 1 when the size is acceptable and to -1 (an invalid
   array size) otherwise.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 271    Before Solaris 9 Update 6, SSE insns cannot be executed.
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Rows 0 to 3 hold sixteen copies of
   '\n', '\r', '\\' and '?' respectively; the 16-byte alignment lets
   the scanners load a row as a single vector.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      16-byte block.  The Idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;
 337
 338     start:
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The Idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
 397       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 398       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 400       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Returns a pointer to the first '\n', '\r', '?' or '\\' at or after S.
   END is consulted only to decide whether an unaligned 16-byte load at
   S might run past the buffer onto another page; in that case the
   search is handed to search_line_sse2.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The four characters of interest; the remaining lanes are zero.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      /* The page test below assumes 4096-byte pages.  */
      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is the offset of a match inside this block.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 16) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.

     S is first moved back by 16 so that the `add' at the top of the
     loop brings it to the block to examine; the loop repeats while the
     carry flag is clear.  INDEX is produced in the "c" register, and
     the "a" and "d" registers carry 4 and 16, mirroring the lengths
     passed to the builtin above.  */
  __asm (      "sub $16, %1\n"
        "       .balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri $0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));

 found:
  /* S addresses the block that matched and INDEX the byte within it.  */
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support: without
   HAVE_SSE4 the name simply refers to the SSE2 scanner.  */
#define search_line_sse42 search_line_sse2
#endif
 474
 475 /* Check the CPU capabilities.  */
 476
 477 #include "../gcc/config/i386/cpuid.h"
 478
 479 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 480 static search_line_fast_type search_line_fast;
 481
 482 #define HAVE_init_vectorized_lexer 1
 483 static inline void
 484 init_vectorized_lexer (void)
 485 {
 486   unsigned dummy, ecx = 0, edx = 0;
 487   search_line_fast_type impl = search_line_acc_char;
 488   int minimum = 0;
 489
 490 #if defined(__SSE4_2__)
 491   minimum = 3;
 492 #elif defined(__SSE2__)
 493   minimum = 2;
 494 #elif defined(__SSE__)
 495   minimum = 1;
 496 #endif
 497
 498   if (minimum == 3)
 499     impl = search_line_sse42;
 500   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 501     {
 502       if (minimum == 3 || (ecx & bit_SSE4_2))
 503         impl = search_line_sse42;
 504       else if (minimum == 2 || (edx & bit_SSE2))
 505         impl = search_line_sse2;
 506       else if (minimum == 1 || (edx & bit_SSE))
 507         impl = search_line_mmx;
 508     }
 509   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 510     {
 511       if (minimum == 1
 512           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 513         impl = search_line_mmx;
 514     }
 515
 516   search_line_fast = impl;
 517 }
 518
 519 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 520
/* A version of the fast scanner using AltiVec vectorized byte compares.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is not examined.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  /* Each character of interest replicated into all sixteen lanes.  */
  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  /* All-ones and all-zeros vectors used to build the alignment mask.  */
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block is
     already loaded and masked, so enter the loop past the load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    /* View the match vector T as an array of N longs.  */
    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past each
       all-zero word.  Case 4 deliberately falls through into case 2 to
       examine the remaining two words; the last word is not tested
       because the loop above only exits once T has a non-zero byte.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  NOTE(review): counting leading
       zeros yields the lowest-addressed matching byte only when the
       first byte in memory is the most significant one, i.e. on a
       big-endian host -- confirm before enabling elsewhere.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
 634
 635 #elif defined (__ARM_NEON__)
 636 #include "arm_neon.h"
 637
 638 static const uchar *
 639 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 640 {
 641   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 642   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 643   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 644   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 645   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 646
 647   unsigned int misalign, found, mask;
 648   const uint8_t *p;
 649   uint8x16_t data;
 650
 651   /* Align the source pointer.  */
 652   misalign = (uintptr_t)s & 15;
 653   p = (const uint8_t *)((uintptr_t)s & -16);
 654   data = vld1q_u8 (p);
 655
 656   /* Create a mask for the bytes that are valid within the first
 657      16-byte block.  The Idea here is that the AND with the mask
 658      within the loop is "free", since we need some AND or TEST
 659      insn in order to set the flags for the branch anyway.  */
 660   mask = (-1u << misalign) & 0xffff;
 661
 662   /* Main loop, processing 16 bytes at a time.  */
 663   goto start;
 664
 665   do
 666     {
 667       uint8x8_t l;
 668       uint16x4_t m;
 669       uint32x2_t n;
 670       uint8x16_t t, u, v, w;
 671
 672       p += 16;
 673       data = vld1q_u8 (p);
 674       mask = 0xffff;
 675
 676     start:
 677       t = vceqq_u8 (data, repl_nl);
 678       u = vceqq_u8 (data, repl_cr);
 679       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 680       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 681       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 682       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 683       m = vpaddl_u8 (l);
 684       n = vpaddl_u16 (m);
 685
 686       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 687               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 688       found &= mask;
 689     }
 690   while (!found);
 691
 692   /* FOUND contains 1 in bits for which we matched a relevant
 693      character.  Conversion to the byte index is trivial.  */
 694   found = __builtin_ctz (found);
 695   return (const uchar *)p + found;
 696 }
 697
 698 #else
 699
/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */
 702
 703 #define search_line_fast  search_line_acc_char
 704
 705 #endif
 706
/* Initialize the lexer if needed.  When a vectorized scanner selection
   routine was compiled in (signalled by HAVE_init_vectorized_lexer),
   run it; otherwise there is nothing to do.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
 716
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at the buffer's next_line.  The note list is reset,
   and cur and line_base are set to the start of the line.  Escaped
   newlines are spliced out and, when the trigraphs option is on,
   trigraphs are replaced, all in place in the buffer; a note is added
   for each of them.  On return a '\n' has been stored at the end of
   the cleaned text, a sentinel note follows the last real note, and
   next_line points just past the line terminator that ended the
   scan.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;       /* Read position in the raw text.  */
  uchar c, *d, *p;      /* D is the write position, P a scratch pointer.  */

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The replacement character is written
                         over the first '?', and S is left on the last
                         character of the trigraph.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line the newline cannot be escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline: skip backwards over whitespace and
         see whether the character before it is the last backslash.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type records whether whitespace
         separated the backslash from the newline.  D is set so that the
         slow path's first store overwrites the backslash, and next_line
         is reused as the lower bound for later backward scans.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Copy the rest of the line down one character at a time, removing
         further escaped newlines and converting trigraphs as we go.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              /* Splice: subsequent output overwrites the backslash.  */
              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Replace the '?' just copied and skip the other two
                     characters of the trigraph.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* A from_stage3 buffer is only scanned for the end of the line; no
         escaped newlines or trigraphs are processed.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line with a plain newline.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
 860
 861 /* Return true if the trigraph indicated by NOTE should be warned
 862    about in a comment.  */
 863 static bool
 864 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 865 {
 866   const uchar *p;
 867
 868   /* Within comments we don't warn about trigraphs, unless the
 869      trigraph forms an escaped newline, as that may change
 870      behavior.  */
 871   if (note->type != '/')
 872     return false;
 873
 874   /* If -trigraphs, then this was an escaped newline iff the next note
 875      is coincident.  */
 876   if (CPP_OPTION (pfile, trigraphs))
 877     return note[1].pos == note->pos;
 878
 879   /* Otherwise, see if this forms an escaped newline.  */
 880   p = note->pos + 3;
 881   while (is_nvspace (*p))
 882     p++;
 883
 884   /* There might have been escaped newlines between the trigraph and the
 885      newline we found.  Hence the position test.  */
 886   return (*p == '\n' && p < note[1].pos);
 887 }
 888
 889 /* Process the notes created by add_line_note as far as the current
 890    location.  */
 891 void
 892 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 893 {
 894   cpp_buffer *buffer = pfile->buffer;
 895
 896   for (;;)
 897     {
 898       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 899       unsigned int col;
 900
 901       if (note->pos > buffer->cur)
 902         break;
 903
 904       buffer->cur_note++;
 905       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 906
 907       if (note->type == '\\' || note->type == ' ')
 908         {
 909           if (note->type == ' ' && !in_comment)
 910             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 911                                  "backslash and newline separated by space");
 912
 913           if (buffer->next_line > buffer->rlimit)
 914             {
 915               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 916                                    "backslash-newline at end of file");
 917               /* Prevent "no newline at end of file" warning.  */
 918               buffer->next_line = buffer->rlimit;
 919             }
 920
 921           buffer->line_base = note->pos;
 922           CPP_INCREMENT_LINE (pfile, 0);
 923         }
 924       else if (_cpp_trigraph_map[note->type])
 925         {
 926           if (CPP_OPTION (pfile, warn_trigraphs)
 927               && (!in_comment || warn_in_comment (pfile, note)))
 928             {
 929               if (CPP_OPTION (pfile, trigraphs))
 930                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 931                                        pfile->line_table->highest_line, col,
 932                                        "trigraph ??%c converted to %c",
 933                                        note->type,
 934                                        (int) _cpp_trigraph_map[note->type]);
 935               else
 936                 {
 937                   cpp_warning_with_line
 938                     (pfile, CPP_W_TRIGRAPHS,
 939                      pfile->line_table->highest_line, col,
 940                      "trigraph ??%c ignored, use -trigraphs to enable",
 941                      note->type);
 942                 }
 943             }
 944         }
 945       else if (note->type == 0)
 946         /* Already processed in lex_raw_string.  */;
 947       else
 948         abort ();
 949     }
 950 }
 951
 952 /* Skip a C-style block comment.  We find the end of the comment by
 953    seeing if an asterisk is before every '/' we encounter.  Returns
 954    nonzero if comment terminated by EOF, zero otherwise.
 955
 956    Buffer->cur points to the initial asterisk of the comment.  */
 957 bool
 958 _cpp_skip_block_comment (cpp_reader *pfile)
 959 {
 960   cpp_buffer *buffer = pfile->buffer;
 961   const uchar *cur = buffer->cur;
 962   uchar c;
 963
 964   cur++;
 965   if (*cur == '/')
 966     cur++;
 967
 968   for (;;)
 969     {
 970       /* People like decorating comments with '*', so check for '/'
 971          instead for efficiency.  */
 972       c = *cur++;
 973
 974       if (c == '/')
 975         {
 976           if (cur[-2] == '*')
 977             break;
 978
 979           /* Warn about potential nested comments, but not if the '/'
 980              comes immediately before the true comment delimiter.
 981              Don't bother to get it right across escaped newlines.  */
 982           if (CPP_OPTION (pfile, warn_comments)
 983               && cur[0] == '*' && cur[1] != '/')
 984             {
 985               buffer->cur = cur;
 986               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 987                                      pfile->line_table->highest_line,
 988                                      CPP_BUF_COL (buffer),
 989                                      "\"/*\" within comment");
 990             }
 991         }
 992       else if (c == '\n')
 993         {
 994           unsigned int cols;
 995           buffer->cur = cur - 1;
 996           _cpp_process_line_notes (pfile, true);
 997           if (buffer->next_line >= buffer->rlimit)
 998             return true;
 999           _cpp_clean_line (pfile);
1000
1001           cols = buffer->next_line - buffer->line_base;
1002           CPP_INCREMENT_LINE (pfile, cols);
1003
1004           cur = buffer->cur;
1005         }
1006     }
1007
1008   buffer->cur = cur;
1009   _cpp_process_line_notes (pfile, true);
1010   return false;
1011 }
1012
1013 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1014    terminating newline.  Handles escaped newlines.  Returns nonzero
1015    if a multiline comment.  */
1016 static int
1017 skip_line_comment (cpp_reader *pfile)
1018 {
1019   cpp_buffer *buffer = pfile->buffer;
1020   source_location orig_line = pfile->line_table->highest_line;
1021
1022   while (*buffer->cur != '\n')
1023     buffer->cur++;
1024
1025   _cpp_process_line_notes (pfile, true);
1026   return orig_line != pfile->line_table->highest_line;
1027 }
1028
1029 /* Skips whitespace, saving the next non-whitespace character.  */
1030 static void
1031 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1032 {
1033   cpp_buffer *buffer = pfile->buffer;
1034   bool saw_NUL = false;
1035
1036   do
1037     {
1038       /* Horizontal space always OK.  */
1039       if (c == ' ' || c == '\t')
1040         ;
1041       /* Just \f \v or \0 left.  */
1042       else if (c == '\0')
1043         saw_NUL = true;
1044       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1045         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1046                              CPP_BUF_COL (buffer),
1047                              "%s in preprocessing directive",
1048                              c == '\f' ? "form feed" : "vertical tab");
1049
1050       c = *buffer->cur++;
1051     }
1052   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1053   while (is_nvspace (c));
1054
1055   if (saw_NUL)
1056     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1057
1058   buffer->cur--;
1059 }
1060
1061 /* See if the characters of a number token are valid in a name (no
1062    '.', '+' or '-').  */
1063 static int
1064 name_p (cpp_reader *pfile, const cpp_string *string)
1065 {
1066   unsigned int i;
1067
1068   for (i = 0; i < string->len; i++)
1069     if (!is_idchar (string->text[i]))
1070       return 0;
1071
1072   return 1;
1073 }
1074
1075 /* After parsing an identifier or other sequence, produce a warning about
1076    sequences not in NFC/NFKC.  */
1077 static void
1078 warn_about_normalization (cpp_reader *pfile,
1079                           const cpp_token *token,
1080                           const struct normalize_state *s)
1081 {
1082   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1083       && !pfile->state.skipping)
1084     {
1085       /* Make sure that the token is printed using UCNs, even
1086          if we'd otherwise happily print UTF-8.  */
1087       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1088       size_t sz;
1089
1090       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1091       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1092         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1093                                "`%.*s' is not in NFKC", (int) sz, buf);
1094       else
1095         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1096                                "`%.*s' is not in NFC", (int) sz, buf);
1097     }
1098 }
1099
1100 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1101    an identifier.  FIRST is TRUE if this starts an identifier.  */
1102 static bool
1103 forms_identifier_p (cpp_reader *pfile, int first,
1104                     struct normalize_state *state)
1105 {
1106   cpp_buffer *buffer = pfile->buffer;
1107
1108   if (*buffer->cur == '$')
1109     {
1110       if (!CPP_OPTION (pfile, dollars_in_ident))
1111         return false;
1112
1113       buffer->cur++;
1114       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1115         {
1116           CPP_OPTION (pfile, warn_dollars) = 0;
1117           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1118         }
1119
1120       return true;
1121     }
1122
1123   /* Is this a syntactically valid UCN?  */
1124   if (CPP_OPTION (pfile, extended_identifiers)
1125       && *buffer->cur == '\\'
1126       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1127     {
1128       buffer->cur += 2;
1129       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1130                           state))
1131         return true;
1132       buffer->cur -= 2;
1133     }
1134
1135   return false;
1136 }
1137
1138 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1139 static cpp_hashnode *
1140 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1141 {
1142   cpp_hashnode *result;
1143   const uchar *cur;
1144   unsigned int len;
1145   unsigned int hash = HT_HASHSTEP (0, *base);
1146
1147   cur = base + 1;
1148   while (ISIDNUM (*cur))
1149     {
1150       hash = HT_HASHSTEP (hash, *cur);
1151       cur++;
1152     }
1153   len = cur - base;
1154   hash = HT_HASHFINISH (hash, len);
1155   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1156                                               base, len, hash, HT_ALLOC));
1157
1158   /* Rarely, identifiers require diagnostics when lexed.  */
1159   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1160                         && !pfile->state.skipping, 0))
1161     {
1162       /* It is allowed to poison the same identifier twice.  */
1163       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1164         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1165                    NODE_NAME (result));
1166
1167       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1168          replacement list of a variadic macro.  */
1169       if (result == pfile->spec_nodes.n__VA_ARGS__
1170           && !pfile->state.va_args_ok)
1171         cpp_error (pfile, CPP_DL_PEDWARN,
1172                    "__VA_ARGS__ can only appear in the expansion"
1173                    " of a C99 variadic macro");
1174
1175       /* For -Wc++-compat, warn about use of C++ named operators.  */
1176       if (result->flags & NODE_WARN_OPERATOR)
1177         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1178                      "identifier \"%s\" is a special operator name in C++",
1179                      NODE_NAME (result));
1180     }
1181
1182   return result;
1183 }
1184
1185 /* Get the cpp_hashnode of an identifier specified by NAME in
1186    the current cpp_reader object.  If none is found, NULL is returned.  */
1187 cpp_hashnode *
1188 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1189 {
1190   cpp_hashnode *result;
1191   result = lex_identifier_intern (pfile, (uchar *) name);
1192   return result;
1193 }
1194
1195 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1196 static cpp_hashnode *
1197 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1198                 struct normalize_state *nst)
1199 {
1200   cpp_hashnode *result;
1201   const uchar *cur;
1202   unsigned int len;
1203   unsigned int hash = HT_HASHSTEP (0, *base);
1204
1205   cur = pfile->buffer->cur;
1206   if (! starts_ucn)
1207     while (ISIDNUM (*cur))
1208       {
1209         hash = HT_HASHSTEP (hash, *cur);
1210         cur++;
1211       }
1212   pfile->buffer->cur = cur;
1213   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1214     {
1215       /* Slower version for identifiers containing UCNs (or $).  */
1216       do {
1217         while (ISIDNUM (*pfile->buffer->cur))
1218           {
1219             pfile->buffer->cur++;
1220             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1221           }
1222       } while (forms_identifier_p (pfile, false, nst));
1223       result = _cpp_interpret_identifier (pfile, base,
1224                                           pfile->buffer->cur - base);
1225     }
1226   else
1227     {
1228       len = cur - base;
1229       hash = HT_HASHFINISH (hash, len);
1230
1231       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1232                                                   base, len, hash, HT_ALLOC));
1233     }
1234
1235   /* Rarely, identifiers require diagnostics when lexed.  */
1236   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1237                         && !pfile->state.skipping, 0))
1238     {
1239       /* It is allowed to poison the same identifier twice.  */
1240       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1241         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1242                    NODE_NAME (result));
1243
1244       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1245          replacement list of a variadic macro.  */
1246       if (result == pfile->spec_nodes.n__VA_ARGS__
1247           && !pfile->state.va_args_ok)
1248         cpp_error (pfile, CPP_DL_PEDWARN,
1249                    "__VA_ARGS__ can only appear in the expansion"
1250                    " of a C99 variadic macro");
1251
1252       /* For -Wc++-compat, warn about use of C++ named operators.  */
1253       if (result->flags & NODE_WARN_OPERATOR)
1254         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1255                      "identifier \"%s\" is a special operator name in C++",
1256                      NODE_NAME (result));
1257     }
1258
1259   return result;
1260 }
1261
1262 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1263 static void
1264 lex_number (cpp_reader *pfile, cpp_string *number,
1265             struct normalize_state *nst)
1266 {
1267   const uchar *cur;
1268   const uchar *base;
1269   uchar *dest;
1270
1271   base = pfile->buffer->cur - 1;
1272   do
1273     {
1274       cur = pfile->buffer->cur;
1275
1276       /* N.B. ISIDNUM does not include $.  */
1277       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1278         {
1279           cur++;
1280           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1281         }
1282
1283       pfile->buffer->cur = cur;
1284     }
1285   while (forms_identifier_p (pfile, false, nst));
1286
1287   number->len = cur - base;
1288   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1289   memcpy (dest, base, number->len);
1290   dest[number->len] = '\0';
1291   number->text = dest;
1292 }
1293
1294 /* Create a token of type TYPE with a literal spelling.  */
1295 static void
1296 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1297                 unsigned int len, enum cpp_ttype type)
1298 {
1299   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1300
1301   memcpy (dest, base, len);
1302   dest[len] = '\0';
1303   token->type = type;
1304   token->val.str.len = len;
1305   token->val.str.text = dest;
1306 }
1307
1308 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1309    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1310
1311 static void
1312 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1313                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1314 {
1315   _cpp_buff *first_buff = *first_buff_p;
1316   _cpp_buff *last_buff = *last_buff_p;
1317
1318   if (first_buff == NULL)
1319     first_buff = last_buff = _cpp_get_buff (pfile, len);
1320   else if (len > BUFF_ROOM (last_buff))
1321     {
1322       size_t room = BUFF_ROOM (last_buff);
1323       memcpy (BUFF_FRONT (last_buff), base, room);
1324       BUFF_FRONT (last_buff) += room;
1325       base += room;
1326       len -= room;
1327       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1328     }
1329
1330   memcpy (BUFF_FRONT (last_buff), base, len);
1331   BUFF_FRONT (last_buff) += len;
1332
1333   *first_buff_p = first_buff;
1334   *last_buff_p = last_buff;
1335 }
1336
1337 /* Lexes a raw string.  The stored string contains the spelling, including
1338    double quotes, delimiter string, '(' and ')', any leading
1339    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1340    literal, or CPP_OTHER if it was not properly terminated.
1341
1342    The spelling is NUL-terminated, but it is not guaranteed that this
1343    is the first NUL since embedded NULs are preserved.  */
1344
1345 static void
1346 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1347                 const uchar *cur)
1348 {
1349   const uchar *raw_prefix;
1350   unsigned int raw_prefix_len = 0;
1351   enum cpp_ttype type;
1352   size_t total_len = 0;
1353   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1354   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1355
1356   type = (*base == 'L' ? CPP_WSTRING :
1357           *base == 'U' ? CPP_STRING32 :
1358           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1359           : CPP_STRING);
1360
1361   raw_prefix = cur + 1;
1362   while (raw_prefix_len < 16)
1363     {
1364       switch (raw_prefix[raw_prefix_len])
1365         {
1366         case ' ': case '(': case ')': case '\\': case '\t':
1367         case '\v': case '\f': case '\n': default:
1368           break;
1369         /* Basic source charset except the above chars.  */
1370         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1371         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1372         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1373         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1374         case 'y': case 'z':
1375         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1376         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1377         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1378         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1379         case 'Y': case 'Z':
1380         case '0': case '1': case '2': case '3': case '4': case '5':
1381         case '6': case '7': case '8': case '9':
1382         case '_': case '{': case '}': case '#': case '[': case ']':
1383         case '<': case '>': case '%': case ':': case ';': case '.':
1384         case '?': case '*': case '+': case '-': case '/': case '^':
1385         case '&': case '|': case '~': case '!': case '=': case ',':
1386         case '"': case '\'':
1387           raw_prefix_len++;
1388           continue;
1389         }
1390       break;
1391     }
1392
1393   if (raw_prefix[raw_prefix_len] != '(')
1394     {
1395       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1396                 + 1;
1397       if (raw_prefix_len == 16)
1398         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1399                              "raw string delimiter longer than 16 characters");
1400       else
1401         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1402                              "invalid character '%c' in raw string delimiter",
1403                              (int) raw_prefix[raw_prefix_len]);
1404       pfile->buffer->cur = raw_prefix - 1;
1405       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1406       return;
1407     }
1408
1409   cur = raw_prefix + raw_prefix_len + 1;
1410   for (;;)
1411     {
1412 #define BUF_APPEND(STR,LEN)                                     \
1413       do {                                                      \
1414         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1415                         &first_buff, &last_buff);               \
1416         total_len += (LEN);                                     \
1417       } while (0);
1418
1419       cppchar_t c;
1420
1421       /* If we previously performed any trigraph or line splicing
1422          transformations, undo them within the body of the raw string.  */
1423       while (note->pos < cur)
1424         ++note;
1425       for (; note->pos == cur; ++note)
1426         {
1427           switch (note->type)
1428             {
1429             case '\\':
1430             case ' ':
1431               /* Restore backslash followed by newline.  */
1432               BUF_APPEND (base, cur - base);
1433               base = cur;
1434               BUF_APPEND ("\\", 1);
1435             after_backslash:
1436               if (note->type == ' ')
1437                 {
1438                   /* GNU backslash whitespace newline extension.  FIXME
1439                      could be any sequence of non-vertical space.  When we
1440                      can properly restore any such sequence, we should mark
1441                      this note as handled so _cpp_process_line_notes
1442                      doesn't warn.  */
1443                   BUF_APPEND (" ", 1);
1444                 }
1445
1446               BUF_APPEND ("\n", 1);
1447               break;
1448
1449             case 0:
1450               /* Already handled.  */
1451               break;
1452
1453             default:
1454               if (_cpp_trigraph_map[note->type])
1455                 {
1456                   /* Don't warn about this trigraph in
1457                      _cpp_process_line_notes, since trigraphs show up as
1458                      trigraphs in raw strings.  */
1459                   uchar type = note->type;
1460                   note->type = 0;
1461
1462                   if (!CPP_OPTION (pfile, trigraphs))
1463                     /* If we didn't convert the trigraph in the first
1464                        place, don't do anything now either.  */
1465                     break;
1466
1467                   BUF_APPEND (base, cur - base);
1468                   base = cur;
1469                   BUF_APPEND ("??", 2);
1470
1471                   /* ??/ followed by newline gets two line notes, one for
1472                      the trigraph and one for the backslash/newline.  */
1473                   if (type == '/' && note[1].pos == cur)
1474                     {
1475                       if (note[1].type != '\\'
1476                           && note[1].type != ' ')
1477                         abort ();
1478                       BUF_APPEND ("/", 1);
1479                       ++note;
1480                       goto after_backslash;
1481                     }
1482                   /* The ) from ??) could be part of the suffix.  */
1483                   else if (type == ')'
1484                            && strncmp ((const char *) cur+1,
1485                                        (const char *) raw_prefix,
1486                                        raw_prefix_len) == 0
1487                            && cur[raw_prefix_len+1] == '"')
1488                     {
1489                       BUF_APPEND (")", 1);
1490                       base++;
1491                       cur += raw_prefix_len + 2;
1492                       goto break_outer_loop;
1493                     }
1494                   else
1495                     {
1496                       /* Skip the replacement character.  */
1497                       base = ++cur;
1498                       BUF_APPEND (&type, 1);
1499                     }
1500                 }
1501               else
1502                 abort ();
1503               break;
1504             }
1505         }
1506       c = *cur++;
1507
1508       if (c == ')'
1509           && strncmp ((const char *) cur, (const char *) raw_prefix,
1510                       raw_prefix_len) == 0
1511           && cur[raw_prefix_len] == '"')
1512         {
1513           cur += raw_prefix_len + 1;
1514           break;
1515         }
1516       else if (c == '\n')
1517         {
1518           if (pfile->state.in_directive
1519               || pfile->state.parsing_args
1520               || pfile->state.in_deferred_pragma)
1521             {
1522               cur--;
1523               type = CPP_OTHER;
1524               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1525                                    "unterminated raw string");
1526               break;
1527             }
1528
1529           BUF_APPEND (base, cur - base);
1530
1531           if (pfile->buffer->cur < pfile->buffer->rlimit)
1532             CPP_INCREMENT_LINE (pfile, 0);
1533           pfile->buffer->need_line = true;
1534
1535           pfile->buffer->cur = cur-1;
1536           _cpp_process_line_notes (pfile, false);
1537           if (!_cpp_get_fresh_line (pfile))
1538             {
1539               source_location src_loc = token->src_loc;
1540               token->type = CPP_EOF;
1541               /* Tell the compiler the line number of the EOF token.  */
1542               token->src_loc = pfile->line_table->highest_line;
1543               token->flags = BOL;
1544               if (first_buff != NULL)
1545                 _cpp_release_buff (pfile, first_buff);
1546               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1547                                    "unterminated raw string");
1548               return;
1549             }
1550
1551           cur = base = pfile->buffer->cur;
1552           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1553         }
1554     }
1555  break_outer_loop:
1556
1557   if (CPP_OPTION (pfile, user_literals))
1558     {
1559       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1560          underscore is ill-formed.  Since this breaks programs using macros
1561          from inttypes.h, we generate a warning and treat the ud-suffix as a
1562          separate preprocessing token.  This approach is under discussion by
1563          the standards committee, and has been adopted as a conforming
1564          extension by other front ends such as clang. */
1565       if (ISALPHA (*cur))
1566         {
1567           /* Raise a warning, but do not consume subsequent tokens.  */
1568           if (CPP_OPTION (pfile, warn_literal_suffix))
1569             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1570                                    token->src_loc, 0,
1571                                    "invalid suffix on literal; C++11 requires "
1572                                    "a space between literal and identifier");
1573         }
1574       /* Grab user defined literal suffix.  */
1575       else if (*cur == '_')
1576         {
1577           type = cpp_userdef_string_add_type (type);
1578           ++cur;
1579
1580           while (ISIDNUM (*cur))
1581             ++cur;
1582         }
1583     }
1584
1585   pfile->buffer->cur = cur;
1586   if (first_buff == NULL)
1587     create_literal (pfile, token, base, cur - base, type);
1588   else
1589     {
1590       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1591
1592       token->type = type;
1593       token->val.str.len = total_len + (cur - base);
1594       token->val.str.text = dest;
1595       last_buff = first_buff;
1596       while (last_buff != NULL)
1597         {
1598           memcpy (dest, last_buff->base,
1599                   BUFF_FRONT (last_buff) - last_buff->base);
1600           dest += BUFF_FRONT (last_buff) - last_buff->base;
1601           last_buff = last_buff->next;
1602         }
1603       _cpp_release_buff (pfile, first_buff);
1604       memcpy (dest, base, cur - base);
1605       dest[cur - base] = '\0';
1606     }
1607 }
1608
1609 /* Lexes a string, character constant, or angle-bracketed header file
1610    name.  The stored string contains the spelling, including opening
1611    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1612    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1613    if it was not properly terminated, or CPP_LESS for an unterminated
1614    header name which must be relexed as normal tokens.
1615
1616    The spelling is NUL-terminated, but it is not guaranteed that this
1617    is the first NUL since embedded NULs are preserved.  */
1618 static void
1619 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1620 {
1621   bool saw_NUL = false;
1622   const uchar *cur;
1623   cppchar_t terminator;
1624   enum cpp_ttype type;
1625
1626   cur = base;
1627   terminator = *cur++;
1628   if (terminator == 'L' || terminator == 'U')
1629     terminator = *cur++;
1630   else if (terminator == 'u')
1631     {
1632       terminator = *cur++;
1633       if (terminator == '8')
1634         terminator = *cur++;
1635     }
1636   if (terminator == 'R')
1637     {
1638       lex_raw_string (pfile, token, base, cur);
1639       return;
1640     }
1641   if (terminator == '"')
1642     type = (*base == 'L' ? CPP_WSTRING :
1643             *base == 'U' ? CPP_STRING32 :
1644             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1645                          : CPP_STRING);
1646   else if (terminator == '\'')
1647     type = (*base == 'L' ? CPP_WCHAR :
1648             *base == 'U' ? CPP_CHAR32 :
1649             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1650   else
1651     terminator = '>', type = CPP_HEADER_NAME;
1652
1653   for (;;)
1654     {
1655       cppchar_t c = *cur++;
1656
1657       /* In #include-style directives, terminators are not escapable.  */
1658       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1659         cur++;
1660       else if (c == terminator)
1661         break;
1662       else if (c == '\n')
1663         {
1664           cur--;
1665           /* Unmatched quotes always yield undefined behavior, but
1666              greedy lexing means that what appears to be an unterminated
1667              header name may actually be a legitimate sequence of tokens.  */
1668           if (terminator == '>')
1669             {
1670               token->type = CPP_LESS;
1671               return;
1672             }
1673           type = CPP_OTHER;
1674           break;
1675         }
1676       else if (c == '\0')
1677         saw_NUL = true;
1678     }
1679
1680   if (saw_NUL && !pfile->state.skipping)
1681     cpp_error (pfile, CPP_DL_WARNING,
1682                "null character(s) preserved in literal");
1683
1684   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1685     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1686                (int) terminator);
1687
1688   if (CPP_OPTION (pfile, user_literals))
1689     {
1690       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1691          underscore is ill-formed.  Since this breaks programs using macros
1692          from inttypes.h, we generate a warning and treat the ud-suffix as a
1693          separate preprocessing token.  This approach is under discussion by
1694          the standards committee, and has been adopted as a conforming
1695          extension by other front ends such as clang. */
1696       if (ISALPHA (*cur))
1697         {
1698           /* Raise a warning, but do not consume subsequent tokens.  */
1699           if (CPP_OPTION (pfile, warn_literal_suffix))
1700             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1701                                    token->src_loc, 0,
1702                                    "invalid suffix on literal; C++11 requires "
1703                                    "a space between literal and identifier");
1704         }
1705       /* Grab user defined literal suffix.  */
1706       else if (*cur == '_')
1707         {
1708           type = cpp_userdef_char_add_type (type);
1709           type = cpp_userdef_string_add_type (type);
1710           ++cur;
1711
1712           while (ISIDNUM (*cur))
1713             ++cur;
1714         }
1715     }
1716
1717   pfile->buffer->cur = cur;
1718   create_literal (pfile, token, base, cur - base, type);
1719 }
1720
1721 /* Return the comment table. The client may not make any assumption
1722    about the ordering of the table.  */
1723 cpp_comment_table *
1724 cpp_get_comments (cpp_reader *pfile)
1725 {
1726   return &pfile->comments;
1727 }
1728
1729 /* Append a comment to the end of the comment table. */
1730 static void
1731 store_comment (cpp_reader *pfile, cpp_token *token)
1732 {
1733   int len;
1734
1735   if (pfile->comments.allocated == 0)
1736     {
1737       pfile->comments.allocated = 256;
1738       pfile->comments.entries = (cpp_comment *) xmalloc
1739         (pfile->comments.allocated * sizeof (cpp_comment));
1740     }
1741
1742   if (pfile->comments.count == pfile->comments.allocated)
1743     {
1744       pfile->comments.allocated *= 2;
1745       pfile->comments.entries = (cpp_comment *) xrealloc
1746         (pfile->comments.entries,
1747          pfile->comments.allocated * sizeof (cpp_comment));
1748     }
1749
1750   len = token->val.str.len;
1751
1752   /* Copy comment. Note, token may not be NULL terminated. */
1753   pfile->comments.entries[pfile->comments.count].comment =
1754     (char *) xmalloc (sizeof (char) * (len + 1));
1755   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1756           token->val.str.text, len);
1757   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1758
1759   /* Set source location. */
1760   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1761
1762   /* Increment the count of entries in the comment table. */
1763   pfile->comments.count++;
1764 }
1765
1766 /* The stored comment includes the comment start and any terminator.  */
1767 static void
1768 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1769               cppchar_t type)
1770 {
1771   unsigned char *buffer;
1772   unsigned int len, clen, i;
1773
1774   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1775
1776   /* C++ comments probably (not definitely) have moved past a new
1777      line, which we don't want to save in the comment.  */
1778   if (is_vspace (pfile->buffer->cur[-1]))
1779     len--;
1780
1781   /* If we are currently in a directive or in argument parsing, then
1782      we need to store all C++ comments as C comments internally, and
1783      so we need to allocate a little extra space in that case.
1784
1785      Note that the only time we encounter a directive here is
1786      when we are saving comments in a "#define".  */
1787   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1788           && type == '/') ? len + 2 : len;
1789
1790   buffer = _cpp_unaligned_alloc (pfile, clen);
1791
1792   token->type = CPP_COMMENT;
1793   token->val.str.len = clen;
1794   token->val.str.text = buffer;
1795
1796   buffer[0] = '/';
1797   memcpy (buffer + 1, from, len - 1);
1798
1799   /* Finish conversion to a C comment, if necessary.  */
1800   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1801     {
1802       buffer[1] = '*';
1803       buffer[clen - 2] = '*';
1804       buffer[clen - 1] = '/';
1805       /* As there can be in a C++ comments illegal sequences for C comments
1806          we need to filter them out.  */
1807       for (i = 2; i < (clen - 2); i++)
1808         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1809           buffer[i] = '|';
1810     }
1811
1812   /* Finally store this comment for use by clients of libcpp. */
1813   store_comment (pfile, token);
1814 }
1815
1816 /* Allocate COUNT tokens for RUN.  */
1817 void
1818 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1819 {
1820   run->base = XNEWVEC (cpp_token, count);
1821   run->limit = run->base + count;
1822   run->next = NULL;
1823 }
1824
1825 /* Returns the next tokenrun, or creates one if there is none.  */
1826 static tokenrun *
1827 next_tokenrun (tokenrun *run)
1828 {
1829   if (run->next == NULL)
1830     {
1831       run->next = XNEW (tokenrun);
1832       run->next->prev = run;
1833       _cpp_init_tokenrun (run->next, 250);
1834     }
1835
1836   return run->next;
1837 }
1838
1839 /* Return the number of not yet processed token in a given
1840    context.  */
1841 int
1842 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1843 {
1844   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1845     return (LAST (context).token - FIRST (context).token);
1846   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1847            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1848     return (LAST (context).ptoken - FIRST (context).ptoken);
1849   else
1850       abort ();
1851 }
1852
1853 /* Returns the token present at index INDEX in a given context.  If
1854    INDEX is zero, the next token to be processed is returned.  */
1855 static const cpp_token*
1856 _cpp_token_from_context_at (cpp_context *context, int index)
1857 {
1858   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1859     return &(FIRST (context).token[index]);
1860   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1861            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1862     return FIRST (context).ptoken[index];
1863  else
1864    abort ();
1865 }
1866
1867 /* Look ahead in the input stream.  */
1868 const cpp_token *
1869 cpp_peek_token (cpp_reader *pfile, int index)
1870 {
1871   cpp_context *context = pfile->context;
1872   const cpp_token *peektok;
1873   int count;
1874
1875   /* First, scan through any pending cpp_context objects.  */
1876   while (context->prev)
1877     {
1878       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1879
1880       if (index < (int) sz)
1881         return _cpp_token_from_context_at (context, index);
1882       index -= (int) sz;
1883       context = context->prev;
1884     }
1885
1886   /* We will have to read some new tokens after all (and do so
1887      without invalidating preceding tokens).  */
1888   count = index;
1889   pfile->keep_tokens++;
1890
1891   do
1892     {
1893       peektok = _cpp_lex_token (pfile);
1894       if (peektok->type == CPP_EOF)
1895         return peektok;
1896     }
1897   while (index--);
1898
1899   _cpp_backup_tokens_direct (pfile, count + 1);
1900   pfile->keep_tokens--;
1901
1902   return peektok;
1903 }
1904
1905 /* Allocate a single token that is invalidated at the same time as the
1906    rest of the tokens on the line.  Has its line and col set to the
1907    same as the last lexed token, so that diagnostics appear in the
1908    right place.  */
1909 cpp_token *
1910 _cpp_temp_token (cpp_reader *pfile)
1911 {
1912   cpp_token *old, *result;
1913   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1914   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1915
1916   old = pfile->cur_token - 1;
1917   /* Any pre-existing lookaheads must not be clobbered.  */
1918   if (la)
1919     {
1920       if (sz <= la)
1921         {
1922           tokenrun *next = next_tokenrun (pfile->cur_run);
1923
1924           if (sz < la)
1925             memmove (next->base + 1, next->base,
1926                      (la - sz) * sizeof (cpp_token));
1927
1928           next->base[0] = pfile->cur_run->limit[-1];
1929         }
1930
1931       if (sz > 1)
1932         memmove (pfile->cur_token + 1, pfile->cur_token,
1933                  MIN (la, sz - 1) * sizeof (cpp_token));
1934     }
1935
1936   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1937     {
1938       pfile->cur_run = next_tokenrun (pfile->cur_run);
1939       pfile->cur_token = pfile->cur_run->base;
1940     }
1941
1942   result = pfile->cur_token++;
1943   result->src_loc = old->src_loc;
1944   return result;
1945 }
1946
1947 /* Lex a token into RESULT (external interface).  Takes care of issues
1948    like directive handling, token lookahead, multiple include
1949    optimization and skipping.  */
1950 const cpp_token *
1951 _cpp_lex_token (cpp_reader *pfile)
1952 {
1953   cpp_token *result;
1954
1955   for (;;)
1956     {
1957       if (pfile->cur_token == pfile->cur_run->limit)
1958         {
1959           pfile->cur_run = next_tokenrun (pfile->cur_run);
1960           pfile->cur_token = pfile->cur_run->base;
1961         }
1962       /* We assume that the current token is somewhere in the current
1963          run.  */
1964       if (pfile->cur_token < pfile->cur_run->base
1965           || pfile->cur_token >= pfile->cur_run->limit)
1966         abort ();
1967
1968       if (pfile->lookaheads)
1969         {
1970           pfile->lookaheads--;
1971           result = pfile->cur_token++;
1972         }
1973       else
1974         result = _cpp_lex_direct (pfile);
1975
1976       if (result->flags & BOL)
1977         {
1978           /* Is this a directive.  If _cpp_handle_directive returns
1979              false, it is an assembler #.  */
1980           if (result->type == CPP_HASH
1981               /* 6.10.3 p 11: Directives in a list of macro arguments
1982                  gives undefined behavior.  This implementation
1983                  handles the directive as normal.  */
1984               && pfile->state.parsing_args != 1)
1985             {
1986               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1987                 {
1988                   if (pfile->directive_result.type == CPP_PADDING)
1989                     continue;
1990                   result = &pfile->directive_result;
1991                 }
1992             }
1993           else if (pfile->state.in_deferred_pragma)
1994             result = &pfile->directive_result;
1995
1996           if (pfile->cb.line_change && !pfile->state.skipping)
1997             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1998         }
1999
2000       /* We don't skip tokens in directives.  */
2001       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2002         break;
2003
2004       /* Outside a directive, invalidate controlling macros.  At file
2005          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2006          get here and MI optimization works.  */
2007       pfile->mi_valid = false;
2008
2009       if (!pfile->state.skipping || result->type == CPP_EOF)
2010         break;
2011     }
2012
2013   return result;
2014 }
2015
2016 /* Returns true if a fresh line has been loaded.  */
2017 bool
2018 _cpp_get_fresh_line (cpp_reader *pfile)
2019 {
2020   int return_at_eof;
2021
2022   /* We can't get a new line until we leave the current directive.  */
2023   if (pfile->state.in_directive)
2024     return false;
2025
2026   for (;;)
2027     {
2028       cpp_buffer *buffer = pfile->buffer;
2029
2030       if (!buffer->need_line)
2031         return true;
2032
2033       if (buffer->next_line < buffer->rlimit)
2034         {
2035           _cpp_clean_line (pfile);
2036           return true;
2037         }
2038
2039       /* First, get out of parsing arguments state.  */
2040       if (pfile->state.parsing_args)
2041         return false;
2042
2043       /* End of buffer.  Non-empty files should end in a newline.  */
2044       if (buffer->buf != buffer->rlimit
2045           && buffer->next_line > buffer->rlimit
2046           && !buffer->from_stage3)
2047         {
2048           /* Clip to buffer size.  */
2049           buffer->next_line = buffer->rlimit;
2050         }
2051
2052       return_at_eof = buffer->return_at_eof;
2053       _cpp_pop_buffer (pfile);
2054       if (pfile->buffer == NULL || return_at_eof)
2055         return false;
2056     }
2057 }
2058
/* Helper for _cpp_lex_direct; relies on the local variables `buffer'
   and `result' of the expansion site.  If the next input character is
   CHAR, consume it and give the token type THEN_TYPE; otherwise leave
   the input alone and give it type ELSE_TYPE.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
2067
2068 /* Lex a token into pfile->cur_token, which is also incremented, to
2069    get diagnostics pointing to the correct location.
2070
2071    Does not handle issues such as token lookahead, multiple-include
2072    optimization, directives, skipping etc.  This function is only
2073    suitable for use by _cpp_lex_token, and in special cases like
2074    lex_expansion_token which doesn't care for any of these issues.
2075
2076    When meeting a newline, returns CPP_EOF if parsing a directive,
2077    otherwise returns to the start of the token buffer if permissible.
2078    Returns a pointer to the lexed token; its src_loc field holds the
        location.

        Three labels structure the scan: fresh_line (reached on entry and
        after each newline) resets the flags and fetches a new line when
        one is needed; update_tokens_line (reached after a discarded
        comment) refreshes the token's location; skipped_white (reached
        after horizontal whitespace) processes pending line notes and
        reads the next character.  */
2079 cpp_token *
2080 _cpp_lex_direct (cpp_reader *pfile)
2081 {
2082   cppchar_t c;
2083   cpp_buffer *buffer;
2084   const unsigned char *comment_start;
       /* Claim the next slot of the current token run for the result.  */
2085   cpp_token *result = pfile->cur_token++;
2086
2087  fresh_line:
2088   result->flags = 0;
2089   buffer = pfile->buffer;
2090   if (buffer->need_line)
2091     {
           /* The line of a deferred pragma is finished: report its end
              and leave deferred-pragma state.  */
2092       if (pfile->state.in_deferred_pragma)
2093         {
2094           result->type = CPP_PRAGMA_EOL;
2095           pfile->state.in_deferred_pragma = false;
2096           if (!pfile->state.pragma_allow_expansion)
2097             pfile->state.prevent_expansion--;
2098           return result;
2099         }
           /* No further line can be supplied: hand back CPP_EOF.  */
2100       if (!_cpp_get_fresh_line (pfile))
2101         {
2102           result->type = CPP_EOF;
2103           if (!pfile->state.in_directive)
2104             {
2105               /* Tell the compiler the line number of the EOF token.  */
2106               result->src_loc = pfile->line_table->highest_line;
2107               result->flags = BOL;
2108             }
2109           return result;
2110         }
           /* Unless tokens are being kept, restart at the beginning of
              the base run so its storage is reused for the new line.  */
2111       if (!pfile->keep_tokens)
2112         {
2113           pfile->cur_run = &pfile->base_run;
2114           result = pfile->base_run.base;
2115           pfile->cur_token = result + 1;
2116         }
2117       result->flags = BOL;
2118       if (pfile->state.parsing_args == 2)
2119         result->flags |= PREV_WHITE;
2120     }
       /* Re-read the buffer pointer: _cpp_get_fresh_line may have popped
          the buffer that was current above.  */
2121   buffer = pfile->buffer;
2122  update_tokens_line:
2123   result->src_loc = pfile->line_table->highest_line;
2124
2125  skipped_white:
2126   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2127       && !pfile->overlaid_buffer)
2128     {
2129       _cpp_process_line_notes (pfile, false);
2130       result->src_loc = pfile->line_table->highest_line;
2131     }
       /* From here on buffer->cur points one past the character being
          dispatched on, hence the `buffer->cur - 1' in the cases below.  */
2132   c = *buffer->cur++;
2133
2134   if (pfile->forced_token_location_p)
2135     result->src_loc = *pfile->forced_token_location_p;
2136   else
2137     result->src_loc = linemap_position_for_column (pfile->line_table,
2138                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2139
2140   switch (c)
2141     {
2142     case ' ': case '\t': case '\f': case '\v': case '\0':
2143       result->flags |= PREV_WHITE;
2144       skip_whitespace (pfile, c);
2145       goto skipped_white;
2146
2147     case '\n':
           /* The line is used up: request a new one and start over.  */
2148       if (buffer->cur < buffer->rlimit)
2149         CPP_INCREMENT_LINE (pfile, 0);
2150       buffer->need_line = true;
2151       goto fresh_line;
2152
2153     case '0': case '1': case '2': case '3': case '4':
2154     case '5': case '6': case '7': case '8': case '9':
2155       {
2156         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2157         result->type = CPP_NUMBER;
2158         lex_number (pfile, &result->val.str, &nst);
2159         warn_about_normalization (pfile, result, &nst);
2160         break;
2161       }
2162
2163     case 'L':
2164     case 'u':
2165     case 'U':
2166     case 'R':
2167       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2168          wide strings or raw strings.  */
2169       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2170           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2171         {
2172           if ((*buffer->cur == '\'' && c != 'R')
2173               || *buffer->cur == '"'
2174               || (*buffer->cur == 'R'
2175                   && c != 'R'
2176                   && buffer->cur[1] == '"'
2177                   && CPP_OPTION (pfile, rliterals))
2178               || (*buffer->cur == '8'
2179                   && c == 'u'
2180                   && (buffer->cur[1] == '"'
2181                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2182                           && CPP_OPTION (pfile, rliterals)))))
2183             {
2184               lex_string (pfile, result, buffer->cur - 1);
2185               break;
2186             }
2187         }
2188       /* Fall through.  */
2189
         /* Identifier start characters.  'L', 'u', 'U' and 'R' are absent
            from the list below because they are handled above and fall
            through to here when they do not introduce a literal.  */
2190     case '_':
2191     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2192     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2193     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2194     case 's': case 't':           case 'v': case 'w': case 'x':
2195     case 'y': case 'z':
2196     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2197     case 'G': case 'H': case 'I': case 'J': case 'K':
2198     case 'M': case 'N': case 'O': case 'P': case 'Q':
2199     case 'S': case 'T':           case 'V': case 'W': case 'X':
2200     case 'Y': case 'Z':
2201       result->type = CPP_NAME;
2202       {
2203         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2204         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2205                                                 &nst);
2206         warn_about_normalization (pfile, result, &nst);
2207       }
2208
2209       /* Convert named operators to their proper types.  */
2210       if (result->val.node.node->flags & NODE_OPERATOR)
2211         {
2212           result->flags |= NAMED_OP;
2213           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2214         }
2215       break;
2216
2217     case '\'':
2218     case '"':
2219       lex_string (pfile, result, buffer->cur - 1);
2220       break;
2221
2222     case '/':
2223       /* A potential block or line comment.  */
2224       comment_start = buffer->cur;
2225       c = *buffer->cur;
2226
2227       if (c == '*')
2228         {
2229           if (_cpp_skip_block_comment (pfile))
2230             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2231         }
2232       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2233                             || cpp_in_system_header (pfile)))
2234         {
2235           /* Warn about comments only if pedantically GNUC89, and not
2236              in system headers.  */
2237           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2238               && ! buffer->warned_cplusplus_comments)
2239             {
2240               cpp_error (pfile, CPP_DL_PEDWARN,
2241                          "C++ style comments are not allowed in ISO C90");
2242               cpp_error (pfile, CPP_DL_PEDWARN,
2243                          "(this will be reported only once per input file)");
2244               buffer->warned_cplusplus_comments = 1;
2245             }
2246
2247           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2248             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2249         }
2250       else if (c == '=')
2251         {
2252           buffer->cur++;
2253           result->type = CPP_DIV_EQ;
2254           break;
2255         }
2256       else
2257         {
2258           result->type = CPP_DIV;
2259           break;
2260         }
2261
           /* Only the two comment branches above reach this point.  When
              comments are not being saved, the comment counts as
              whitespace and lexing continues with the next token.  */
2262       if (!pfile->state.save_comments)
2263         {
2264           result->flags |= PREV_WHITE;
2265           goto update_tokens_line;
2266         }
2267
2268       /* Save the comment as a token in its own right.  */
2269       save_comment (pfile, result, comment_start, c);
2270       break;
2271
2272     case '<':
           /* NOTE(review): presumably lex_string leaves the type as
              CPP_LESS when no header name could be formed, so that the
              '<' is then lexed as an operator below -- confirm against
              lex_string.  */
2273       if (pfile->state.angled_headers)
2274         {
2275           lex_string (pfile, result, buffer->cur - 1);
2276           if (result->type != CPP_LESS)
2277             break;
2278         }
2279
2280       result->type = CPP_LESS;
2281       if (*buffer->cur == '=')
2282         buffer->cur++, result->type = CPP_LESS_EQ;
2283       else if (*buffer->cur == '<')
2284         {
2285           buffer->cur++;
2286           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2287         }
2288       else if (CPP_OPTION (pfile, digraphs))
2289         {
               /* Digraphs "<:" and "<%" stand for '[' and '{'.  */
2290           if (*buffer->cur == ':')
2291             {
2292               buffer->cur++;
2293               result->flags |= DIGRAPH;
2294               result->type = CPP_OPEN_SQUARE;
2295             }
2296           else if (*buffer->cur == '%')
2297             {
2298               buffer->cur++;
2299               result->flags |= DIGRAPH;
2300               result->type = CPP_OPEN_BRACE;
2301             }
2302         }
2303       break;
2304
2305     case '>':
2306       result->type = CPP_GREATER;
2307       if (*buffer->cur == '=')
2308         buffer->cur++, result->type = CPP_GREATER_EQ;
2309       else if (*buffer->cur == '>')
2310         {
2311           buffer->cur++;
2312           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2313         }
2314       break;
2315
2316     case '%':
2317       result->type = CPP_MOD;
2318       if (*buffer->cur == '=')
2319         buffer->cur++, result->type = CPP_MOD_EQ;
2320       else if (CPP_OPTION (pfile, digraphs))
2321         {
               /* Digraphs "%:" and "%:%:" stand for '#' and "##", and
                  "%>" stands for '}'.  */
2322           if (*buffer->cur == ':')
2323             {
2324               buffer->cur++;
2325               result->flags |= DIGRAPH;
2326               result->type = CPP_HASH;
2327               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2328                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2329             }
2330           else if (*buffer->cur == '>')
2331             {
2332               buffer->cur++;
2333               result->flags |= DIGRAPH;
2334               result->type = CPP_CLOSE_BRACE;
2335             }
2336         }
2337       break;
2338
2339     case '.':
2340       result->type = CPP_DOT;
           /* A '.' followed by a digit starts a number.  */
2341       if (ISDIGIT (*buffer->cur))
2342         {
2343           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2344           result->type = CPP_NUMBER;
2345           lex_number (pfile, &result->val.str, &nst);
2346           warn_about_normalization (pfile, result, &nst);
2347         }
2348       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2349         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2350       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2351         buffer->cur++, result->type = CPP_DOT_STAR;
2352       break;
2353
2354     case '+':
2355       result->type = CPP_PLUS;
2356       if (*buffer->cur == '+')
2357         buffer->cur++, result->type = CPP_PLUS_PLUS;
2358       else if (*buffer->cur == '=')
2359         buffer->cur++, result->type = CPP_PLUS_EQ;
2360       break;
2361
2362     case '-':
2363       result->type = CPP_MINUS;
2364       if (*buffer->cur == '>')
2365         {
2366           buffer->cur++;
2367           result->type = CPP_DEREF;
2368           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2369             buffer->cur++, result->type = CPP_DEREF_STAR;
2370         }
2371       else if (*buffer->cur == '-')
2372         buffer->cur++, result->type = CPP_MINUS_MINUS;
2373       else if (*buffer->cur == '=')
2374         buffer->cur++, result->type = CPP_MINUS_EQ;
2375       break;
2376
2377     case '&':
2378       result->type = CPP_AND;
2379       if (*buffer->cur == '&')
2380         buffer->cur++, result->type = CPP_AND_AND;
2381       else if (*buffer->cur == '=')
2382         buffer->cur++, result->type = CPP_AND_EQ;
2383       break;
2384
2385     case '|':
2386       result->type = CPP_OR;
2387       if (*buffer->cur == '|')
2388         buffer->cur++, result->type = CPP_OR_OR;
2389       else if (*buffer->cur == '=')
2390         buffer->cur++, result->type = CPP_OR_EQ;
2391       break;
2392
2393     case ':':
2394       result->type = CPP_COLON;
2395       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2396         buffer->cur++, result->type = CPP_SCOPE;
2397       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2398         {
2399           buffer->cur++;
2400           result->flags |= DIGRAPH;
2401           result->type = CPP_CLOSE_SQUARE;
2402         }
2403       break;
2404
2405     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2406     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2407     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2408     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2409     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2410
2411     case '?': result->type = CPP_QUERY; break;
2412     case '~': result->type = CPP_COMPL; break;
2413     case ',': result->type = CPP_COMMA; break;
2414     case '(': result->type = CPP_OPEN_PAREN; break;
2415     case ')': result->type = CPP_CLOSE_PAREN; break;
2416     case '[': result->type = CPP_OPEN_SQUARE; break;
2417     case ']': result->type = CPP_CLOSE_SQUARE; break;
2418     case '{': result->type = CPP_OPEN_BRACE; break;
2419     case '}': result->type = CPP_CLOSE_BRACE; break;
2420     case ';': result->type = CPP_SEMICOLON; break;
2421
2422       /* @ is a punctuator in Objective-C.  */
2423     case '@': result->type = CPP_ATSIGN; break;
2424
2425     case '$':
2426     case '\\':
2427       {
             /* Step back onto the character itself so that
                forms_identifier_p can examine it.  */
2428         const uchar *base = --buffer->cur;
2429         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2430
2431         if (forms_identifier_p (pfile, true, &nst))
2432           {
2433             result->type = CPP_NAME;
2434             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2435             warn_about_normalization (pfile, result, &nst);
2436             break;
2437           }
             /* Not an identifier: step forward again so that the default
                case sees the same position as for any other character.  */
2438         buffer->cur++;
2439       }
           /* Fall through.  */
2440
2441     default:
           /* Anything else becomes a one-character CPP_OTHER token.  */
2442       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2443       break;
2444     }
2445
2446   return result;
2447 }
2448
2449 /* An upper bound on the number of bytes needed to spell TOKEN.
2450    Does not include preceding whitespace.  */
2451 unsigned int
2452 cpp_token_len (const cpp_token *token)
2453 {
2454   unsigned int len;
2455
2456   switch (TOKEN_SPELL (token))
2457     {
2458     default:            len = 6;                                break;
2459     case SPELL_LITERAL: len = token->val.str.len;               break;
2460     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2461     }
2462
2463   return len;
2464 }
2465
/* Decode the UTF-8 sequence that starts at NAME and store the
   corresponding \U escape (a backslash, 'U' and eight lowercase hex
   digits: always 10 bytes, with no terminating NUL) in BUFFER.
   Returns the number of bytes of NAME that made up the sequence.
   Aborts on an ill-formed continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *p = name;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int k;

  /* The number of leading one bits in the first byte is the length of
     the whole sequence.  */
  for (lead = *p; lead & 0x80; lead <<= 1)
    seq_len++;

  /* The bits of the first byte below the length marker are payload.  */
  code_point = *p & (0x7F >> seq_len);

  /* Every further byte contributes its low six bits.  */
  for (k = 1; k < seq_len; k++)
    {
      p++;
      code_point = (code_point << 6) | (*p & 0x3F);

      /* Ill-formed UTF-8: continuation bytes look like 10xxxxxx.  */
      if ((*p & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
2499
2500 /* Given a token TYPE corresponding to a digraph, return a pointer to
2501    the spelling of the digraph.  */
2502 static const unsigned char *
2503 cpp_digraph2name (enum cpp_ttype type)
2504 {
2505   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2506 }
2507
2508 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2509    already contain the enough space to hold the token's spelling.
2510    Returns a pointer to the character after the last character written.
2511    FORSTRING is true if this is to be the spelling after translation
2512    phase 1 (this is different for UCNs).
2513    FIXME: Would be nice if we didn't need the PFILE argument.  */
2514 unsigned char *
2515 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2516                  unsigned char *buffer, bool forstring)
2517 {
2518   switch (TOKEN_SPELL (token))
2519     {
2520     case SPELL_OPERATOR:
2521       {
2522         const unsigned char *spelling;
2523         unsigned char c;
2524
2525         if (token->flags & DIGRAPH)
2526           spelling = cpp_digraph2name (token->type);
2527         else if (token->flags & NAMED_OP)
2528           goto spell_ident;
2529         else
2530           spelling = TOKEN_NAME (token);
2531
2532         while ((c = *spelling++) != '\0')
2533           *buffer++ = c;
2534       }
2535       break;
2536
2537     spell_ident:
2538     case SPELL_IDENT:
2539       if (forstring)
2540         {
2541           memcpy (buffer, NODE_NAME (token->val.node.node),
2542                   NODE_LEN (token->val.node.node));
2543           buffer += NODE_LEN (token->val.node.node);
2544         }
2545       else
2546         {
2547           size_t i;
2548           const unsigned char * name = NODE_NAME (token->val.node.node);
2549
2550           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2551             if (name[i] & ~0x7F)
2552               {
2553                 i += utf8_to_ucn (buffer, name + i) - 1;
2554                 buffer += 10;
2555               }
2556             else
2557               *buffer++ = NODE_NAME (token->val.node.node)[i];
2558         }
2559       break;
2560
2561     case SPELL_LITERAL:
2562       memcpy (buffer, token->val.str.text, token->val.str.len);
2563       buffer += token->val.str.len;
2564       break;
2565
2566     case SPELL_NONE:
2567       cpp_error (pfile, CPP_DL_ICE,
2568                  "unspellable token %s", TOKEN_NAME (token));
2569       break;
2570     }
2571
2572   return buffer;
2573 }
2574
2575 /* Returns TOKEN spelt as a null-terminated string.  The string is
2576    freed when the reader is destroyed.  Useful for diagnostics.  */
2577 unsigned char *
2578 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2579 {
2580   unsigned int len = cpp_token_len (token) + 1;
2581   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2582
2583   end = cpp_spell_token (pfile, token, start, false);
2584   end[0] = '\0';
2585
2586   return start;
2587 }
2588
2589 /* Returns a pointer to a string which spells the token defined by
2590    TYPE and FLAGS.  Used by C front ends, which really should move to
2591    using cpp_token_as_text.  */
2592 const char *
2593 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2594 {
2595   if (flags & DIGRAPH)
2596     return (const char *) cpp_digraph2name (type);
2597   else if (flags & NAMED_OP)
2598     return cpp_named_operator2name (type);
2599
2600   return (const char *) token_spellings[type].name;
2601 }
2602
2603 /* Writes the spelling of token to FP, without any preceding space.
2604    Separated from cpp_spell_token for efficiency - to avoid stdio
2605    double-buffering.  */
2606 void
2607 cpp_output_token (const cpp_token *token, FILE *fp)
2608 {
2609   switch (TOKEN_SPELL (token))
2610     {
2611     case SPELL_OPERATOR:
2612       {
2613         const unsigned char *spelling;
2614         int c;
2615
2616         if (token->flags & DIGRAPH)
2617           spelling = cpp_digraph2name (token->type);
2618         else if (token->flags & NAMED_OP)
2619           goto spell_ident;
2620         else
2621           spelling = TOKEN_NAME (token);
2622
2623         c = *spelling;
2624         do
2625           putc (c, fp);
2626         while ((c = *++spelling) != '\0');
2627       }
2628       break;
2629
2630     spell_ident:
2631     case SPELL_IDENT:
2632       {
2633         size_t i;
2634         const unsigned char * name = NODE_NAME (token->val.node.node);
2635
2636         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2637           if (name[i] & ~0x7F)
2638             {
2639               unsigned char buffer[10];
2640               i += utf8_to_ucn (buffer, name + i) - 1;
2641               fwrite (buffer, 1, 10, fp);
2642             }
2643           else
2644             fputc (NODE_NAME (token->val.node.node)[i], fp);
2645       }
2646       break;
2647
2648     case SPELL_LITERAL:
2649       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2650       break;
2651
2652     case SPELL_NONE:
2653       /* An error, most probably.  */
2654       break;
2655     }
2656 }
2657
2658 /* Compare two tokens.  */
2659 int
2660 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2661 {
2662   if (a->type == b->type && a->flags == b->flags)
2663     switch (TOKEN_SPELL (a))
2664       {
2665       default:                  /* Keep compiler happy.  */
2666       case SPELL_OPERATOR:
2667         /* token_no is used to track where multiple consecutive ##
2668            tokens were originally located.  */
2669         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2670       case SPELL_NONE:
2671         return (a->type != CPP_MACRO_ARG
2672                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2673       case SPELL_IDENT:
2674         return a->val.node.node == b->val.node.node;
2675       case SPELL_LITERAL:
2676         return (a->val.str.len == b->val.str.len
2677                 && !memcmp (a->val.str.text, b->val.str.text,
2678                             a->val.str.len));
2679       }
2680
2681   return 0;
2682 }
2683
2684 /* Returns nonzero if a space should be inserted to avoid an
2685    accidental token paste for output.  For simplicity, it is
2686    conservative, and occasionally advises a space where one is not
2687    needed, e.g. "." and ".2".  */
2688 int
2689 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2690                  const cpp_token *token2)
2691 {
2692   enum cpp_ttype a = token1->type, b = token2->type;
2693   cppchar_t c;
2694
2695   if (token1->flags & NAMED_OP)
2696     a = CPP_NAME;
2697   if (token2->flags & NAMED_OP)
2698     b = CPP_NAME;
2699
2700   c = EOF;
2701   if (token2->flags & DIGRAPH)
2702     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2703   else if (token_spellings[b].category == SPELL_OPERATOR)
2704     c = token_spellings[b].name[0];
2705
2706   /* Quickly get everything that can paste with an '='.  */
2707   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2708     return 1;
2709
2710   switch (a)
2711     {
2712     case CPP_GREATER:   return c == '>';
2713     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2714     case CPP_PLUS:      return c == '+';
2715     case CPP_MINUS:     return c == '-' || c == '>';
2716     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2717     case CPP_MOD:       return c == ':' || c == '>';
2718     case CPP_AND:       return c == '&';
2719     case CPP_OR:        return c == '|';
2720     case CPP_COLON:     return c == ':' || c == '>';
2721     case CPP_DEREF:     return c == '*';
2722     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2723     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2724     case CPP_NAME:      return ((b == CPP_NUMBER
2725                                  && name_p (pfile, &token2->val.str))
2726                                 || b == CPP_NAME
2727                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2728     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2729                                 || c == '.' || c == '+' || c == '-');
2730                                       /* UCNs */
2731     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2732                                  && b == CPP_NAME)
2733                                 || (CPP_OPTION (pfile, objc)
2734                                     && token1->val.str.text[0] == '@'
2735                                     && (b == CPP_NAME || b == CPP_STRING)));
2736     default:            break;
2737     }
2738
2739   return 0;
2740 }
2741
2742 /* Output all the remaining tokens on the current line, and a newline
2743    character, to FP.  Leading whitespace is removed.  If there are
2744    macros, special token padding is not performed.  */
2745 void
2746 cpp_output_line (cpp_reader *pfile, FILE *fp)
2747 {
2748   const cpp_token *token;
2749
2750   token = cpp_get_token (pfile);
2751   while (token->type != CPP_EOF)
2752     {
2753       cpp_output_token (token, fp);
2754       token = cpp_get_token (pfile);
2755       if (token->flags & PREV_WHITE)
2756         putc (' ', fp);
2757     }
2758
2759   putc ('\n', fp);
2760 }
2761
2762 /* Return a string representation of all the remaining tokens on the
2763    current line.  The result is allocated using xmalloc and must be
2764    freed by the caller.  */
2765 unsigned char *
2766 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2767 {
2768   const cpp_token *token;
2769   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2770   unsigned int alloced = 120 + out;
2771   unsigned char *result = (unsigned char *) xmalloc (alloced);
2772
2773   /* If DIR_NAME is empty, there are no initial contents.  */
2774   if (dir_name)
2775     {
2776       sprintf ((char *) result, "#%s ", dir_name);
2777       out += 2;
2778     }
2779
2780   token = cpp_get_token (pfile);
2781   while (token->type != CPP_EOF)
2782     {
2783       unsigned char *last;
2784       /* Include room for a possible space and the terminating nul.  */
2785       unsigned int len = cpp_token_len (token) + 2;
2786
2787       if (out + len > alloced)
2788         {
2789           alloced *= 2;
2790           if (out + len > alloced)
2791             alloced = out + len;
2792           result = (unsigned char *) xrealloc (result, alloced);
2793         }
2794
2795       last = cpp_spell_token (pfile, token, &result[out], 0);
2796       out = last - result;
2797
2798       token = cpp_get_token (pfile);
2799       if (token->flags & PREV_WHITE)
2800         result[out++] = ' ';
2801     }
2802
2803   result[out] = '\0';
2804   return result;
2805 }
2806
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the number of bytes between
   BUFF->cur and BUFF->limit.  Every macro argument is parenthesized
   in the expansion so that any expression may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2821
2822 /* Create a new allocation buffer.  Place the control block at the end
2823    of the buffer, so that buffer overflows will cause immediate chaos.  */
2824 static _cpp_buff *
2825 new_buff (size_t len)
2826 {
2827   _cpp_buff *result;
2828   unsigned char *base;
2829
2830   if (len < MIN_BUFF_SIZE)
2831     len = MIN_BUFF_SIZE;
2832   len = CPP_ALIGN (len);
2833
2834   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2835   result = (_cpp_buff *) (base + len);
2836   result->base = base;
2837   result->cur = base;
2838   result->limit = base + len;
2839   result->next = NULL;
2840   return result;
2841 }
2842
2843 /* Place a chain of unwanted allocation buffers on the free list.  */
2844 void
2845 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2846 {
2847   _cpp_buff *end = buff;
2848
2849   while (end->next)
2850     end = end->next;
2851   end->next = pfile->free_buffs;
2852   pfile->free_buffs = buff;
2853 }
2854
2855 /* Return a free buffer of size at least MIN_SIZE.  */
2856 _cpp_buff *
2857 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2858 {
2859   _cpp_buff *result, **p;
2860
2861   for (p = &pfile->free_buffs;; p = &(*p)->next)
2862     {
2863       size_t size;
2864
2865       if (*p == NULL)
2866         return new_buff (min_size);
2867       result = *p;
2868       size = result->limit - result->base;
2869       /* Return a buffer that's big enough, but don't waste one that's
2870          way too big.  */
2871       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2872         break;
2873     }
2874
2875   *p = result->next;
2876   result->next = NULL;
2877   result->cur = result->base;
2878   return result;
2879 }
2880
2881 /* Creates a new buffer with enough space to hold the uncommitted
2882    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2883    the excess bytes to the new buffer.  Chains the new buffer after
2884    BUFF, and returns the new buffer.  */
2885 _cpp_buff *
2886 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2887 {
2888   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2889   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2890
2891   buff->next = new_buff;
2892   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2893   return new_buff;
2894 }
2895
2896 /* Creates a new buffer with enough space to hold the uncommitted
2897    remaining bytes of the buffer pointed to by BUFF, and at least
2898    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2899    Chains the new buffer before the buffer pointed to by BUFF, and
2900    updates the pointer to point to the new buffer.  */
2901 void
2902 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2903 {
2904   _cpp_buff *new_buff, *old_buff = *pbuff;
2905   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2906
2907   new_buff = _cpp_get_buff (pfile, size);
2908   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2909   new_buff->next = old_buff;
2910   *pbuff = new_buff;
2911 }
2912
2913 /* Free a chain of buffers starting at BUFF.  */
2914 void
2915 _cpp_free_buff (_cpp_buff *buff)
2916 {
2917   _cpp_buff *next;
2918
2919   for (; buff; buff = next)
2920     {
2921       next = buff->next;
2922       free (buff->base);
2923     }
2924 }
2925
2926 /* Allocate permanent, unaligned storage of length LEN.  */
2927 unsigned char *
2928 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2929 {
2930   _cpp_buff *buff = pfile->u_buff;
2931   unsigned char *result = buff->cur;
2932
2933   if (len > (size_t) (buff->limit - result))
2934     {
2935       buff = _cpp_get_buff (pfile, len);
2936       buff->next = pfile->u_buff;
2937       pfile->u_buff = buff;
2938       result = buff->cur;
2939     }
2940
2941   buff->cur = result + len;
2942   return result;
2943 }
2944
2945 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2946    That buffer is used for growing allocations when saving macro
2947    replacement lists in a #define, and when parsing an answer to an
2948    assertion in #assert, #unassert or #if (and therefore possibly
2949    whilst expanding macros).  It therefore must not be used by any
2950    code that they might call: specifically the lexer and the guts of
2951    the macro expander.
2952
2953    All existing other uses clearly fit this restriction: storing
2954    registered pragmas during initialization.  */
2955 unsigned char *
2956 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2957 {
2958   _cpp_buff *buff = pfile->a_buff;
2959   unsigned char *result = buff->cur;
2960
2961   if (len > (size_t) (buff->limit - result))
2962     {
2963       buff = _cpp_get_buff (pfile, len);
2964       buff->next = pfile->a_buff;
2965       pfile->a_buff = buff;
2966       result = buff->cur;
2967     }
2968
2969   buff->cur = result + len;
2970   return result;
2971 }
2972
2973 /* Say which field of TOK is in use.  */
2974
2975 enum cpp_token_fld_kind
2976 cpp_token_val_index (cpp_token *tok)
2977 {
2978   switch (TOKEN_SPELL (tok))
2979     {
2980     case SPELL_IDENT:
2981       return CPP_TOKEN_FLD_NODE;
2982     case SPELL_LITERAL:
2983       return CPP_TOKEN_FLD_STR;
2984     case SPELL_OPERATOR:
2985       if (tok->type == CPP_PASTE)
2986         return CPP_TOKEN_FLD_TOKEN_NO;
2987       else
2988         return CPP_TOKEN_FLD_NONE;
2989     case SPELL_NONE:
2990       if (tok->type == CPP_MACRO_ARG)
2991         return CPP_TOKEN_FLD_ARG_NO;
2992       else if (tok->type == CPP_PADDING)
2993         return CPP_TOKEN_FLD_SOURCE;
2994       else if (tok->type == CPP_PRAGMA)
2995         return CPP_TOKEN_FLD_PRAGMA;
2996       /* else fall through */
2997     default:
2998       return CPP_TOKEN_FLD_NONE;
2999     }
3000 }
3001
3002 /* All tokens lexed in R after calling this function will be forced to have
3003    their source_location the same as the location referenced by P, until
3004    cpp_stop_forcing_token_locations is called for R.  */
3005
3006 void
3007 cpp_force_token_locations (cpp_reader *r, source_location *p)
3008 {
3009   r->forced_token_location_p = p;
3010 }
3011
3012 /* Go back to assigning locations naturally for lexed tokens.  */
3013
3014 void
3015 cpp_stop_forcing_token_locations (cpp_reader *r)
3016 {
3017   r->forced_token_location_p = NULL;
3018 }