libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007-2013
   3    Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
/* Spelling categories.  Every entry of the token_spellings table
   carries one of these: entries generated by OP() are always
   SPELL_OPERATOR, while entries generated by TK() name their category
   explicitly (SPELL_<s>).  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
  35
/* One row of the token_spellings table, indexed by token type.  */
struct token_spelling
{
  /* Spelling category of this token type.  */
  enum spell_type category;
  /* For OP() rows, the operator's spelling string; for TK() rows, the
     stringified enumerator name.  */
  const unsigned char *name;
};
  41
/* Spellings of the digraph tokens.
   NOTE(review): how this array is indexed is not visible in this part
   of the file -- confirm against its users before reordering.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Expand TTYPE_TABLE into one token_spelling initializer per token
   type.  An OP(e, s) row gets the operator category and uses S as its
   spelling; a TK(e, s) row gets category SPELL_<s> and uses the
   stringified enumerator E as its name.  The helper macros are only
   needed for this one expansion and are undefined straight after.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category, respectively the name, recorded in
   token_spellings for TOKEN's type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
/* Forward declarations of the file-local helpers.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
/* Configure gives us an ifdef test.  Turn it into a value so that it
   can be used in ordinary C conditions below.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array
   length below evaluates to 1 when the size is acceptable and to -1,
   which is not a valid array length, otherwise.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 271    Before Solaris 9 Update 6, SSE insns cannot be executed.
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?', each repeated
   16 times; the MMX scanner reads only the first 8 bytes of a row.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 292
/* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   not consulted; the loop ends only when such a byte is found.  */

static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     8-byte block, i.e. those at or after S.  The Idea here is that the
     AND with the mask within the loop is "free", since we need some AND
     or TEST insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  The first block is already
     loaded, so enter the loop below the load.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of a later block is valid.  */
      mask = -1;

    start:
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 358
/* A version of the fast scanner using SSE2 vectorized byte compare insns.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   not consulted; the loop ends only when such a byte is found.  */

static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block, i.e. those at or after S.  The Idea here is that the
     AND with the mask within the loop is "free", since we need some AND
     or TEST insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first block is
     already loaded, so enter the loop below the load.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of a later block is valid.  */
      mask = -1;

    start:
      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 411
 412 #ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   used only to decide whether an unaligned 16-byte read at S is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The four characters to look for; the remaining lanes are zero and
     are excluded by the explicit length of 4 passed below.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is the position of a match within SV.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 16) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.

     S is pre-decremented by 16 so that the ADD at the top of the loop
     lands on the first block.  EAX carries the length of SEARCH (4) and
     EDX the length of each data block (16).  JNC branches back while
     the carry flag is clear -- per the PCMPESTRI definition, while no
     byte of the block matched -- and the match index is left in ECX,
     which is bound to INDEX.  */
  __asm (      "sub $16, %1\n"
        "       .balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri $0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));

 found:
  return s + index;
}
 469
 470 #else
 471 /* Work around out-dated assemblers without sse4 support.  */
 472 #define search_line_sse42 search_line_sse2
 473 #endif
 474
 475 /* Check the CPU capabilities.  */
 476
 477 #include "../gcc/config/i386/cpuid.h"
 478
 479 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 480 static search_line_fast_type search_line_fast;
 481
 482 #define HAVE_init_vectorized_lexer 1
 483 static inline void
 484 init_vectorized_lexer (void)
 485 {
 486   unsigned dummy, ecx = 0, edx = 0;
 487   search_line_fast_type impl = search_line_acc_char;
 488   int minimum = 0;
 489
 490 #if defined(__SSE4_2__)
 491   minimum = 3;
 492 #elif defined(__SSE2__)
 493   minimum = 2;
 494 #elif defined(__SSE__)
 495   minimum = 1;
 496 #endif
 497
 498   if (minimum == 3)
 499     impl = search_line_sse42;
 500   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 501     {
 502       if (minimum == 3 || (ecx & bit_SSE4_2))
 503         impl = search_line_sse42;
 504       else if (minimum == 2 || (edx & bit_SSE2))
 505         impl = search_line_sse2;
 506       else if (minimum == 1 || (edx & bit_SSE))
 507         impl = search_line_mmx;
 508     }
 509   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 510     {
 511       if (minimum == 1
 512           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 513         impl = search_line_mmx;
 514     }
 515
 516   search_line_fast = impl;
 517 }
 518
 519 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 520
/* A version of the fast scanner using AltiVec vectorized byte compares.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   not consulted; the loop ends only when such a byte is found.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block is
     already loaded, so enter the loop below the load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that gets skipped.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU: the remaining two words are handled exactly like
           the N == 2 case.  */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
 634
 635 #elif defined (__ARM_NEON__)
 636 #include "arm_neon.h"
 637
/* A version of the fast scanner using ARM NEON vectorized byte compares.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   not consulted; the loop ends only when such a byte is found.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives each byte within an 8-byte half its own bit (0x01 for the
     first byte up to 0x80 for the eighth), so that the matches can be
     summed into a bit mask below.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block is
     already loaded, so enter the loop below the load.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      /* Every byte of a later block is valid.  */
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      /* Keep one distinct bit per matching byte, then add neighbours
         pairwise three times so that each 8-byte half collapses into a
         single 8-bit mask.  */
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Merge the two half masks: the shift by 24 moves the mask of the
         high half next to that of the low half in lane 0.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
 697
 698 #else
 699
/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */
 702
 703 #define search_line_fast  search_line_acc_char
 704
 705 #endif
 706
 707 /* Initialize the lexer if needed.  */
 708
 709 void
 710 _cpp_init_lexer (void)
 711 {
 712 #ifdef HAVE_init_vectorized_lexer
 713   init_vectorized_lexer ();
 714 #endif
 715 }
 716
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at the buffer's previous next_line.  On return
   buffer->cur and buffer->line_base point at that start, a '\n' has
   been stored at the end of the cleaned line, buffer->next_line points
   just past the raw line ending that was consumed, and a sentinel note
   follows whatever notes were recorded for the line.  Escaped newlines
   and trigraphs are recorded with add_line_note.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  /* S reads the raw text; D is the write position of the cleaned text
     (it trails S once anything has been removed); P is a scratch
     pointer for scanning backwards.  */
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Position of the most recent backslash seen on the fast path,
         if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line the newline cannot be
         escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline: skip back over whitespace before
         the line ending and see whether the last backslash sits right
         in front of it.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type is ' ' when whitespace separated
         the backslash from the newline and '\\' otherwise.  D is put
         just before the backslash so that the next store overwrites
         it.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      /* From here on next_line doubles as the lower bound for the
         backward whitespace scan in the slow path.  */
      buffer->next_line = p - 1;

    slow_path:
      /* Slow path: copy byte by byte from S to D, dropping escaped
         newlines and converting trigraphs as we go.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* Buffer flagged from_stage3: only the end of the line is
         located; escaped newlines and trigraphs are not looked for.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line; this also turns a bare '\r' line
     ending into '\n'.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
 860
 861 /* Return true if the trigraph indicated by NOTE should be warned
 862    about in a comment.  */
 863 static bool
 864 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 865 {
 866   const uchar *p;
 867
 868   /* Within comments we don't warn about trigraphs, unless the
 869      trigraph forms an escaped newline, as that may change
 870      behavior.  */
 871   if (note->type != '/')
 872     return false;
 873
 874   /* If -trigraphs, then this was an escaped newline iff the next note
 875      is coincident.  */
 876   if (CPP_OPTION (pfile, trigraphs))
 877     return note[1].pos == note->pos;
 878
 879   /* Otherwise, see if this forms an escaped newline.  */
 880   p = note->pos + 3;
 881   while (is_nvspace (*p))
 882     p++;
 883
 884   /* There might have been escaped newlines between the trigraph and the
 885      newline we found.  Hence the position test.  */
 886   return (*p == '\n' && p < note[1].pos);
 887 }
 888
 889 /* Work through the line notes queued by add_line_note, stopping at the
 890    first one that lies beyond buffer->cur.  IN_COMMENT is nonzero when
 891    the caller is lexing a comment, which silences most warnings.  */
 892 void
 893 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 894 {
 895   cpp_buffer *buffer = pfile->buffer;
 896
 897   for (;;)
 898     {
 899       _cpp_line_note *note = buffer->notes + buffer->cur_note;
 900       unsigned int col;
 901
 902       if (buffer->cur < note->pos)
 903         break;
 904
 905       buffer->cur_note++;
 906       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 907       if (note->type != '\\' && note->type != ' ')
 908         {
 909           /* Not a splice: a trigraph, or a type-0 (handled) note.  */
 910           if (!_cpp_trigraph_map[note->type])
 911             {
 912               if (note->type != 0)
 913                 abort ();
 914               continue;
 915             }
 916           if (!CPP_OPTION (pfile, warn_trigraphs)
 917               || (in_comment && !warn_in_comment (pfile, note)))
 918             continue;
 919
 920           if (!CPP_OPTION (pfile, trigraphs))
 921             cpp_warning_with_line
 922               (pfile, CPP_W_TRIGRAPHS, pfile->line_table->highest_line, col,
 923                "trigraph ??%c ignored, use -trigraphs to enable", note->type);
 924           else
 925             cpp_warning_with_line
 926               (pfile, CPP_W_TRIGRAPHS, pfile->line_table->highest_line, col,
 927                "trigraph ??%c converted to %c",
 928                note->type, (int) _cpp_trigraph_map[note->type]);
 929           continue;
 930         }
 931
 932       /* A backslash-newline; type ' ' means spaces came in between.  */
 933       if (!in_comment && note->type == ' ')
 934         cpp_error_with_line (pfile, CPP_DL_WARNING,
 935                              pfile->line_table->highest_line, col,
 936                              "backslash and newline separated by space");
 937
 938       if (buffer->rlimit < buffer->next_line)
 939         {
 940           cpp_error_with_line (pfile, CPP_DL_PEDWARN,
 941                                pfile->line_table->highest_line, col,
 942                                "backslash-newline at end of file");
 943           /* Prevent "no newline at end of file" warning.  */
 944           buffer->next_line = buffer->rlimit;
 945         }
 946
 947       buffer->line_base = note->pos;
 948       CPP_INCREMENT_LINE (pfile, 0);
 949     }
 950 }
 951
 952 /* Skip over a C-style block comment.  On entry buffer->cur addresses
 953    the '*' that opens the comment.  The end is found by looking at
 954    each '/' and testing whether a '*' immediately precedes it.
 955
 956    Returns true when the buffer ran out before the comment was closed,
 957    false when the closing delimiter was found.  */
 958 bool
 959 _cpp_skip_block_comment (cpp_reader *pfile)
 960 {
 961   cpp_buffer *buffer = pfile->buffer;
 962   const uchar *p = buffer->cur + 1;
 963
 964   /* A '/' right after the opening star must not be paired with that
 965      star by the look-behind test below, so step over it here.  */
 966   if (*p == '/')
 967     p++;
 968
 969   for (;;)
 970     {
 971       uchar c = *p++;
 972
 973       if (c == '\n')
 974         {
 975           unsigned int cols;
 976
 977           buffer->cur = p - 1;
 978           _cpp_process_line_notes (pfile, true);
 979           if (buffer->rlimit <= buffer->next_line)
 980             return true;
 981           _cpp_clean_line (pfile);
 982
 983           cols = buffer->next_line - buffer->line_base;
 984           CPP_INCREMENT_LINE (pfile, cols);
 985           p = buffer->cur;
 986           continue;
 987         }
 988       /* Comments are often decorated with runs of '*', so it is cheaper
 989          to react to '/' and then look back for the star.  */
 990       if (c != '/')
 991         continue;
 992       if (p[-2] == '*')
 993         break;
 994
 995       /* A slash-star inside the comment may be a nesting attempt; warn
 996          unless it sits right before the real terminator.  No effort is
 997          made to get this right across escaped newlines.  */
 998       if (CPP_OPTION (pfile, warn_comments) && p[0] == '*' && p[1] != '/')
 999         {
1000           buffer->cur = p;
1001           cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1002                                  pfile->line_table->highest_line,
1003                                  CPP_BUF_COL (buffer),
1004                                  "\"/*\" within comment");
1005         }
1006     }
1007
1008   buffer->cur = p;
1009   _cpp_process_line_notes (pfile, true);
1010   return false;
1011 }
1012
1013 /* Advance buffer->cur to the newline that ends a C++ line comment and
1014    process the line notes passed on the way, which is how escaped
1015    newlines are handled.  Returns nonzero if the comment was multiline.  */
1016 static int
1017 skip_line_comment (cpp_reader *pfile)
1018 {
1019   const source_location start_line = pfile->line_table->highest_line;
1020   const uchar *p = pfile->buffer->cur;
1021
1022   while (*p != '\n')
1023     p++;
1024   pfile->buffer->cur = p;
1025   _cpp_process_line_notes (pfile, true);
1026   return pfile->line_table->highest_line != start_line;
1027 }
1028
1029 /* Consume a run of non-vertical whitespace.  C is its first character,
1030    which the caller has already read; on return buffer->cur addresses
1031    the first character that does not belong to the run.  */
1032 static void
1033 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1034 {
1035   cpp_buffer *buffer = pfile->buffer;
1036   bool nul_seen = false;
1037
1038   for (;;)
1039     {
1040       if (c == '\0')
1041         nul_seen = true;
1042       else if (c != ' ' && c != '\t'
1043                && pfile->state.in_directive && CPP_PEDANTIC (pfile))
1044         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1045                              pfile->line_table->highest_line,
1046                              CPP_BUF_COL (buffer),
1047                              "%s in preprocessing directive",
1048                              c == '\f' ? "form feed" : "vertical tab");
1049
1050       /* Continue only over non-vertical space: ' ' \t \f \v \0.  */
1051       c = *buffer->cur++;
1052       if (!is_nvspace (c))
1053         break;
1054     }
1055
1056   if (nul_seen)
1057     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1058   buffer->cur--;
1059 }
1060
1061 /* Return 1 if every character of the number token STRING could also
1062    appear in a name (so no '.', '+' or '-'), otherwise return 0.  */
1063 static int
1064 name_p (cpp_reader *pfile, const cpp_string *string)
1065 {
1066   unsigned int idx = 0;
1067
1068   /* Stop at the first character that cannot be part of a name.  */
1069   while (idx < string->len && is_idchar (string->text[idx]))
1070     idx++;
1071
1072   return idx == string->len;
1073 }
1074
1075 /* After an identifier or similar sequence has been lexed into TOKEN,
1076    warn if the level recorded in S exceeds the warn_normalize option.  */
1077 static void
1078 warn_about_normalization (cpp_reader *pfile,
1079                           const cpp_token *token,
1080                           const struct normalize_state *s)
1081 {
1082   unsigned char *spelling;
1083   size_t len;
1084
1085   if (pfile->state.skipping
1086       || CPP_OPTION (pfile, warn_normalize) >= NORMALIZE_STATE_RESULT (s))
1087     return;
1088
1089   /* Spell the token with UCNs even if UTF-8 would normally be used.  */
1090   spelling = XNEWVEC (unsigned char, cpp_token_len (token));
1091   len = cpp_spell_token (pfile, token, spelling, false) - spelling;
1092   if (NORMALIZE_STATE_RESULT (s) != normalized_C)
1093     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1094                            "`%.*s' is not in NFC", (int) len, spelling);
1095   else
1096     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1097                            "`%.*s' is not in NFKC", (int) len, spelling);
1098   free (spelling);
1099 }
1100
1101 /* Return true if the text at buffer->cur can be part of an identifier
1102    although it is not a plain identifier character: either a '$'
1103    (when dollars_in_ident is set) or a valid UCN (when
1104    extended_identifiers is set).  On success buffer->cur is advanced
1105    past it.  FIRST is nonzero if it would start the identifier.  */
1106 static bool
1107 forms_identifier_p (cpp_reader *pfile, int first,
1108                     struct normalize_state *state)
1109 {
1110   cpp_buffer *buffer = pfile->buffer;
1111   const uchar lead = buffer->cur[0];
1112
1113   if (lead == '$' && CPP_OPTION (pfile, dollars_in_ident))
1114     {
1115       buffer->cur++;
1116       /* Complain about '$' at most once: the option is cleared after
1117          the first diagnostic.  */
1118       if (!pfile->state.skipping && CPP_OPTION (pfile, warn_dollars))
1119         {
1120           CPP_OPTION (pfile, warn_dollars) = 0;
1121           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1122         }
1123       return true;
1124     }
1125
1126   /* Otherwise only a backslash-u or backslash-U escape qualifies.  */
1127   if (lead != '\\'
1128       || !CPP_OPTION (pfile, extended_identifiers)
1129       || (buffer->cur[1] != 'u' && buffer->cur[1] != 'U'))
1130     return false;
1131
1132   buffer->cur += 2;
1133   if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, state))
1134     return true;
1135   buffer->cur -= 2;
1136   return false;
1137 }
1138
1139 /* Helper for _cpp_lex_identifier: hash the identifier that starts at
1140    BASE, look it up in PFILE's hash table and return its node, issuing
1141    any diagnostics the node is flagged for.  BASE[0] is taken on trust;
1142    the identifier extends over the ISIDNUM characters that follow.  */
1143 static cpp_hashnode *
1144 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1145 {
1146   cpp_hashnode *node;
1147   const uchar *end;
1148   unsigned int len;
1149   unsigned int hash = HT_HASHSTEP (0, *base);
1150
1151   for (end = base + 1; ISIDNUM (*end); end++)
1152     hash = HT_HASHSTEP (hash, *end);
1153
1154   len = end - base;
1155   hash = HT_HASHFINISH (hash, len);
1156   node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1157                                             base, len, hash, HT_ALLOC));
1158
1159   /* Rarely, identifiers require diagnostics when lexed.  */
1160   if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
1161                         || pfile->state.skipping, 1))
1162     return node;
1163
1164   /* It is allowed to poison the same identifier twice.  */
1165   if (!pfile->state.poisoned_ok && (node->flags & NODE_POISONED))
1166     cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1167                NODE_NAME (node));
1168
1169   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1170      replacement list of a variadic macro.  */
1171   if (!pfile->state.va_args_ok
1172       && node == pfile->spec_nodes.n__VA_ARGS__)
1173     cpp_error (pfile, CPP_DL_PEDWARN,
1174                "__VA_ARGS__ can only appear in the expansion"
1175                " of a C99 variadic macro");
1176
1177   /* For -Wc++-compat, warn about use of C++ named operators.  */
1178   if (node->flags & NODE_WARN_OPERATOR)
1179     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1180                  "identifier \"%s\" is a special operator name in C++",
1181                  NODE_NAME (node));
1182
1183   return node;
1184 }
1185
1186 /* Return the cpp_hashnode for the identifier spelled by NAME in the
1187    reader PFILE.  NAME is only read, so the cast keeps it const.
1188    NOTE(review): the lookup is done with HT_ALLOC, so a NULL result
1189    for an unknown name looks unlikely -- confirm against ht_lookup.  */
1190 cpp_hashnode *
1191 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1192 {
1193   return lex_identifier_intern (pfile, (const uchar *) name);
1194 }
1195
1196 /* Lex the identifier whose first character is at BASE, which is
1197    BUFFER->CUR - 1 on entry, and return its hash node.  STARTS_UCN
1198    forces the slow path; NST collects normalization state there.  */
1199 static cpp_hashnode *
1200 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1201                 struct normalize_state *nst)
1202 {
1203   cpp_buffer *buffer = pfile->buffer;
1204   cpp_hashnode *node;
1205   const uchar *end = buffer->cur;
1206   unsigned int hash = HT_HASHSTEP (0, *base);
1207   bool slow = starts_ucn;
1208
1209   if (!slow)
1210     {
1211       for (; ISIDNUM (*end); end++)
1212         hash = HT_HASHSTEP (hash, *end);
1213       buffer->cur = end;
1214       slow = forms_identifier_p (pfile, false, nst);
1215     }
1216   if (!slow)
1217     {
1218       unsigned int len = end - base;
1219       hash = HT_HASHFINISH (hash, len);
1220       node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1221                                                 base, len, hash, HT_ALLOC));
1222     }
1223   else
1224     {
1225       /* Slower version for identifiers containing UCNs (or $).  */
1226       do
1227         {
1228           while (ISIDNUM (*buffer->cur))
1229             {
1230               buffer->cur++;
1231               NORMALIZE_STATE_UPDATE_IDNUM (nst);
1232             }
1233         }
1234       while (forms_identifier_p (pfile, false, nst));
1235       node = _cpp_interpret_identifier (pfile, base, buffer->cur - base);
1236     }
1237
1238   /* Rarely, identifiers require diagnostics when lexed.  */
1239   if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
1240                         || pfile->state.skipping, 1))
1241     return node;
1242
1243   /* It is allowed to poison the same identifier twice.  */
1244   if (!pfile->state.poisoned_ok && (node->flags & NODE_POISONED))
1245     cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1246                NODE_NAME (node));
1247
1248   /* Constraint 6.10.3.5: only a variadic macro's expansion may use it.  */
1249   if (!pfile->state.va_args_ok && node == pfile->spec_nodes.n__VA_ARGS__)
1250     cpp_error (pfile, CPP_DL_PEDWARN,
1251                "__VA_ARGS__ can only appear in the expansion"
1252                " of a C99 variadic macro");
1253
1254   /* For -Wc++-compat, warn about use of C++ named operators.  */
1255   if (node->flags & NODE_WARN_OPERATOR)
1256     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1257                  "identifier \"%s\" is a special operator name in C++",
1258                  NODE_NAME (node));
1259
1260   return node;
1261 }
1262
1263 /* Lex a preprocessing number into NUMBER.  Its first character has
1264    already been consumed, so the spelling starts at BUFFER->CUR - 1.
1265    NST receives a normalization update for each character taken.  */
1266 static void
1267 lex_number (cpp_reader *pfile, cpp_string *number,
1268             struct normalize_state *nst)
1269 {
1270   cpp_buffer *buffer = pfile->buffer;
1271   const uchar *const start = buffer->cur - 1;
1272   const uchar *p;
1273   uchar *copy;
1274
1275   do
1276     {
1277       /* N.B. ISIDNUM does not include $.  */
1278       p = buffer->cur;
1279       while (ISIDNUM (*p) || *p == '.' || VALID_SIGN (*p, p[-1]))
1280         {
1281           p++;
1282           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1283         }
1284       buffer->cur = p;
1285     }
1286   while (forms_identifier_p (pfile, false, nst));
1287
1288   number->len = p - start;
1289   copy = _cpp_unaligned_alloc (pfile, number->len + 1);
1290   memcpy (copy, start, number->len);
1291   copy[number->len] = '\0';
1292   number->text = copy;
1293 }
1294
1295 /* Give TOKEN the type TYPE and a NUL-terminated copy of BASE[0..LEN).  */
1296 static void
1297 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1298                 unsigned int len, enum cpp_ttype type)
1299 {
1300   uchar *const copy = _cpp_unaligned_alloc (pfile, len + 1);
1301
1302   memcpy (copy, base, len);
1303   copy[len] = '\0';
1304   token->type = type;
1305   token->val.str.text = copy;
1306   token->val.str.len = len;
1307 }
1308
1309 /* Subroutine of lex_raw_string: append the LEN bytes at BASE to the
1310    buffer chain *FIRST_BUFF_P .. *LAST_BUFF_P, creating the chain if it
1311    is empty.  Both pointers are updated to describe the result.  */
1312 static void
1313 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1314                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1315 {
1316   _cpp_buff *head = *first_buff_p;
1317   _cpp_buff *tail = *last_buff_p;
1318
1319   if (head == NULL)
1320     head = tail = _cpp_get_buff (pfile, len);
1321   else if (BUFF_ROOM (tail) < len)
1322     {
1323       /* Fill the tail, then continue in what _cpp_append_extend_buff returns.  */
1324       const size_t room = BUFF_ROOM (tail);
1325       memcpy (BUFF_FRONT (tail), base, room);
1326       BUFF_FRONT (tail) += room;
1327       base += room;
1328       len -= room;
1329       tail = _cpp_append_extend_buff (pfile, tail, len);
1330     }
1331
1332   memcpy (BUFF_FRONT (tail), base, len);
1333   BUFF_FRONT (tail) += len;
1334   *first_buff_p = head;
1335   *last_buff_p = tail;
1336 }
1337
1338 /* Lexes a raw string.  The stored string contains the spelling, including
1339    double quotes, delimiter string, '(' and ')', any leading
1340    'L', 'u', 'U' or 'u8' and 'R' modifier.  Nothing is returned: TOKEN's
1341    type is set to the type of the literal, to CPP_OTHER if it was not
1342    properly terminated, or to CPP_EOF if the input ended inside it.
1343    The spelling is NUL-terminated, but it is not guaranteed that this
1344    is the first NUL since embedded NULs are preserved.  */
1345 /* BASE is where the spelling starts; CUR is the position lex_string reached after the 'R'.  */
1346 static void
1347 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1348                 const uchar *cur)
1349 {
1350   const uchar *raw_prefix;
1351   unsigned int raw_prefix_len = 0;
1352   enum cpp_ttype type;
1353   size_t total_len = 0;
1354   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1355   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1356   /* Pick the literal type from the encoding prefix at BASE.  */
1357   type = (*base == 'L' ? CPP_WSTRING :
1358           *base == 'U' ? CPP_STRING32 :
1359           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1360           : CPP_STRING);
1361   /* The delimiter starts one past CUR (presumably past the opening quote); at most 16 characters.  */
1362   raw_prefix = cur + 1;
1363   while (raw_prefix_len < 16)
1364     {
1365       switch (raw_prefix[raw_prefix_len])
1366         {
1367         case ' ': case '(': case ')': case '\\': case '\t':
1368         case '\v': case '\f': case '\n': default:
1369           break;
1370         /* Basic source charset except the above chars.  */
1371         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1372         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1373         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1374         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1375         case 'y': case 'z':
1376         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1377         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1378         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1379         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1380         case 'Y': case 'Z':
1381         case '0': case '1': case '2': case '3': case '4': case '5':
1382         case '6': case '7': case '8': case '9':
1383         case '_': case '{': case '}': case '#': case '[': case ']':
1384         case '<': case '>': case '%': case ':': case ';': case '.':
1385         case '?': case '*': case '+': case '-': case '/': case '^':
1386         case '&': case '|': case '~': case '!': case '=': case ',':
1387         case '"': case '\'':
1388           raw_prefix_len++;
1389           continue;
1390         }
1391       break;
1392     }
1393   /* Only '(' may follow the delimiter.  Otherwise diagnose, and make BASE up to CUR a CPP_OTHER token.  */
1394   if (raw_prefix[raw_prefix_len] != '(')
1395     {
1396       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1397                 + 1;
1398       if (raw_prefix_len == 16)
1399         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1400                              "raw string delimiter longer than 16 characters");
1401       else
1402         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1403                              "invalid character '%c' in raw string delimiter",
1404                              (int) raw_prefix[raw_prefix_len]);
1405       pfile->buffer->cur = raw_prefix - 1;
1406       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1407       return;
1408     }
1409   /* Scan the body, which starts right after the '('.  */
1410   cur = raw_prefix + raw_prefix_len + 1;
1411   for (;;)
1412     {
1413 #define BUF_APPEND(STR,LEN)                                     \
1414       do {                                                      \
1415         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1416                         &first_buff, &last_buff);               \
1417         total_len += (LEN);                                     \
1418       } while (0);
1419       /* NOTE(review): the ';' after while (0) defeats the do/while (0) idiom; harmless for the uses below.  */
1420       cppchar_t c;
1421
1422       /* If we previously performed any trigraph or line splicing
1423          transformations, undo them within the body of the raw string.  */
1424       while (note->pos < cur)
1425         ++note;
1426       for (; note->pos == cur; ++note)
1427         {
1428           switch (note->type)
1429             {
1430             case '\\':
1431             case ' ':
1432               /* Restore backslash followed by newline.  */
1433               BUF_APPEND (base, cur - base);
1434               base = cur;
1435               BUF_APPEND ("\\", 1);
1436             after_backslash:
1437               if (note->type == ' ')
1438                 {
1439                   /* GNU backslash whitespace newline extension.  FIXME
1440                      could be any sequence of non-vertical space.  When we
1441                      can properly restore any such sequence, we should mark
1442                      this note as handled so _cpp_process_line_notes
1443                      doesn't warn.  */
1444                   BUF_APPEND (" ", 1);
1445                 }
1446
1447               BUF_APPEND ("\n", 1);
1448               break;
1449
1450             case 0:
1451               /* Already handled.  */
1452               break;
1453
1454             default:
1455               if (_cpp_trigraph_map[note->type])
1456                 {
1457                   /* Don't warn about this trigraph in
1458                      _cpp_process_line_notes, since trigraphs show up as
1459                      trigraphs in raw strings.  */
1460                   uchar type = note->type;
1461                   note->type = 0;
1462
1463                   if (!CPP_OPTION (pfile, trigraphs))
1464                     /* If we didn't convert the trigraph in the first
1465                        place, don't do anything now either.  */
1466                     break;
1467
1468                   BUF_APPEND (base, cur - base);
1469                   base = cur;
1470                   BUF_APPEND ("??", 2);
1471
1472                   /* ??/ followed by newline gets two line notes, one for
1473                      the trigraph and one for the backslash/newline.  */
1474                   if (type == '/' && note[1].pos == cur)
1475                     {
1476                       if (note[1].type != '\\'
1477                           && note[1].type != ' ')
1478                         abort ();
1479                       BUF_APPEND ("/", 1);
1480                       ++note;
1481                       goto after_backslash;
1482                     }
1483                   /* The ) from ??) could be part of the suffix.  */
1484                   else if (type == ')'
1485                            && strncmp ((const char *) cur+1,
1486                                        (const char *) raw_prefix,
1487                                        raw_prefix_len) == 0
1488                            && cur[raw_prefix_len+1] == '"')
1489                     {
1490                       BUF_APPEND (")", 1);
1491                       base++;
1492                       cur += raw_prefix_len + 2;
1493                       goto break_outer_loop;
1494                     }
1495                   else
1496                     {
1497                       /* Skip the replacement character.  */
1498                       base = ++cur;
1499                       BUF_APPEND (&type, 1);
1500                     }
1501                 }
1502               else
1503                 abort ();
1504               break;
1505             }
1506         }
1507       c = *cur++;
1508       /* A ')' followed by the delimiter and a '"' ends the raw string.  */
1509       if (c == ')'
1510           && strncmp ((const char *) cur, (const char *) raw_prefix,
1511                       raw_prefix_len) == 0
1512           && cur[raw_prefix_len] == '"')
1513         {
1514           cur += raw_prefix_len + 1;
1515           break;
1516         }
1517       else if (c == '\n')
1518         {
1519           if (pfile->state.in_directive
1520               || pfile->state.parsing_args
1521               || pfile->state.in_deferred_pragma)
1522             {
1523               cur--;
1524               type = CPP_OTHER;
1525               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1526                                    "unterminated raw string");
1527               break;
1528             }
1529           /* Save this line's text and move on to a fresh line.  */
1530           BUF_APPEND (base, cur - base);
1531
1532           if (pfile->buffer->cur < pfile->buffer->rlimit)
1533             CPP_INCREMENT_LINE (pfile, 0);
1534           pfile->buffer->need_line = true;
1535
1536           pfile->buffer->cur = cur-1;
1537           _cpp_process_line_notes (pfile, false);
1538           if (!_cpp_get_fresh_line (pfile))
1539             {
1540               source_location src_loc = token->src_loc;
1541               token->type = CPP_EOF;
1542               /* Tell the compiler the line number of the EOF token.  */
1543               token->src_loc = pfile->line_table->highest_line;
1544               token->flags = BOL;
1545               if (first_buff != NULL)
1546                 _cpp_release_buff (pfile, first_buff);
1547               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1548                                    "unterminated raw string");
1549               return;
1550             }
1551
1552           cur = base = pfile->buffer->cur;
1553           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1554         }
1555     }
1556  break_outer_loop:
1557   /* CUR is now just past the closing quote, or at the newline that cut the string short.  */
1558   if (CPP_OPTION (pfile, user_literals))
1559     {
1560       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1561          underscore is ill-formed.  Since this breaks programs using macros
1562          from inttypes.h, we generate a warning and treat the ud-suffix as a
1563          separate preprocessing token.  This approach is under discussion by
1564          the standards committee, and has been adopted as a conforming
1565          extension by other front ends such as clang. */
1566       if (ISALPHA (*cur))
1567         {
1568           /* Raise a warning, but do not consume subsequent tokens.  */
1569           if (CPP_OPTION (pfile, warn_literal_suffix))
1570             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1571                                    token->src_loc, 0,
1572                                    "invalid suffix on literal; C++11 requires "
1573                                    "a space between literal and identifier");
1574         }
1575       /* Grab user defined literal suffix.  */
1576       else if (*cur == '_')
1577         {
1578           type = cpp_userdef_string_add_type (type);
1579           ++cur;
1580
1581           while (ISIDNUM (*cur))
1582             ++cur;
1583         }
1584     }
1585   /* With nothing buffered the spelling is BASE..CUR; otherwise join the buffered pieces and the tail.  */
1586   pfile->buffer->cur = cur;
1587   if (first_buff == NULL)
1588     create_literal (pfile, token, base, cur - base, type);
1589   else
1590     {
1591       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1592
1593       token->type = type;
1594       token->val.str.len = total_len + (cur - base);
1595       token->val.str.text = dest;
1596       last_buff = first_buff;
1597       while (last_buff != NULL)
1598         {
1599           memcpy (dest, last_buff->base,
1600                   BUFF_FRONT (last_buff) - last_buff->base);
1601           dest += BUFF_FRONT (last_buff) - last_buff->base;
1602           last_buff = last_buff->next;
1603         }
1604       _cpp_release_buff (pfile, first_buff);
1605       memcpy (dest, base, cur - base);
1606       dest[cur - base] = '\0';
1607     }
1608 }
1609
1610 /* Lex a string literal, character constant or angle-bracketed header
1611    name whose spelling starts at BASE.  TOKEN receives the spelling,
1612    including the opening quote and any leading 'L', 'u', 'U' or 'u8';
1613    an 'R' modifier hands the work to lex_raw_string.  TOKEN's type is
1614    the literal's type, CPP_OTHER if it was not properly terminated, or
1615    CPP_LESS for an unterminated header name, to be relexed as tokens.
1616
1617    The spelling is NUL-terminated, but embedded NULs are preserved, so
1618    the terminator is not necessarily the first NUL.  */
1619 static void
1620 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1621 {
1622   const uchar *cur = base;
1623   cppchar_t terminator = *cur++;
1624   enum cpp_ttype type;
1625   bool nul_seen = false;
1626
1627   /* Step over any encoding prefix to reach the quote (or raw 'R').  */
1628   switch (terminator)
1629     {
1630     case 'u':
1631       terminator = *cur++;
1632       if (terminator != '8')
1633         break;
1634       /* Fall through - "u8" has one more prefix character.  */
1635     case 'L': case 'U':
1636       terminator = *cur++;
1637       break;
1638     }
1639
1640   if (terminator == 'R')
1641     {
1642       lex_raw_string (pfile, token, base, cur);
1643       return;
1644     }
1645
1646   if (terminator == '\'')
1647     type = (*base == 'u' ? CPP_CHAR16
1648             : *base == 'U' ? CPP_CHAR32
1649             : *base == 'L' ? CPP_WCHAR : CPP_CHAR);
1650   else if (terminator == '"')
1651     type = (*base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1652             : *base == 'U' ? CPP_STRING32
1653             : *base == 'L' ? CPP_WSTRING : CPP_STRING);
1654   else
1655     {
1656       terminator = '>';
1657       type = CPP_HEADER_NAME;
1658     }
1659
1660   for (;;)
1661     {
1662       const cppchar_t c = *cur++;
1663
1664       /* In #include-style directives, terminators are not escapable.  */
1665       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1666         {
1667           cur++;
1668           continue;
1669         }
1670       if (c == terminator)
1671         break;
1672       if (c == '\0')
1673         nul_seen = true;
1674       if (c != '\n')
1675         continue;
1676
1677       /* End of line reached.  Unmatched quotes always yield undefined
1678          behavior, but an apparently unterminated header name may in
1679          fact be a legitimate sequence of tokens: report CPP_LESS.  */
1680       cur--;
1681       if (terminator == '>')
1682         {
1683           token->type = CPP_LESS;
1684           return;
1685         }
1686       type = CPP_OTHER;
1687       break;
1688     }
1689
1690   if (nul_seen && !pfile->state.skipping)
1691     cpp_error (pfile, CPP_DL_WARNING,
1692                "null character(s) preserved in literal");
1693
1694   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1695     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1696                (int) terminator);
1697
1698   if (CPP_OPTION (pfile, user_literals))
1699     {
1700       if (*cur == '_')
1701         {
1702           /* Grab user defined literal suffix.  */
1703           type = cpp_userdef_char_add_type (type);
1704           type = cpp_userdef_string_add_type (type);
1705           for (++cur; ISIDNUM (*cur); ++cur)
1706             continue;
1707         }
1708       else if (ISALPHA (*cur) && CPP_OPTION (pfile, warn_literal_suffix))
1709         /* C++11 [lex.ext]p10 makes a ud-suffix without a leading
1710            underscore ill-formed.  As that breaks programs using the
1711            inttypes.h macros, warn and leave it as a separate token.  */
1712         cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1713                                token->src_loc, 0,
1714                                "invalid suffix on literal; C++11 requires "
1715                                "a space between literal and identifier");
1716     }
1717
1718   pfile->buffer->cur = cur;
1719   create_literal (pfile, token, base, cur - base, type);
1720 }
1721
1722 /* Return PFILE's comment table; assume nothing about its ordering.  */
1723 cpp_comment_table *
1724 cpp_get_comments (cpp_reader *pfile)
1725 {
1726   cpp_comment_table *const table = &pfile->comments;
1727   return table;
1728 }
1729
1730 /* Append a copy of comment TOKEN's spelling and its source location
1731    to the end of PFILE's comment table.  The table starts with room
1732    for 256 entries and doubles whenever it is full.  */
1733 static void
1734 store_comment (cpp_reader *pfile, cpp_token *token)
1735 {
1736   cpp_comment_table *table = &pfile->comments;
1737   cpp_comment *entry;
1738   /* Hold the length in size_t, the type xmalloc and memcpy take,
1739      instead of narrowing it to int first.  */
1740   size_t len = token->val.str.len;
1741
1742   if (table->allocated == 0)
1743     {
1744       table->allocated = 256;
1745       table->entries = (cpp_comment *) xmalloc
1746         (table->allocated * sizeof (cpp_comment));
1747     }
1748
1749   if (table->count == table->allocated)
1750     {
1751       table->allocated *= 2;
1752       table->entries = (cpp_comment *) xrealloc
1753         (table->entries, table->allocated * sizeof (cpp_comment));
1754     }
1755
1756   /* Copy the comment.  The token's text may not be NUL-terminated, so
1757      terminate the copy explicitly.  */
1758   entry = &table->entries[table->count];
1759   entry->comment = (char *) xmalloc (len + 1);
1760   memcpy (entry->comment, token->val.str.text, len);
1761   entry->comment[len] = '\0';
1762   entry->sloc = token->src_loc;
1763
1764   table->count++;
1765 }
1766
1767 /* The stored comment includes the comment start and any terminator.  */
1768 static void
1769 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1770               cppchar_t type)
1771 {
1772   unsigned char *buffer;
1773   unsigned int len, clen, i;
1774
1775   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1776
1777   /* C++ comments probably (not definitely) have moved past a new
1778      line, which we don't want to save in the comment.  */
1779   if (is_vspace (pfile->buffer->cur[-1]))
1780     len--;
1781
1782   /* If we are currently in a directive or in argument parsing, then
1783      we need to store all C++ comments as C comments internally, and
1784      so we need to allocate a little extra space in that case.
1785
1786      Note that the only time we encounter a directive here is
1787      when we are saving comments in a "#define".  */
1788   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1789           && type == '/') ? len + 2 : len;
1790
1791   buffer = _cpp_unaligned_alloc (pfile, clen);
1792
1793   token->type = CPP_COMMENT;
1794   token->val.str.len = clen;
1795   token->val.str.text = buffer;
1796
1797   buffer[0] = '/';
1798   memcpy (buffer + 1, from, len - 1);
1799
1800   /* Finish conversion to a C comment, if necessary.  */
1801   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1802     {
1803       buffer[1] = '*';
1804       buffer[clen - 2] = '*';
1805       buffer[clen - 1] = '/';
1806       /* As there can be in a C++ comments illegal sequences for C comments
1807          we need to filter them out.  */
1808       for (i = 2; i < (clen - 2); i++)
1809         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1810           buffer[i] = '|';
1811     }
1812
1813   /* Finally store this comment for use by clients of libcpp. */
1814   store_comment (pfile, token);
1815 }
1816
1817 /* Allocate COUNT tokens for RUN.  */
1818 void
1819 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1820 {
1821   run->base = XNEWVEC (cpp_token, count);
1822   run->limit = run->base + count;
1823   run->next = NULL;
1824 }
1825
1826 /* Returns the next tokenrun, or creates one if there is none.  */
1827 static tokenrun *
1828 next_tokenrun (tokenrun *run)
1829 {
1830   if (run->next == NULL)
1831     {
1832       run->next = XNEW (tokenrun);
1833       run->next->prev = run;
1834       _cpp_init_tokenrun (run->next, 250);
1835     }
1836
1837   return run->next;
1838 }
1839
1840 /* Return the number of not yet processed token in a given
1841    context.  */
1842 int
1843 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1844 {
1845   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1846     return (LAST (context).token - FIRST (context).token);
1847   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1848            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1849     return (LAST (context).ptoken - FIRST (context).ptoken);
1850   else
1851       abort ();
1852 }
1853
1854 /* Returns the token present at index INDEX in a given context.  If
1855    INDEX is zero, the next token to be processed is returned.  */
1856 static const cpp_token*
1857 _cpp_token_from_context_at (cpp_context *context, int index)
1858 {
1859   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1860     return &(FIRST (context).token[index]);
1861   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1862            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1863     return FIRST (context).ptoken[index];
1864  else
1865    abort ();
1866 }
1867
1868 /* Look ahead in the input stream.  */
1869 const cpp_token *
1870 cpp_peek_token (cpp_reader *pfile, int index)
1871 {
1872   cpp_context *context = pfile->context;
1873   const cpp_token *peektok;
1874   int count;
1875
1876   /* First, scan through any pending cpp_context objects.  */
1877   while (context->prev)
1878     {
1879       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1880
1881       if (index < (int) sz)
1882         return _cpp_token_from_context_at (context, index);
1883       index -= (int) sz;
1884       context = context->prev;
1885     }
1886
1887   /* We will have to read some new tokens after all (and do so
1888      without invalidating preceding tokens).  */
1889   count = index;
1890   pfile->keep_tokens++;
1891
1892   do
1893     {
1894       peektok = _cpp_lex_token (pfile);
1895       if (peektok->type == CPP_EOF)
1896         return peektok;
1897     }
1898   while (index--);
1899
1900   _cpp_backup_tokens_direct (pfile, count + 1);
1901   pfile->keep_tokens--;
1902
1903   return peektok;
1904 }
1905
1906 /* Allocate a single token that is invalidated at the same time as the
1907    rest of the tokens on the line.  Has its line and col set to the
1908    same as the last lexed token, so that diagnostics appear in the
1909    right place.  */
1910 cpp_token *
1911 _cpp_temp_token (cpp_reader *pfile)
1912 {
1913   cpp_token *old, *result;
1914   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1915   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1916
1917   old = pfile->cur_token - 1;
1918   /* Any pre-existing lookaheads must not be clobbered.  */
1919   if (la)
1920     {
1921       if (sz <= la)
1922         {
1923           tokenrun *next = next_tokenrun (pfile->cur_run);
1924
1925           if (sz < la)
1926             memmove (next->base + 1, next->base,
1927                      (la - sz) * sizeof (cpp_token));
1928
1929           next->base[0] = pfile->cur_run->limit[-1];
1930         }
1931
1932       if (sz > 1)
1933         memmove (pfile->cur_token + 1, pfile->cur_token,
1934                  MIN (la, sz - 1) * sizeof (cpp_token));
1935     }
1936
1937   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1938     {
1939       pfile->cur_run = next_tokenrun (pfile->cur_run);
1940       pfile->cur_token = pfile->cur_run->base;
1941     }
1942
1943   result = pfile->cur_token++;
1944   result->src_loc = old->src_loc;
1945   return result;
1946 }
1947
1948 /* Lex a token into RESULT (external interface).  Takes care of issues
1949    like directive handling, token lookahead, multiple include
1950    optimization and skipping.  */
1951 const cpp_token *
1952 _cpp_lex_token (cpp_reader *pfile)
1953 {
1954   cpp_token *result;
1955
1956   for (;;)
1957     {
1958       if (pfile->cur_token == pfile->cur_run->limit)
1959         {
1960           pfile->cur_run = next_tokenrun (pfile->cur_run);
1961           pfile->cur_token = pfile->cur_run->base;
1962         }
1963       /* We assume that the current token is somewhere in the current
1964          run.  */
1965       if (pfile->cur_token < pfile->cur_run->base
1966           || pfile->cur_token >= pfile->cur_run->limit)
1967         abort ();
1968
1969       if (pfile->lookaheads)
1970         {
1971           pfile->lookaheads--;
1972           result = pfile->cur_token++;
1973         }
1974       else
1975         result = _cpp_lex_direct (pfile);
1976
1977       if (result->flags & BOL)
1978         {
1979           /* Is this a directive.  If _cpp_handle_directive returns
1980              false, it is an assembler #.  */
1981           if (result->type == CPP_HASH
1982               /* 6.10.3 p 11: Directives in a list of macro arguments
1983                  gives undefined behavior.  This implementation
1984                  handles the directive as normal.  */
1985               && pfile->state.parsing_args != 1)
1986             {
1987               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1988                 {
1989                   if (pfile->directive_result.type == CPP_PADDING)
1990                     continue;
1991                   result = &pfile->directive_result;
1992                 }
1993             }
1994           else if (pfile->state.in_deferred_pragma)
1995             result = &pfile->directive_result;
1996
1997           if (pfile->cb.line_change && !pfile->state.skipping)
1998             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1999         }
2000
2001       /* We don't skip tokens in directives.  */
2002       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2003         break;
2004
2005       /* Outside a directive, invalidate controlling macros.  At file
2006          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2007          get here and MI optimization works.  */
2008       pfile->mi_valid = false;
2009
2010       if (!pfile->state.skipping || result->type == CPP_EOF)
2011         break;
2012     }
2013
2014   return result;
2015 }
2016
2017 /* Returns true if a fresh line has been loaded.  */
2018 bool
2019 _cpp_get_fresh_line (cpp_reader *pfile)
2020 {
2021   int return_at_eof;
2022
2023   /* We can't get a new line until we leave the current directive.  */
2024   if (pfile->state.in_directive)
2025     return false;
2026
2027   for (;;)
2028     {
2029       cpp_buffer *buffer = pfile->buffer;
2030
2031       if (!buffer->need_line)
2032         return true;
2033
2034       if (buffer->next_line < buffer->rlimit)
2035         {
2036           _cpp_clean_line (pfile);
2037           return true;
2038         }
2039
2040       /* First, get out of parsing arguments state.  */
2041       if (pfile->state.parsing_args)
2042         return false;
2043
2044       /* End of buffer.  Non-empty files should end in a newline.  */
2045       if (buffer->buf != buffer->rlimit
2046           && buffer->next_line > buffer->rlimit
2047           && !buffer->from_stage3)
2048         {
2049           /* Clip to buffer size.  */
2050           buffer->next_line = buffer->rlimit;
2051         }
2052
2053       return_at_eof = buffer->return_at_eof;
2054       _cpp_pop_buffer (pfile);
2055       if (pfile->buffer == NULL || return_at_eof)
2056         return false;
2057     }
2058 }
2059
/* Helper for two-character operators.  Expects the variables `buffer'
   and `result' to be in scope at the point of use.  If the next
   unread character is CHAR, consume it and give the token type
   THEN_TYPE; otherwise leave the input alone and use ELSE_TYPE.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2068
2069 /* Lex a token into pfile->cur_token, which is also incremented, to
2070    get diagnostics pointing to the correct location.
2071
2072    Does not handle issues such as token lookahead, multiple-include
2073    optimization, directives, skipping etc.  This function is only
2074    suitable for use by _cpp_lex_token, and in special cases like
2075    lex_expansion_token which doesn't care for any of these issues.
2076
2077    When meeting a newline, returns CPP_EOF if parsing a directive,
2078    otherwise returns to the start of the token buffer if permissible.
2079    Returns the location of the lexed token.  */
2080 cpp_token *
2081 _cpp_lex_direct (cpp_reader *pfile)
2082 {
2083   cppchar_t c;
2084   cpp_buffer *buffer;
2085   const unsigned char *comment_start;
2086   cpp_token *result = pfile->cur_token++;
       /* RESULT is the slot pfile->cur_token pointed at on entry; the
          post-increment above has already moved cur_token past it.  */
2087
       /* Reached on entry and again after every newline: reset the flags
          and, if the buffer has used up its line, obtain another.  */
2088  fresh_line:
2089   result->flags = 0;
2090   buffer = pfile->buffer;
2091   if (buffer->need_line)
2092     {
2093       if (pfile->state.in_deferred_pragma)
2094         {
               /* A line is needed while a deferred pragma is being lexed:
                  report CPP_PRAGMA_EOL and leave deferred-pragma state.  */
2095           result->type = CPP_PRAGMA_EOL;
2096           pfile->state.in_deferred_pragma = false;
2097           if (!pfile->state.pragma_allow_expansion)
2098             pfile->state.prevent_expansion--;
2099           return result;
2100         }
2101       if (!_cpp_get_fresh_line (pfile))
2102         {
               /* No further line can be supplied: the token is CPP_EOF.  */
2103           result->type = CPP_EOF;
2104           if (!pfile->state.in_directive)
2105             {
2106               /* Tell the compiler the line number of the EOF token.  */
2107               result->src_loc = pfile->line_table->highest_line;
2108               result->flags = BOL;
2109             }
2110           return result;
2111         }
2112       if (!pfile->keep_tokens)
2113         {
               /* keep_tokens is zero, so earlier tokens need not be
                  preserved: go back to the first slot of the base run.  */
2114           pfile->cur_run = &pfile->base_run;
2115           result = pfile->base_run.base;
2116           pfile->cur_token = result + 1;
2117         }
           /* The token about to be lexed is the first one on its line.  */
2118       result->flags = BOL;
           /* NOTE(review): parsing_args == 2 presumably means macro
              arguments are being collected, so the line break counts as
              preceding whitespace -- confirm against the macro code.  */
2119       if (pfile->state.parsing_args == 2)
2120         result->flags |= PREV_WHITE;
2121     }
       /* Reload: _cpp_get_fresh_line may have popped the old buffer.  */
2122   buffer = pfile->buffer;
2123  update_tokens_line:
2124   result->src_loc = pfile->line_table->highest_line;
2125
2126  skipped_white:
       /* Process any line notes that lie at or before the current
          position, unless an overlaid buffer is in use.  */
2127   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2128       && !pfile->overlaid_buffer)
2129     {
2130       _cpp_process_line_notes (pfile, false);
2131       result->src_loc = pfile->line_table->highest_line;
2132     }
2133   c = *buffer->cur++;
2134
       /* A forced location, when set, replaces the column-based location
          computed from the current buffer position.  */
2135   if (pfile->forced_token_location_p)
2136     result->src_loc = *pfile->forced_token_location_p;
2137   else
2138     result->src_loc = linemap_position_for_column (pfile->line_table,
2139                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2140
2141   switch (c)
2142     {
2143     case ' ': case '\t': case '\f': case '\v': case '\0':
2144       result->flags |= PREV_WHITE;
2145       skip_whitespace (pfile, c);
2146       goto skipped_white;
2147
2148     case '\n':
           /* Count the line unless this newline is the last character of
              the buffer, then start over to load the next line.  */
2149       if (buffer->cur < buffer->rlimit)
2150         CPP_INCREMENT_LINE (pfile, 0);
2151       buffer->need_line = true;
2152       goto fresh_line;
2153
2154     case '0': case '1': case '2': case '3': case '4':
2155     case '5': case '6': case '7': case '8': case '9':
2156       {
2157         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2158         result->type = CPP_NUMBER;
2159         lex_number (pfile, &result->val.str, &nst);
2160         warn_about_normalization (pfile, result, &nst);
2161         break;
2162       }
2163
2164     case 'L':
2165     case 'u':
2166     case 'U':
2167     case 'R':
2168       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2169          wide strings or raw strings.  */
2170       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2171           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2172         {
2173           if ((*buffer->cur == '\'' && c != 'R')
2174               || *buffer->cur == '"'
2175               || (*buffer->cur == 'R'
2176                   && c != 'R'
2177                   && buffer->cur[1] == '"'
2178                   && CPP_OPTION (pfile, rliterals))
2179               || (*buffer->cur == '8'
2180                   && c == 'u'
2181                   && (buffer->cur[1] == '"'
2182                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2183                           && CPP_OPTION (pfile, rliterals)))))
2184             {
2185               lex_string (pfile, result, buffer->cur - 1);
2186               break;
2187             }
2188         }
2189       /* Fall through.  */
2190
         /* 'L', 'R', 'U' and 'u' are missing from the lists below because
            they have their own labels above, which fall through to here
            when no character or string literal follows.  */
2191     case '_':
2192     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2193     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2194     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2195     case 's': case 't':           case 'v': case 'w': case 'x':
2196     case 'y': case 'z':
2197     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2198     case 'G': case 'H': case 'I': case 'J': case 'K':
2199     case 'M': case 'N': case 'O': case 'P': case 'Q':
2200     case 'S': case 'T':           case 'V': case 'W': case 'X':
2201     case 'Y': case 'Z':
2202       result->type = CPP_NAME;
2203       {
2204         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2205         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2206                                                 &nst);
2207         warn_about_normalization (pfile, result, &nst);
2208       }
2209
2210       /* Convert named operators to their proper types.  */
2211       if (result->val.node.node->flags & NODE_OPERATOR)
2212         {
2213           result->flags |= NAMED_OP;
2214           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2215         }
2216       break;
2217
2218     case '\'':
2219     case '"':
2220       lex_string (pfile, result, buffer->cur - 1);
2221       break;
2222
2223     case '/':
2224       /* A potential block or line comment.  */
2225       comment_start = buffer->cur;
2226       c = *buffer->cur;
2227
2228       if (c == '*')
2229         {
2230           if (_cpp_skip_block_comment (pfile))
2231             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2232         }
2233       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2234                             || cpp_in_system_header (pfile)))
2235         {
2236           /* Warn about comments only if pedantically GNUC89, and not
2237              in system headers.  */
2238           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2239               && ! buffer->warned_cplusplus_comments)
2240             {
2241               cpp_error (pfile, CPP_DL_PEDWARN,
2242                          "C++ style comments are not allowed in ISO C90");
2243               cpp_error (pfile, CPP_DL_PEDWARN,
2244                          "(this will be reported only once per input file)");
2245               buffer->warned_cplusplus_comments = 1;
2246             }
2247
2248           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2249             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2250         }
2251       else if (c == '=')
2252         {
2253           buffer->cur++;
2254           result->type = CPP_DIV_EQ;
2255           break;
2256         }
2257       else
2258         {
2259           result->type = CPP_DIV;
2260           break;
2261         }
2262
           /* Only the two comment branches reach this point.  When
              comments are not being saved, the comment counts as
              whitespace and the next token is lexed into the same slot.  */
2263       if (!pfile->state.save_comments)
2264         {
2265           result->flags |= PREV_WHITE;
2266           goto update_tokens_line;
2267         }
2268
2269       /* Save the comment as a token in its own right.  */
2270       save_comment (pfile, result, comment_start, c);
2271       break;
2272
2273     case '<':
2274       if (pfile->state.angled_headers)
2275         {
               /* NOTE(review): lex_string appears to leave RESULT typed
                  CPP_LESS when no header-name could be formed, in which
                  case '<' is lexed as an operator below -- confirm against
                  lex_string.  */
2276           lex_string (pfile, result, buffer->cur - 1);
2277           if (result->type != CPP_LESS)
2278             break;
2279         }
2280
2281       result->type = CPP_LESS;
2282       if (*buffer->cur == '=')
2283         buffer->cur++, result->type = CPP_LESS_EQ;
2284       else if (*buffer->cur == '<')
2285         {
2286           buffer->cur++;
2287           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2288         }
2289       else if (CPP_OPTION (pfile, digraphs))
2290         {
2291           if (*buffer->cur == ':')
2292             {
2293               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2294                  three characters are <:: and the subsequent character
2295                  is neither : nor >, the < is treated as a preprocessor
2296                  token by itself".  */
2297               if (CPP_OPTION (pfile, cplusplus)
2298                   && (CPP_OPTION (pfile, lang) == CLK_CXX11
2299                       || CPP_OPTION (pfile, lang) == CLK_GNUCXX11)
2300                   && buffer->cur[1] == ':'
2301                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2302                 break;
2303
2304               buffer->cur++;
2305               result->flags |= DIGRAPH;
2306               result->type = CPP_OPEN_SQUARE;
2307             }
2308           else if (*buffer->cur == '%')
2309             {
2310               buffer->cur++;
2311               result->flags |= DIGRAPH;
2312               result->type = CPP_OPEN_BRACE;
2313             }
2314         }
2315       break;
2316
2317     case '>':
2318       result->type = CPP_GREATER;
2319       if (*buffer->cur == '=')
2320         buffer->cur++, result->type = CPP_GREATER_EQ;
2321       else if (*buffer->cur == '>')
2322         {
2323           buffer->cur++;
2324           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2325         }
2326       break;
2327
2328     case '%':
2329       result->type = CPP_MOD;
2330       if (*buffer->cur == '=')
2331         buffer->cur++, result->type = CPP_MOD_EQ;
2332       else if (CPP_OPTION (pfile, digraphs))
2333         {
2334           if (*buffer->cur == ':')
2335             {
                   /* "%:" is the digraph spelling of '#', and "%:%:" that
                      of "##".  */
2336               buffer->cur++;
2337               result->flags |= DIGRAPH;
2338               result->type = CPP_HASH;
2339               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2340                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2341             }
2342           else if (*buffer->cur == '>')
2343             {
2344               buffer->cur++;
2345               result->flags |= DIGRAPH;
2346               result->type = CPP_CLOSE_BRACE;
2347             }
2348         }
2349       break;
2350
2351     case '.':
2352       result->type = CPP_DOT;
2353       if (ISDIGIT (*buffer->cur))
2354         {
2355           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2356           result->type = CPP_NUMBER;
2357           lex_number (pfile, &result->val.str, &nst);
2358           warn_about_normalization (pfile, result, &nst);
2359         }
2360       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2361         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2362       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2363         buffer->cur++, result->type = CPP_DOT_STAR;
2364       break;
2365
2366     case '+':
2367       result->type = CPP_PLUS;
2368       if (*buffer->cur == '+')
2369         buffer->cur++, result->type = CPP_PLUS_PLUS;
2370       else if (*buffer->cur == '=')
2371         buffer->cur++, result->type = CPP_PLUS_EQ;
2372       break;
2373
2374     case '-':
2375       result->type = CPP_MINUS;
2376       if (*buffer->cur == '>')
2377         {
2378           buffer->cur++;
2379           result->type = CPP_DEREF;
2380           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2381             buffer->cur++, result->type = CPP_DEREF_STAR;
2382         }
2383       else if (*buffer->cur == '-')
2384         buffer->cur++, result->type = CPP_MINUS_MINUS;
2385       else if (*buffer->cur == '=')
2386         buffer->cur++, result->type = CPP_MINUS_EQ;
2387       break;
2388
2389     case '&':
2390       result->type = CPP_AND;
2391       if (*buffer->cur == '&')
2392         buffer->cur++, result->type = CPP_AND_AND;
2393       else if (*buffer->cur == '=')
2394         buffer->cur++, result->type = CPP_AND_EQ;
2395       break;
2396
2397     case '|':
2398       result->type = CPP_OR;
2399       if (*buffer->cur == '|')
2400         buffer->cur++, result->type = CPP_OR_OR;
2401       else if (*buffer->cur == '=')
2402         buffer->cur++, result->type = CPP_OR_EQ;
2403       break;
2404
2405     case ':':
2406       result->type = CPP_COLON;
2407       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2408         buffer->cur++, result->type = CPP_SCOPE;
2409       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2410         {
2411           buffer->cur++;
2412           result->flags |= DIGRAPH;
2413           result->type = CPP_CLOSE_SQUARE;
2414         }
2415       break;
2416
2417     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2418     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2419     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2420     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2421     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2422
2423     case '?': result->type = CPP_QUERY; break;
2424     case '~': result->type = CPP_COMPL; break;
2425     case ',': result->type = CPP_COMMA; break;
2426     case '(': result->type = CPP_OPEN_PAREN; break;
2427     case ')': result->type = CPP_CLOSE_PAREN; break;
2428     case '[': result->type = CPP_OPEN_SQUARE; break;
2429     case ']': result->type = CPP_CLOSE_SQUARE; break;
2430     case '{': result->type = CPP_OPEN_BRACE; break;
2431     case '}': result->type = CPP_CLOSE_BRACE; break;
2432     case ';': result->type = CPP_SEMICOLON; break;
2433
2434       /* @ is a punctuator in Objective-C.  */
2435     case '@': result->type = CPP_ATSIGN; break;
2436
2437     case '$':
2438     case '\\':
2439       {
             /* Step back onto the '$' or '\\' so that forms_identifier_p
                examines that character itself.  */
2440         const uchar *base = --buffer->cur;
2441         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2442
2443         if (forms_identifier_p (pfile, true, &nst))
2444           {
2445             result->type = CPP_NAME;
2446             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2447             warn_about_normalization (pfile, result, &nst);
2448             break;
2449           }
             /* Not the start of an identifier: consume the character
                again and emit it as CPP_OTHER below.  */
2450         buffer->cur++;
2451       }
           /* Fall through.  */
2452
2453     default:
2454       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2455       break;
2456     }
2457
2458   return result;
2459 }
2460
2461 /* An upper bound on the number of bytes needed to spell TOKEN.
2462    Does not include preceding whitespace.  */
2463 unsigned int
2464 cpp_token_len (const cpp_token *token)
2465 {
2466   unsigned int len;
2467
2468   switch (TOKEN_SPELL (token))
2469     {
2470     default:            len = 6;                                break;
2471     case SPELL_LITERAL: len = token->val.str.len;               break;
2472     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2473     }
2474
2475   return len;
2476 }
2477
/* Decode the UTF-8 sequence starting at NAME and write it to BUFFER as
   a universal character name of the form \UXXXXXXXX, using lower-case
   hexadecimal digits.  Exactly 10 bytes are written to BUFFER and no
   terminator is added.  Returns the number of bytes of NAME that made
   up the sequence.  Aborts if a continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned char *out = buffer;
  unsigned lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int remaining;
  int shift;

  /* The number of leading one bits in the first byte is the length of
     the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload bits of the first byte sit below its length marker.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte contributes six more bits.  */
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8: a continuation byte must look like 10xxxxxx.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  *out++ = '\\';
  *out++ = 'U';

  /* Eight hexadecimal digits, most significant nibble first.  */
  for (shift = 28; shift >= 0; shift -= 4)
    *out++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2511
2512 /* Given a token TYPE corresponding to a digraph, return a pointer to
2513    the spelling of the digraph.  */
2514 static const unsigned char *
2515 cpp_digraph2name (enum cpp_ttype type)
2516 {
2517   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2518 }
2519
2520 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2521    already contain the enough space to hold the token's spelling.
2522    Returns a pointer to the character after the last character written.
2523    FORSTRING is true if this is to be the spelling after translation
2524    phase 1 (this is different for UCNs).
2525    FIXME: Would be nice if we didn't need the PFILE argument.  */
2526 unsigned char *
2527 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2528                  unsigned char *buffer, bool forstring)
2529 {
2530   switch (TOKEN_SPELL (token))
2531     {
2532     case SPELL_OPERATOR:
2533       {
2534         const unsigned char *spelling;
2535         unsigned char c;
2536
2537         if (token->flags & DIGRAPH)
2538           spelling = cpp_digraph2name (token->type);
2539         else if (token->flags & NAMED_OP)
2540           goto spell_ident;
2541         else
2542           spelling = TOKEN_NAME (token);
2543
2544         while ((c = *spelling++) != '\0')
2545           *buffer++ = c;
2546       }
2547       break;
2548
2549     spell_ident:
2550     case SPELL_IDENT:
2551       if (forstring)
2552         {
2553           memcpy (buffer, NODE_NAME (token->val.node.node),
2554                   NODE_LEN (token->val.node.node));
2555           buffer += NODE_LEN (token->val.node.node);
2556         }
2557       else
2558         {
2559           size_t i;
2560           const unsigned char * name = NODE_NAME (token->val.node.node);
2561
2562           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2563             if (name[i] & ~0x7F)
2564               {
2565                 i += utf8_to_ucn (buffer, name + i) - 1;
2566                 buffer += 10;
2567               }
2568             else
2569               *buffer++ = NODE_NAME (token->val.node.node)[i];
2570         }
2571       break;
2572
2573     case SPELL_LITERAL:
2574       memcpy (buffer, token->val.str.text, token->val.str.len);
2575       buffer += token->val.str.len;
2576       break;
2577
2578     case SPELL_NONE:
2579       cpp_error (pfile, CPP_DL_ICE,
2580                  "unspellable token %s", TOKEN_NAME (token));
2581       break;
2582     }
2583
2584   return buffer;
2585 }
2586
2587 /* Returns TOKEN spelt as a null-terminated string.  The string is
2588    freed when the reader is destroyed.  Useful for diagnostics.  */
2589 unsigned char *
2590 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2591 {
2592   unsigned int len = cpp_token_len (token) + 1;
2593   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2594
2595   end = cpp_spell_token (pfile, token, start, false);
2596   end[0] = '\0';
2597
2598   return start;
2599 }
2600
2601 /* Returns a pointer to a string which spells the token defined by
2602    TYPE and FLAGS.  Used by C front ends, which really should move to
2603    using cpp_token_as_text.  */
2604 const char *
2605 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2606 {
2607   if (flags & DIGRAPH)
2608     return (const char *) cpp_digraph2name (type);
2609   else if (flags & NAMED_OP)
2610     return cpp_named_operator2name (type);
2611
2612   return (const char *) token_spellings[type].name;
2613 }
2614
2615 /* Writes the spelling of token to FP, without any preceding space.
2616    Separated from cpp_spell_token for efficiency - to avoid stdio
2617    double-buffering.  */
2618 void
2619 cpp_output_token (const cpp_token *token, FILE *fp)
2620 {
2621   switch (TOKEN_SPELL (token))
2622     {
2623     case SPELL_OPERATOR:
2624       {
2625         const unsigned char *spelling;
2626         int c;
2627
2628         if (token->flags & DIGRAPH)
2629           spelling = cpp_digraph2name (token->type);
2630         else if (token->flags & NAMED_OP)
2631           goto spell_ident;
2632         else
2633           spelling = TOKEN_NAME (token);
2634
2635         c = *spelling;
2636         do
2637           putc (c, fp);
2638         while ((c = *++spelling) != '\0');
2639       }
2640       break;
2641
2642     spell_ident:
2643     case SPELL_IDENT:
2644       {
2645         size_t i;
2646         const unsigned char * name = NODE_NAME (token->val.node.node);
2647
2648         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2649           if (name[i] & ~0x7F)
2650             {
2651               unsigned char buffer[10];
2652               i += utf8_to_ucn (buffer, name + i) - 1;
2653               fwrite (buffer, 1, 10, fp);
2654             }
2655           else
2656             fputc (NODE_NAME (token->val.node.node)[i], fp);
2657       }
2658       break;
2659
2660     case SPELL_LITERAL:
2661       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2662       break;
2663
2664     case SPELL_NONE:
2665       /* An error, most probably.  */
2666       break;
2667     }
2668 }
2669
2670 /* Compare two tokens.  */
2671 int
2672 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2673 {
2674   if (a->type == b->type && a->flags == b->flags)
2675     switch (TOKEN_SPELL (a))
2676       {
2677       default:                  /* Keep compiler happy.  */
2678       case SPELL_OPERATOR:
2679         /* token_no is used to track where multiple consecutive ##
2680            tokens were originally located.  */
2681         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2682       case SPELL_NONE:
2683         return (a->type != CPP_MACRO_ARG
2684                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2685       case SPELL_IDENT:
2686         return a->val.node.node == b->val.node.node;
2687       case SPELL_LITERAL:
2688         return (a->val.str.len == b->val.str.len
2689                 && !memcmp (a->val.str.text, b->val.str.text,
2690                             a->val.str.len));
2691       }
2692
2693   return 0;
2694 }
2695
2696 /* Returns nonzero if a space should be inserted to avoid an
2697    accidental token paste for output.  For simplicity, it is
2698    conservative, and occasionally advises a space where one is not
2699    needed, e.g. "." and ".2".  */
2700 int
2701 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2702                  const cpp_token *token2)
2703 {
2704   enum cpp_ttype a = token1->type, b = token2->type;
2705   cppchar_t c;
2706
2707   if (token1->flags & NAMED_OP)
2708     a = CPP_NAME;
2709   if (token2->flags & NAMED_OP)
2710     b = CPP_NAME;
2711
2712   c = EOF;
2713   if (token2->flags & DIGRAPH)
2714     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2715   else if (token_spellings[b].category == SPELL_OPERATOR)
2716     c = token_spellings[b].name[0];
2717
2718   /* Quickly get everything that can paste with an '='.  */
2719   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2720     return 1;
2721
2722   switch (a)
2723     {
2724     case CPP_GREATER:   return c == '>';
2725     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2726     case CPP_PLUS:      return c == '+';
2727     case CPP_MINUS:     return c == '-' || c == '>';
2728     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2729     case CPP_MOD:       return c == ':' || c == '>';
2730     case CPP_AND:       return c == '&';
2731     case CPP_OR:        return c == '|';
2732     case CPP_COLON:     return c == ':' || c == '>';
2733     case CPP_DEREF:     return c == '*';
2734     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2735     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2736     case CPP_NAME:      return ((b == CPP_NUMBER
2737                                  && name_p (pfile, &token2->val.str))
2738                                 || b == CPP_NAME
2739                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2740     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2741                                 || c == '.' || c == '+' || c == '-');
2742                                       /* UCNs */
2743     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2744                                  && b == CPP_NAME)
2745                                 || (CPP_OPTION (pfile, objc)
2746                                     && token1->val.str.text[0] == '@'
2747                                     && (b == CPP_NAME || b == CPP_STRING)));
2748     default:            break;
2749     }
2750
2751   return 0;
2752 }
2753
2754 /* Output all the remaining tokens on the current line, and a newline
2755    character, to FP.  Leading whitespace is removed.  If there are
2756    macros, special token padding is not performed.  */
2757 void
2758 cpp_output_line (cpp_reader *pfile, FILE *fp)
2759 {
2760   const cpp_token *token;
2761
2762   token = cpp_get_token (pfile);
2763   while (token->type != CPP_EOF)
2764     {
2765       cpp_output_token (token, fp);
2766       token = cpp_get_token (pfile);
2767       if (token->flags & PREV_WHITE)
2768         putc (' ', fp);
2769     }
2770
2771   putc ('\n', fp);
2772 }
2773
2774 /* Return a string representation of all the remaining tokens on the
2775    current line.  The result is allocated using xmalloc and must be
2776    freed by the caller.  */
2777 unsigned char *
2778 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2779 {
2780   const cpp_token *token;
2781   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2782   unsigned int alloced = 120 + out;
2783   unsigned char *result = (unsigned char *) xmalloc (alloced);
2784
2785   /* If DIR_NAME is empty, there are no initial contents.  */
2786   if (dir_name)
2787     {
2788       sprintf ((char *) result, "#%s ", dir_name);
2789       out += 2;
2790     }
2791
2792   token = cpp_get_token (pfile);
2793   while (token->type != CPP_EOF)
2794     {
2795       unsigned char *last;
2796       /* Include room for a possible space and the terminating nul.  */
2797       unsigned int len = cpp_token_len (token) + 2;
2798
2799       if (out + len > alloced)
2800         {
2801           alloced *= 2;
2802           if (out + len > alloced)
2803             alloced = out + len;
2804           result = (unsigned char *) xrealloc (result, alloced);
2805         }
2806
2807       last = cpp_spell_token (pfile, token, &result[out], 0);
2808       out = last - result;
2809
2810       token = cpp_get_token (pfile);
2811       if (token->flags & PREV_WHITE)
2812         result[out++] = ' ';
2813     }
2814
2815   result[out] = '\0';
2816   return result;
2817 }
2818
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro parameter is parenthesized so that an argument containing a
   low-precedence operator is not regrouped by the expansion.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2833
2834 /* Create a new allocation buffer.  Place the control block at the end
2835    of the buffer, so that buffer overflows will cause immediate chaos.  */
2836 static _cpp_buff *
2837 new_buff (size_t len)
2838 {
2839   _cpp_buff *result;
2840   unsigned char *base;
2841
2842   if (len < MIN_BUFF_SIZE)
2843     len = MIN_BUFF_SIZE;
2844   len = CPP_ALIGN (len);
2845
2846   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2847   result = (_cpp_buff *) (base + len);
2848   result->base = base;
2849   result->cur = base;
2850   result->limit = base + len;
2851   result->next = NULL;
2852   return result;
2853 }
2854
2855 /* Place a chain of unwanted allocation buffers on the free list.  */
2856 void
2857 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2858 {
2859   _cpp_buff *end = buff;
2860
2861   while (end->next)
2862     end = end->next;
2863   end->next = pfile->free_buffs;
2864   pfile->free_buffs = buff;
2865 }
2866
2867 /* Return a free buffer of size at least MIN_SIZE.  */
2868 _cpp_buff *
2869 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2870 {
2871   _cpp_buff *result, **p;
2872
2873   for (p = &pfile->free_buffs;; p = &(*p)->next)
2874     {
2875       size_t size;
2876
2877       if (*p == NULL)
2878         return new_buff (min_size);
2879       result = *p;
2880       size = result->limit - result->base;
2881       /* Return a buffer that's big enough, but don't waste one that's
2882          way too big.  */
2883       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2884         break;
2885     }
2886
2887   *p = result->next;
2888   result->next = NULL;
2889   result->cur = result->base;
2890   return result;
2891 }
2892
2893 /* Creates a new buffer with enough space to hold the uncommitted
2894    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2895    the excess bytes to the new buffer.  Chains the new buffer after
2896    BUFF, and returns the new buffer.  */
2897 _cpp_buff *
2898 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2899 {
2900   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2901   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2902
2903   buff->next = new_buff;
2904   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2905   return new_buff;
2906 }
2907
2908 /* Creates a new buffer with enough space to hold the uncommitted
2909    remaining bytes of the buffer pointed to by BUFF, and at least
2910    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2911    Chains the new buffer before the buffer pointed to by BUFF, and
2912    updates the pointer to point to the new buffer.  */
2913 void
2914 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2915 {
2916   _cpp_buff *new_buff, *old_buff = *pbuff;
2917   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2918
2919   new_buff = _cpp_get_buff (pfile, size);
2920   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2921   new_buff->next = old_buff;
2922   *pbuff = new_buff;
2923 }
2924
2925 /* Free a chain of buffers starting at BUFF.  */
2926 void
2927 _cpp_free_buff (_cpp_buff *buff)
2928 {
2929   _cpp_buff *next;
2930
2931   for (; buff; buff = next)
2932     {
2933       next = buff->next;
2934       free (buff->base);
2935     }
2936 }
2937
2938 /* Allocate permanent, unaligned storage of length LEN.  */
2939 unsigned char *
2940 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2941 {
2942   _cpp_buff *buff = pfile->u_buff;
2943   unsigned char *result = buff->cur;
2944
2945   if (len > (size_t) (buff->limit - result))
2946     {
2947       buff = _cpp_get_buff (pfile, len);
2948       buff->next = pfile->u_buff;
2949       pfile->u_buff = buff;
2950       result = buff->cur;
2951     }
2952
2953   buff->cur = result + len;
2954   return result;
2955 }
2956
2957 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2958    That buffer is used for growing allocations when saving macro
2959    replacement lists in a #define, and when parsing an answer to an
2960    assertion in #assert, #unassert or #if (and therefore possibly
2961    whilst expanding macros).  It therefore must not be used by any
2962    code that they might call: specifically the lexer and the guts of
2963    the macro expander.
2964
2965    All existing other uses clearly fit this restriction: storing
2966    registered pragmas during initialization.  */
2967 unsigned char *
2968 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2969 {
2970   _cpp_buff *buff = pfile->a_buff;
2971   unsigned char *result = buff->cur;
2972
2973   if (len > (size_t) (buff->limit - result))
2974     {
2975       buff = _cpp_get_buff (pfile, len);
2976       buff->next = pfile->a_buff;
2977       pfile->a_buff = buff;
2978       result = buff->cur;
2979     }
2980
2981   buff->cur = result + len;
2982   return result;
2983 }
2984
2985 /* Say which field of TOK is in use.  */
2986
2987 enum cpp_token_fld_kind
2988 cpp_token_val_index (cpp_token *tok)
2989 {
2990   switch (TOKEN_SPELL (tok))
2991     {
2992     case SPELL_IDENT:
2993       return CPP_TOKEN_FLD_NODE;
2994     case SPELL_LITERAL:
2995       return CPP_TOKEN_FLD_STR;
2996     case SPELL_OPERATOR:
2997       if (tok->type == CPP_PASTE)
2998         return CPP_TOKEN_FLD_TOKEN_NO;
2999       else
3000         return CPP_TOKEN_FLD_NONE;
3001     case SPELL_NONE:
3002       if (tok->type == CPP_MACRO_ARG)
3003         return CPP_TOKEN_FLD_ARG_NO;
3004       else if (tok->type == CPP_PADDING)
3005         return CPP_TOKEN_FLD_SOURCE;
3006       else if (tok->type == CPP_PRAGMA)
3007         return CPP_TOKEN_FLD_PRAGMA;
3008       /* else fall through */
3009     default:
3010       return CPP_TOKEN_FLD_NONE;
3011     }
3012 }
3013
3014 /* All tokens lexed in R after calling this function will be forced to have
3015    their source_location the same as the location referenced by P, until
3016    cpp_stop_forcing_token_locations is called for R.  */
3017
3018 void
3019 cpp_force_token_locations (cpp_reader *r, source_location *p)
3020 {
3021   r->forced_token_location_p = p;
3022 }
3023
3024 /* Go back to assigning locations naturally for lexed tokens.  */
3025
3026 void
3027 cpp_stop_forcing_token_locations (cpp_reader *r)
3028 {
3029   r->forced_token_location_p = NULL;
3030 }