libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010,
   3    2011, 2012 Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27 /* Spelling category of a token type, recorded for every type in
        token_spellings.  Rows produced by the OP macro use SPELL_OPERATOR;
        rows produced by TK name their category explicitly.  */
  28 enum spell_type
  29 {
  30   SPELL_OPERATOR = 0,
  31   SPELL_IDENT,
  32   SPELL_LITERAL,
  33   SPELL_NONE
  34 };
  35 /* One row of the token spelling table: the spelling category of a
        token type together with the text stored for it (the operator
        spelling for OP rows, the stringified enumerator for TK rows).  */
  36 struct token_spelling
  37 {
  38   enum spell_type category;
  39   const unsigned char *name;
  40 };
  41 /* Source spellings of the digraphs.  NOTE(review): the index order is
        presumably tied to the order of the corresponding token types in
        TTYPE_TABLE -- confirm against the users of this table.  */
  42 static const unsigned char *const digraph_spellings[] =
  43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  44 /* Build token_spellings from TTYPE_TABLE.  An OP(e, s) row is an
        operator whose spelling is the string S; a TK(e, s) row has
        category SPELL_<s> and uses the stringified enumerator E as its
        name.  */
  45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  46 #define TK(e, s) { SPELL_ ## s,    UC #e },
  47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  48 #undef OP
  49 #undef TK
  50 /* Spelling category and name of TOKEN, looked up by TOKEN->type.  */
  51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53 /* Forward declarations of the helpers private to this file.  */
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
  65
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
 116 /* Configure gives us an ifdef test.  Turn it into a value that is
        always defined so it can be used in ordinary C conditions.  */
 117 #ifndef WORDS_BIGENDIAN
 118 #define WORDS_BIGENDIAN 0
 119 #endif
 120
 121 /* We'd like the largest integer that fits into a register.  There's nothing
 122    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 123    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 124    can get the "real" word size.  */
 125 #ifdef __GNUC__
 126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 127 #else
 128 typedef unsigned long word_type;
 129 #endif
 130
 131 /* The code below is only expecting sizes 4 or 8.
 132    Die at compile-time if this expectation is violated: the array
        bound below evaluates to -1 for any other word size.  */
 133 typedef char check_word_type_size
 134   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  C is expected
        to hold the character of interest replicated into every byte, as
        built by acc_char_replicate.  The generic path may report false
        positives, which acc_char_index filters out.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
       /* MAGIC has every bit set except bits 8, 16, 24, ... and the most
          significant bit: 0x7efefeff for 4-byte words and
          0x7efefefefefefeff for 8-byte words.  */
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
       /* A byte of VAL that equals C becomes zero here, so the problem
          reduces to detecting a zero byte.  */
 178   val ^= c;
       /* Adding MAGIC carries into the clear bit above every non-zero
          byte; a clear bit that did not flip marks a (probable) zero
          byte just below it.  */
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.
        CMP is the combined acc_char_cmp result and VAL the word that was
        tested.  The returned index counts bytes in memory order, so the
        caller can add it to the address the word was loaded from.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
       /* CMP is not used on this path: re-examine each byte of VAL,
          starting with the byte at the lowest address.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
       /* No byte is interesting: acc_char_cmp gave a false positive.  */
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
     /* Return a pointer to the first \n, \r, \\ or ? at or after S.  END is
        not consulted: the loop has no bound of its own and depends on one
        of those characters being present in the buffer.  */
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
       /* The first load covers the whole aligned word containing S, so it
          may include bytes that precede S; those are cleared below.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
               /* A negative index means the comparison was a false
                  positive; keep scanning in that case.  */
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 271    Before Solaris 9 Update 6, SSE insns cannot be executed.
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
 278 /* Replicated character data to be shared between implementations.
 279    Recall that outside of a context with vector support we can't
 280    define compatible vector types, therefore these are all defined
 281    in terms of raw characters.
        Row 0 is newline, row 1 carriage return, row 2 backslash and
        row 3 question mark.  The MMX scanner reads the first 8 bytes of
        a row, the SSE2 scanner all 16.  */
 282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 283   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 284     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 285   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 286     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 287   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 288     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 289   { '?', '?', '?', '?', '?', '?', '?', '?',
 290     '?', '?', '?', '?', '?', '?', '?', '?' },
 291 };
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.

        Returns a pointer to the first \n, \r, \\ or ? at or after S.
        END is not consulted.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
       /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      8-byte block.  The Idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  */
       /* Enter the loop in the middle so that the first, partially valid
          block is tested with the partial mask; later iterations load
          the next block and accept every byte.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;
 337
 338     start:
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
        Returns a pointer to the first \n, \r, \\ or ? at or after S.
        END is not consulted.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  */
       /* The first load therefore covers the aligned 16-byte block that
          contains S, including any bytes that precede S.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The Idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  */
       /* Enter the loop in the middle so that the first, partially valid
          block is tested with the partial mask; later iterations load
          the next block and accept every byte.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
 397       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 398       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 400       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
 413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
        Returns a pointer to the first \n, \r, \\ or ? at or after S.
        END is used only to decide whether an unaligned 16-byte read at S
        is safe.  */
 414
 415 static const uchar *
 416 #ifndef __SSE4_2__
 417 __attribute__((__target__("sse4.2")))
 418 #endif
 419 search_line_sse42 (const uchar *s, const uchar *end)
 420 {
 421   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The set of characters to look for; the asm statements below pass
          its length (4) in EAX and the data length (16) in EDX.  */
 422   static const v16qi search = { '\n', '\r', '?', '\\' };
 423
 424   uintptr_t si = (uintptr_t)s;
 425   uintptr_t index;
 426
 427   /* Check for unaligned input.  */
 428   if (si & 15)
 429     {
 430       if (__builtin_expect (end - s < 16, 0)
 431           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 432         {
 433           /* There are less than 16 bytes left in the buffer, and less
 434              than 16 bytes left on the page.  Reading 16 bytes at this
 435              point might generate a spurious page fault.  Defer to the
 436              SSE2 implementation, which already handles alignment.  */
 437           return search_line_sse2 (s, end);
 438         }
 439
 440       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 441          memory need not be aligned.  */
           /* INDEX receives ECX.  Per the PCMPESTRI definition it is the
              offset of the first matching byte, or 16 when none of the
              16 bytes matched.  */
 442       __asm ("%vpcmpestri $0, (%1), %2"
 443              : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
 444       if (__builtin_expect (index < 16, 0))
 445         goto found;
 446
 447       /* Advance the pointer to an aligned address.  We will re-scan a
 448          few bytes, but we no longer need care for reading past the
 449          end of a page, since we're guaranteed a match.  */
 450       s = (const uchar *)((si + 16) & -16);
 451     }
 452
 453   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 454      in inline assembly, we can make proper use of the flags set.  */
       /* S is pre-decremented so that the add at the top of the loop
          lands on the first block.  The loop repeats while the carry
          flag is clear, which PCMPESTRI uses to signal no match.  */
 455   __asm (      "sub $16, %1\n"
 456         "       .balign 16\n"
 457         "0:     add $16, %1\n"
 458         "       %vpcmpestri $0, (%1), %2\n"
 459         "       jnc 0b"
 460         : "=&c"(index), "+r"(s)
 461         : "x"(search), "a"(4), "d"(16));
 462
 463  found:
 464   return s + index;
 465 }
 466
 467 #else
 468 /* Work around out-dated assemblers without sse4 support.  */
 469 #define search_line_sse42 search_line_sse2
 470 #endif
 471
 472 /* Check the CPU capabilities.  */
 473
 474 #include "../gcc/config/i386/cpuid.h"
 475 /* Signature shared by all line scanners, and the scanner selected by
        init_vectorized_lexer for use on this host.  */
 476 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 477 static search_line_fast_type search_line_fast;
 478
 479 #define HAVE_init_vectorized_lexer 1
 480 static inline void
 481 init_vectorized_lexer (void)
 482 {
 483   unsigned dummy, ecx = 0, edx = 0;
 484   search_line_fast_type impl = search_line_acc_char;
 485   int minimum = 0;
 486
 487 #if defined(__SSE4_2__)
 488   minimum = 3;
 489 #elif defined(__SSE2__)
 490   minimum = 2;
 491 #elif defined(__SSE__)
 492   minimum = 1;
 493 #endif
 494
 495   if (minimum == 3)
 496     impl = search_line_sse42;
 497   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 498     {
 499       if (minimum == 3 || (ecx & bit_SSE4_2))
 500         impl = search_line_sse42;
 501       else if (minimum == 2 || (edx & bit_SSE2))
 502         impl = search_line_sse2;
 503       else if (minimum == 1 || (edx & bit_SSE))
 504         impl = search_line_mmx;
 505     }
 506   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 507     {
 508       if (minimum == 1
 509           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 510         impl = search_line_mmx;
 511     }
 512
 513   search_line_fast = impl;
 514 }
 515
 516 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 517
 518 /* A version of the fast scanner using AltiVec vectorized byte compares.
        Returns a pointer to the first \n, \r, \\ or ? at or after S.
        END is not consulted.  */
 519 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 520    so we can't compile this function without -maltivec on the command line
 521    (or implied by some other switch).  */
 522
 523 static const uchar *
 524 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 525 {
 526   typedef __attribute__((altivec(vector))) unsigned char vc;
 527
 528   const vc repl_nl = {
 529     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 531   };
 532   const vc repl_cr = {
 533     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 535   };
 536   const vc repl_bs = {
 537     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 539   };
 540   const vc repl_qm = {
 541     '?', '?', '?', '?', '?', '?', '?', '?',
 542     '?', '?', '?', '?', '?', '?', '?', '?',
 543   };
 544   const vc ones = {
 545     -1, -1, -1, -1, -1, -1, -1, -1,
 546     -1, -1, -1, -1, -1, -1, -1, -1,
 547   };
 548   const vc zero = { 0 };
 549
 550   vc data, mask, t;
 551
 552   /* Altivec loads automatically mask addresses with -16.  This lets us
 553      issue the first load as early as possible.  */
 554   data = __builtin_vec_ld(0, (const vc *)s);
 555
 556   /* Discard bytes before the beginning of the buffer.  Do this by
 557      beginning with all ones and shifting in zeros according to the
 558      mis-alignment.  The LVSR instruction pulls the exact shift we
 559      want from the address.  */
 560   mask = __builtin_vec_lvsr(0, s);
 561   mask = __builtin_vec_perm(zero, ones, mask);
 562   data &= mask;
 563
 564   /* While altivec loads mask addresses, we still need to align S so
 565      that the offset we compute at the end is correct.  */
 566   s = (const uchar *)((uintptr_t)s & -16);
 567
 568   /* Main loop processing 16 bytes at a time.  */
       /* Enter the loop in the middle so that the first block, already
          loaded and masked above, is tested before S is advanced.  */
 569   goto start;
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
 574       s += 16;
 575       data = __builtin_vec_ld(0, (const vc *)s);
 576
 577     start:
 578       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 579       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 580       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 581       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 582       t = (m_nl | m_cr) | (m_bs | m_qm);
 583
 584       /* T now contains 0xff in bytes for which we matched one of the relevant
 585          characters.  We want to exit the loop if any byte in T is non-zero.
 586          Below is the expansion of vec_any_ne(t, zero).  */
 587     }
 588   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 589
 590   {
         /* Number of unsigned longs in one vector: 4 when long is 4 bytes
            wide, 2 when it is 8 bytes wide.  */
 591 #define N  (sizeof(vc) / sizeof(long))
 592
 593     union {
 594       vc v;
 595       unsigned long l[N];
 596     } u;
 597     unsigned long l, i = 0;
 598
 599     u.v = t;
 600
 601     /* Find the first word of T that is non-zero.  */
         /* S is advanced past every all-zero word.  Case 4 deliberately
            falls through into case 2 to examine the last two words.  */
 602     switch (N)
 603       {
 604       case 4:
 605         l = u.l[i++];
 606         if (l != 0)
 607           break;
 608         s += sizeof(unsigned long);
 609         l = u.l[i++];
 610         if (l != 0)
 611           break;
 612         s += sizeof(unsigned long);
 613       case 2:
 614         l = u.l[i++];
 615         if (l != 0)
 616           break;
 617         s += sizeof(unsigned long);
 618         l = u.l[i];
 619       }
 620
 621     /* L now contains 0xff in bytes for which we matched one of the
 622        relevant characters.  We can find the byte index by finding
 623        its bit index and dividing by 8.  */
         /* NOTE(review): counting leading zeros treats the most
            significant byte as the first byte in memory, i.e. this
            presumably relies on big-endian byte order -- confirm.  */
 624     l = __builtin_clzl(l) >> 3;
 625     return s + l;
 626
 627 #undef N
 628   }
 629 }
 630
 631 #elif defined (__ARM_NEON__)
 632 #include "arm_neon.h"
 633 /* A version of the fast scanner using ARM NEON vectorized byte
        compares.  Returns a pointer to the first \n, \r, \\ or ? at or
        after S.  END is not consulted.  */
 634 static const uchar *
 635 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 636 {
 637   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 638   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 639   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 640   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* Gives every byte within each 8-byte half its own bit
          (0x01, 0x02, ..., 0x80), so that matches can later be summed
          into a bit mask without carries.  */
 641   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 642
 643   unsigned int misalign, found, mask;
 644   const uint8_t *p;
 645   uint8x16_t data;
 646
 647   /* Align the source pointer.  */
 648   misalign = (uintptr_t)s & 15;
 649   p = (const uint8_t *)((uintptr_t)s & -16);
 650   data = vld1q_u8 (p);
 651
 652   /* Create a mask for the bytes that are valid within the first
 653      16-byte block.  The Idea here is that the AND with the mask
 654      within the loop is "free", since we need some AND or TEST
 655      insn in order to set the flags for the branch anyway.  */
 656   mask = (-1u << misalign) & 0xffff;
 657
 658   /* Main loop, processing 16 bytes at a time.  */
       /* Enter the loop in the middle so that the first, partially valid
          block is tested with the partial mask; later iterations load
          the next block and accept every byte.  */
 659   goto start;
 660
 661   do
 662     {
 663       uint8x8_t l;
 664       uint16x4_t m;
 665       uint32x2_t n;
 666       uint8x16_t t, u, v, w;
 667
 668       p += 16;
 669       data = vld1q_u8 (p);
 670       mask = 0xffff;
 671
 672     start:
 673       t = vceqq_u8 (data, repl_nl);
 674       u = vceqq_u8 (data, repl_cr);
 675       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 676       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
           /* Keep one distinct bit per matching byte, then add
              neighbours pairwise three times.  That leaves one 8-bit
              match mask per 8-byte half, held in the two lanes of N.  */
 677       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 678       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 679       m = vpaddl_u8 (l);
 680       n = vpaddl_u16 (m);
 681
           /* Shift the mask of the upper half down next to the mask of
              the lower half and extract the combined 16-bit mask.  */
 682       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 683               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 684       found &= mask;
 685     }
 686   while (!found);
 687
 688   /* FOUND contains 1 in bits for which we matched a relevant
 689      character.  Conversion to the byte index is trivial.  */
 690   found = __builtin_ctz (found);
 691   return (const uchar *)p + found;
 692 }
 693
 694 #else
 695
 696 /* We only have one accelerated alternative.  Use a direct call so that
 697    we encourage inlining.  */
 698
 699 #define search_line_fast  search_line_acc_char
 700
 701 #endif
 702
 703 /* Initialize the lexer if needed.  Where HAVE_init_vectorized_lexer is
        defined this calls init_vectorized_lexer, which selects the
        search_line_fast implementation; otherwise it does nothing.  */
 704
 705 void
 706 _cpp_init_lexer (void)
 707 {
 708 #ifdef HAVE_init_vectorized_lexer
 709   init_vectorized_lexer ();
 710 #endif
 711 }
 712
 713 /* Returns with a logical line that contains no escaped newlines or
 714    trigraphs.  This is a time-critical inner loop.

        The line starts at buffer->next_line of the current buffer of
        PFILE.  On return buffer->cur and buffer->line_base point at its
        start, a newline character has been stored at its end,
        buffer->next_line points just past the consumed input, and
        buffer->notes holds one note per escaped newline or trigraph
        found, followed by a sentinel note.  The buffer is edited in
        place: S is the read cursor and D the write cursor.  */
 715 void
 716 _cpp_clean_line (cpp_reader *pfile)
 717 {
 718   cpp_buffer *buffer;
 719   const uchar *s;
 720   uchar c, *d, *p;
 721
 722   buffer = pfile->buffer;
 723   buffer->cur_note = buffer->notes_used = 0;
 724   buffer->cur = buffer->line_base = buffer->next_line;
 725   buffer->need_line = false;
 726   s = buffer->next_line;
 727
 728   if (!buffer->from_stage3)
 729     {
           /* Most recent backslash seen on the fast path, if any.  */
 730       const uchar *pbackslash = NULL;
 731
 732       /* Fast path.  This is the common case of an un-escaped line with
 733          no trigraphs.  The primary win here is by not writing any
 734          data back to memory until we have to.  */
 735       while (1)
 736         {
 737           /* Perform an optimized search for \n, \r, \\, ?.  */
 738           s = search_line_fast (s, buffer->rlimit);
 739
 740           c = *s;
 741           if (c == '\\')
 742             {
 743               /* Record the location of the backslash and continue.  */
 744               pbackslash = s++;
 745             }
 746           else if (__builtin_expect (c == '?', 0))
 747             {
 748               if (__builtin_expect (s[1] == '?', false)
 749                    && _cpp_trigraph_map[s[2]])
 750                 {
 751                   /* Have a trigraph.  We may or may not have to convert
 752                      it.  Add a line note regardless, for -Wtrigraphs.  */
 753                   add_line_note (buffer, s, s[2]);
 754                   if (CPP_OPTION (pfile, trigraphs))
 755                     {
 756                       /* We do, and that means we have to switch to the
 757                          slow path.  */
                           /* Store the replacement over the first ? and
                              leave S on the last character of the
                              trigraph; the slow path pre-increments
                              both cursors.  */
 758                       d = (uchar *) s;
 759                       *d = _cpp_trigraph_map[s[2]];
 760                       s += 2;
 761                       goto slow_path;
 762                     }
 763                 }
 764               /* Not a trigraph.  Continue on fast-path.  */
 765               s++;
 766             }
 767           else
 768             break;
 769         }
 770
 771       /* This must be \r or \n.  We're either done, or we'll be forced
 772          to write back to the buffer and continue on the slow path.  */
 773       d = (uchar *) s;
 774
 775       if (__builtin_expect (s == buffer->rlimit, false))
 776         goto done;
 777
 778       /* DOS line ending? */
 779       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 780         {
 781           s++;
 782           if (s == buffer->rlimit)
 783             goto done;
 784         }
 785
 786       if (__builtin_expect (pbackslash == NULL, true))
 787         goto done;
 788
 789       /* Check for escaped newline.  */
           /* Step back over horizontal space; the newline is escaped only
              if the character before that space is the last backslash
              seen.  */
 790       p = d;
 791       while (is_nvspace (p[-1]))
 792         p--;
 793       if (p - 1 != pbackslash)
 794         goto done;
 795
 796       /* Have an escaped newline; process it and proceed to
 797          the slow path.  */
           /* The note type is a space when space separated the backslash
              from the newline.  D is set so that the next store lands on
              the backslash.  */
 798       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 799       d = p - 2;
 800       buffer->next_line = p - 1;
 801
 802     slow_path:
           /* Copy the rest of the logical line down from S to D one
              character at a time, splicing out escaped newlines and
              converting trigraphs when enabled.  */
 803       while (1)
 804         {
 805           c = *++s;
 806           *++d = c;
 807
 808           if (c == '\n' || c == '\r')
 809             {
 810               /* Handle DOS line endings.  */
 811               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 812                 s++;
 813               if (s == buffer->rlimit)
 814                 break;
 815
 816               /* Escaped?  */
                   /* buffer->next_line marks where the text copied since
                      the previous splice begins; never scan back past
                      it.  */
 817               p = d;
 818               while (p != buffer->next_line && is_nvspace (p[-1]))
 819                 p--;
 820               if (p == buffer->next_line || p[-1] != '\\')
 821                 break;
 822
 823               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 824               d = p - 2;
 825               buffer->next_line = p - 1;
 826             }
 827           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 828             {
 829               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 830               add_line_note (buffer, d, s[2]);
 831               if (CPP_OPTION (pfile, trigraphs))
 832                 {
 833                   *d = _cpp_trigraph_map[s[2]];
 834                   s += 2;
 835                 }
 836             }
 837         }
 838     }
 839   else
 840     {
           /* Input marked from_stage3 gets no trigraph or escaped-newline
              processing: just find the end of the line.  */
 841       while (*s != '\n' && *s != '\r')
 842         s++;
 843       d = (uchar *) s;
 844
 845       /* Handle DOS line endings.  */
 846       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 847         s++;
 848     }
 849
 850  done:
       /* Terminate the cleaned line at the write cursor and resume after
          the last input character consumed.  */
 851   *d = '\n';
 852   /* A sentinel note that should never be processed.  */
 853   add_line_note (buffer, d + 1, '\n');
 854   buffer->next_line = s + 1;
 855 }
 856
 857 /* Return true if the trigraph indicated by NOTE should be warned
 858    about in a comment.  */
 859 static bool
 860 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 861 {
 862   const uchar *p;
 863
 864   /* Within comments we don't warn about trigraphs, unless the
 865      trigraph forms an escaped newline, as that may change
 866      behavior.  */
 867   if (note->type != '/')
 868     return false;
 869
 870   /* If -trigraphs, then this was an escaped newline iff the next note
 871      is coincident.  */
 872   if (CPP_OPTION (pfile, trigraphs))
 873     return note[1].pos == note->pos;
 874
 875   /* Otherwise, see if this forms an escaped newline.  */
 876   p = note->pos + 3;
 877   while (is_nvspace (*p))
 878     p++;
 879
 880   /* There might have been escaped newlines between the trigraph and the
 881      newline we found.  Hence the position test.  */
 882   return (*p == '\n' && p < note[1].pos);
 883 }
 884
 885 /* Process the notes created by add_line_note as far as the current
 886    location.  Notes positioned at or before buffer->cur are consumed
        in order: escaped-newline notes advance the line bookkeeping and
        may produce diagnostics, trigraph notes may produce warnings.
        IN_COMMENT is nonzero when the caller is inside a comment, which
        suppresses some of the diagnostics.  */
 887 void
 888 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 889 {
 890   cpp_buffer *buffer = pfile->buffer;
 891
 892   for (;;)
 893     {
 894       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 895       unsigned int col;
 896
           /* Stop at the first note that lies beyond the current
              position.  */
 897       if (note->pos > buffer->cur)
 898         break;
 899
 900       buffer->cur_note++;
 901       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 902
           /* Escaped newline; a space type means space separated the
              backslash from the newline.  */
 903       if (note->type == '\\' || note->type == ' ')
 904         {
 905           if (note->type == ' ' && !in_comment)
 906             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 907                                  "backslash and newline separated by space");
 908
 909           if (buffer->next_line > buffer->rlimit)
 910             {
 911               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 912                                    "backslash-newline at end of file");
 913               /* Prevent "no newline at end of file" warning.  */
 914               buffer->next_line = buffer->rlimit;
 915             }
 916
               /* A new physical line starts at the splice point.  */
 917           buffer->line_base = note->pos;
 918           CPP_INCREMENT_LINE (pfile, 0);
 919         }
           /* Trigraph; the note type is its third character.  */
 920       else if (_cpp_trigraph_map[note->type])
 921         {
 922           if (CPP_OPTION (pfile, warn_trigraphs)
 923               && (!in_comment || warn_in_comment (pfile, note)))
 924             {
 925               if (CPP_OPTION (pfile, trigraphs))
 926                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 927                                        pfile->line_table->highest_line, col,
 928                                        "trigraph ??%c converted to %c",
 929                                        note->type,
 930                                        (int) _cpp_trigraph_map[note->type]);
 931               else
 932                 {
 933                   cpp_warning_with_line
 934                     (pfile, CPP_W_TRIGRAPHS,
 935                      pfile->line_table->highest_line, col,
 936                      "trigraph ??%c ignored, use -trigraphs to enable",
 937                      note->type);
 938                 }
 939             }
 940         }
 941       else if (note->type == 0)
 942         /* Already processed in lex_raw_string.  */;
 943       else
             /* Any other note type, including the sentinel, must never
                be reached.  */
 944         abort ();
 945     }
 946 }
 947
 948 /* Skip a C-style block comment.  We find the end of the comment by
 949    seeing if an asterisk is before every '/' we encounter.  Returns
 950    nonzero if comment terminated by EOF, zero otherwise.
 951
 952    Buffer->cur points to the initial asterisk of the comment.  */
 953 bool
 954 _cpp_skip_block_comment (cpp_reader *pfile)
 955 {
 956   cpp_buffer *buffer = pfile->buffer;
 957   const uchar *cur = buffer->cur;
 958   uchar c;
 959
 960   cur++;
 961   if (*cur == '/')
 962     cur++;
 963
 964   for (;;)
 965     {
 966       /* People like decorating comments with '*', so check for '/'
 967          instead for efficiency.  */
 968       c = *cur++;
 969
 970       if (c == '/')
 971         {
 972           if (cur[-2] == '*')
 973             break;
 974
 975           /* Warn about potential nested comments, but not if the '/'
 976              comes immediately before the true comment delimiter.
 977              Don't bother to get it right across escaped newlines.  */
 978           if (CPP_OPTION (pfile, warn_comments)
 979               && cur[0] == '*' && cur[1] != '/')
 980             {
 981               buffer->cur = cur;
 982               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 983                                      pfile->line_table->highest_line,
 984                                      CPP_BUF_COL (buffer),
 985                                      "\"/*\" within comment");
 986             }
 987         }
 988       else if (c == '\n')
 989         {
 990           unsigned int cols;
 991           buffer->cur = cur - 1;
 992           _cpp_process_line_notes (pfile, true);
 993           if (buffer->next_line >= buffer->rlimit)
 994             return true;
 995           _cpp_clean_line (pfile);
 996
 997           cols = buffer->next_line - buffer->line_base;
 998           CPP_INCREMENT_LINE (pfile, cols);
 999
1000           cur = buffer->cur;
1001         }
1002     }
1003
1004   buffer->cur = cur;
1005   _cpp_process_line_notes (pfile, true);
1006   return false;
1007 }
1008
1009 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1010    terminating newline.  Handles escaped newlines.  Returns nonzero
1011    if a multiline comment.  */
1012 static int
1013 skip_line_comment (cpp_reader *pfile)
1014 {
1015   cpp_buffer *buffer = pfile->buffer;
1016   source_location orig_line = pfile->line_table->highest_line;
1017
1018   while (*buffer->cur != '\n')
1019     buffer->cur++;
1020
1021   _cpp_process_line_notes (pfile, true);
1022   return orig_line != pfile->line_table->highest_line;
1023 }
1024
1025 /* Skips whitespace, saving the next non-whitespace character.  */
1026 static void
1027 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1028 {
1029   cpp_buffer *buffer = pfile->buffer;
1030   bool saw_NUL = false;
1031
1032   do
1033     {
1034       /* Horizontal space always OK.  */
1035       if (c == ' ' || c == '\t')
1036         ;
1037       /* Just \f \v or \0 left.  */
1038       else if (c == '\0')
1039         saw_NUL = true;
1040       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1041         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1042                              CPP_BUF_COL (buffer),
1043                              "%s in preprocessing directive",
1044                              c == '\f' ? "form feed" : "vertical tab");
1045
1046       c = *buffer->cur++;
1047     }
1048   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1049   while (is_nvspace (c));
1050
1051   if (saw_NUL)
1052     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1053
1054   buffer->cur--;
1055 }
1056
1057 /* See if the characters of a number token are valid in a name (no
1058    '.', '+' or '-').  */
1059 static int
1060 name_p (cpp_reader *pfile, const cpp_string *string)
1061 {
1062   unsigned int i;
1063
1064   for (i = 0; i < string->len; i++)
1065     if (!is_idchar (string->text[i]))
1066       return 0;
1067
1068   return 1;
1069 }
1070
1071 /* After parsing an identifier or other sequence, produce a warning about
1072    sequences not in NFC/NFKC.  */
1073 static void
1074 warn_about_normalization (cpp_reader *pfile,
1075                           const cpp_token *token,
1076                           const struct normalize_state *s)
1077 {
1078   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1079       && !pfile->state.skipping)
1080     {
1081       /* Make sure that the token is printed using UCNs, even
1082          if we'd otherwise happily print UTF-8.  */
1083       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1084       size_t sz;
1085
1086       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1087       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1088         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1089                                "`%.*s' is not in NFKC", (int) sz, buf);
1090       else
1091         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1092                                "`%.*s' is not in NFC", (int) sz, buf);
1093     }
1094 }
1095
1096 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1097    an identifier.  FIRST is TRUE if this starts an identifier.  */
1098 static bool
1099 forms_identifier_p (cpp_reader *pfile, int first,
1100                     struct normalize_state *state)
1101 {
1102   cpp_buffer *buffer = pfile->buffer;
1103
1104   if (*buffer->cur == '$')
1105     {
1106       if (!CPP_OPTION (pfile, dollars_in_ident))
1107         return false;
1108
1109       buffer->cur++;
1110       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1111         {
1112           CPP_OPTION (pfile, warn_dollars) = 0;
1113           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1114         }
1115
1116       return true;
1117     }
1118
1119   /* Is this a syntactically valid UCN?  */
1120   if (CPP_OPTION (pfile, extended_identifiers)
1121       && *buffer->cur == '\\'
1122       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1123     {
1124       buffer->cur += 2;
1125       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1126                           state))
1127         return true;
1128       buffer->cur -= 2;
1129     }
1130
1131   return false;
1132 }
1133
1134 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1135 static cpp_hashnode *
1136 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1137 {
1138   cpp_hashnode *result;
1139   const uchar *cur;
1140   unsigned int len;
1141   unsigned int hash = HT_HASHSTEP (0, *base);
1142
1143   cur = base + 1;
1144   while (ISIDNUM (*cur))
1145     {
1146       hash = HT_HASHSTEP (hash, *cur);
1147       cur++;
1148     }
1149   len = cur - base;
1150   hash = HT_HASHFINISH (hash, len);
1151   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1152                                               base, len, hash, HT_ALLOC));
1153
1154   /* Rarely, identifiers require diagnostics when lexed.  */
1155   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1156                         && !pfile->state.skipping, 0))
1157     {
1158       /* It is allowed to poison the same identifier twice.  */
1159       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1160         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1161                    NODE_NAME (result));
1162
1163       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1164          replacement list of a variadic macro.  */
1165       if (result == pfile->spec_nodes.n__VA_ARGS__
1166           && !pfile->state.va_args_ok)
1167         cpp_error (pfile, CPP_DL_PEDWARN,
1168                    "__VA_ARGS__ can only appear in the expansion"
1169                    " of a C99 variadic macro");
1170
1171       /* For -Wc++-compat, warn about use of C++ named operators.  */
1172       if (result->flags & NODE_WARN_OPERATOR)
1173         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1174                      "identifier \"%s\" is a special operator name in C++",
1175                      NODE_NAME (result));
1176     }
1177
1178   return result;
1179 }
1180
1181 /* Get the cpp_hashnode of an identifier specified by NAME in
1182    the current cpp_reader object.  If none is found, NULL is returned.  */
1183 cpp_hashnode *
1184 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1185 {
1186   cpp_hashnode *result;
1187   result = lex_identifier_intern (pfile, (uchar *) name);
1188   return result;
1189 }
1190
1191 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1192 static cpp_hashnode *
1193 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1194                 struct normalize_state *nst)
1195 {
1196   cpp_hashnode *result;
1197   const uchar *cur;
1198   unsigned int len;
1199   unsigned int hash = HT_HASHSTEP (0, *base);
1200
1201   cur = pfile->buffer->cur;
1202   if (! starts_ucn)
1203     while (ISIDNUM (*cur))
1204       {
1205         hash = HT_HASHSTEP (hash, *cur);
1206         cur++;
1207       }
1208   pfile->buffer->cur = cur;
1209   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1210     {
1211       /* Slower version for identifiers containing UCNs (or $).  */
1212       do {
1213         while (ISIDNUM (*pfile->buffer->cur))
1214           {
1215             pfile->buffer->cur++;
1216             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1217           }
1218       } while (forms_identifier_p (pfile, false, nst));
1219       result = _cpp_interpret_identifier (pfile, base,
1220                                           pfile->buffer->cur - base);
1221     }
1222   else
1223     {
1224       len = cur - base;
1225       hash = HT_HASHFINISH (hash, len);
1226
1227       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1228                                                   base, len, hash, HT_ALLOC));
1229     }
1230
1231   /* Rarely, identifiers require diagnostics when lexed.  */
1232   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1233                         && !pfile->state.skipping, 0))
1234     {
1235       /* It is allowed to poison the same identifier twice.  */
1236       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1237         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1238                    NODE_NAME (result));
1239
1240       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1241          replacement list of a variadic macro.  */
1242       if (result == pfile->spec_nodes.n__VA_ARGS__
1243           && !pfile->state.va_args_ok)
1244         cpp_error (pfile, CPP_DL_PEDWARN,
1245                    "__VA_ARGS__ can only appear in the expansion"
1246                    " of a C99 variadic macro");
1247
1248       /* For -Wc++-compat, warn about use of C++ named operators.  */
1249       if (result->flags & NODE_WARN_OPERATOR)
1250         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1251                      "identifier \"%s\" is a special operator name in C++",
1252                      NODE_NAME (result));
1253     }
1254
1255   return result;
1256 }
1257
1258 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1259 static void
1260 lex_number (cpp_reader *pfile, cpp_string *number,
1261             struct normalize_state *nst)
1262 {
1263   const uchar *cur;
1264   const uchar *base;
1265   uchar *dest;
1266
1267   base = pfile->buffer->cur - 1;
1268   do
1269     {
1270       cur = pfile->buffer->cur;
1271
1272       /* N.B. ISIDNUM does not include $.  */
1273       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1274         {
1275           cur++;
1276           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1277         }
1278
1279       pfile->buffer->cur = cur;
1280     }
1281   while (forms_identifier_p (pfile, false, nst));
1282
1283   number->len = cur - base;
1284   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1285   memcpy (dest, base, number->len);
1286   dest[number->len] = '\0';
1287   number->text = dest;
1288 }
1289
1290 /* Create a token of type TYPE with a literal spelling.  */
1291 static void
1292 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1293                 unsigned int len, enum cpp_ttype type)
1294 {
1295   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1296
1297   memcpy (dest, base, len);
1298   dest[len] = '\0';
1299   token->type = type;
1300   token->val.str.len = len;
1301   token->val.str.text = dest;
1302 }
1303
1304 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1305    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1306
1307 static void
1308 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1309                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1310 {
1311   _cpp_buff *first_buff = *first_buff_p;
1312   _cpp_buff *last_buff = *last_buff_p;
1313
1314   if (first_buff == NULL)
1315     first_buff = last_buff = _cpp_get_buff (pfile, len);
1316   else if (len > BUFF_ROOM (last_buff))
1317     {
1318       size_t room = BUFF_ROOM (last_buff);
1319       memcpy (BUFF_FRONT (last_buff), base, room);
1320       BUFF_FRONT (last_buff) += room;
1321       base += room;
1322       len -= room;
1323       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1324     }
1325
1326   memcpy (BUFF_FRONT (last_buff), base, len);
1327   BUFF_FRONT (last_buff) += len;
1328
1329   *first_buff_p = first_buff;
1330   *last_buff_p = last_buff;
1331 }
1332
1333 /* Lexes a raw string.  The stored string contains the spelling, including
1334    double quotes, delimiter string, '(' and ')', any leading
1335    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1336    literal, or CPP_OTHER if it was not properly terminated.
1337
1338    The spelling is NUL-terminated, but it is not guaranteed that this
1339    is the first NUL since embedded NULs are preserved.  */
1340
1341 static void
1342 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1343                 const uchar *cur)
1344 {
1345   const uchar *raw_prefix;
1346   unsigned int raw_prefix_len = 0;
1347   enum cpp_ttype type;
1348   size_t total_len = 0;
1349   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1350   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1351
1352   type = (*base == 'L' ? CPP_WSTRING :
1353           *base == 'U' ? CPP_STRING32 :
1354           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1355           : CPP_STRING);
1356
1357   raw_prefix = cur + 1;
1358   while (raw_prefix_len < 16)
1359     {
1360       switch (raw_prefix[raw_prefix_len])
1361         {
1362         case ' ': case '(': case ')': case '\\': case '\t':
1363         case '\v': case '\f': case '\n': default:
1364           break;
1365         /* Basic source charset except the above chars.  */
1366         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1367         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1368         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1369         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1370         case 'y': case 'z':
1371         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1372         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1373         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1374         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1375         case 'Y': case 'Z':
1376         case '0': case '1': case '2': case '3': case '4': case '5':
1377         case '6': case '7': case '8': case '9':
1378         case '_': case '{': case '}': case '#': case '[': case ']':
1379         case '<': case '>': case '%': case ':': case ';': case '.':
1380         case '?': case '*': case '+': case '-': case '/': case '^':
1381         case '&': case '|': case '~': case '!': case '=': case ',':
1382         case '"': case '\'':
1383           raw_prefix_len++;
1384           continue;
1385         }
1386       break;
1387     }
1388
1389   if (raw_prefix[raw_prefix_len] != '(')
1390     {
1391       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1392                 + 1;
1393       if (raw_prefix_len == 16)
1394         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1395                              "raw string delimiter longer than 16 characters");
1396       else
1397         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1398                              "invalid character '%c' in raw string delimiter",
1399                              (int) raw_prefix[raw_prefix_len]);
1400       pfile->buffer->cur = raw_prefix - 1;
1401       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1402       return;
1403     }
1404
1405   cur = raw_prefix + raw_prefix_len + 1;
1406   for (;;)
1407     {
1408 #define BUF_APPEND(STR,LEN)                                     \
1409       do {                                                      \
1410         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1411                         &first_buff, &last_buff);               \
1412         total_len += (LEN);                                     \
1413       } while (0);
1414
1415       cppchar_t c;
1416
1417       /* If we previously performed any trigraph or line splicing
1418          transformations, undo them within the body of the raw string.  */
1419       while (note->pos < cur)
1420         ++note;
1421       for (; note->pos == cur; ++note)
1422         {
1423           switch (note->type)
1424             {
1425             case '\\':
1426             case ' ':
1427               /* Restore backslash followed by newline.  */
1428               BUF_APPEND (base, cur - base);
1429               base = cur;
1430               BUF_APPEND ("\\", 1);
1431             after_backslash:
1432               if (note->type == ' ')
1433                 {
1434                   /* GNU backslash whitespace newline extension.  FIXME
1435                      could be any sequence of non-vertical space.  When we
1436                      can properly restore any such sequence, we should mark
1437                      this note as handled so _cpp_process_line_notes
1438                      doesn't warn.  */
1439                   BUF_APPEND (" ", 1);
1440                 }
1441
1442               BUF_APPEND ("\n", 1);
1443               break;
1444
1445             case 0:
1446               /* Already handled.  */
1447               break;
1448
1449             default:
1450               if (_cpp_trigraph_map[note->type])
1451                 {
1452                   /* Don't warn about this trigraph in
1453                      _cpp_process_line_notes, since trigraphs show up as
1454                      trigraphs in raw strings.  */
1455                   uchar type = note->type;
1456                   note->type = 0;
1457
1458                   if (!CPP_OPTION (pfile, trigraphs))
1459                     /* If we didn't convert the trigraph in the first
1460                        place, don't do anything now either.  */
1461                     break;
1462
1463                   BUF_APPEND (base, cur - base);
1464                   base = cur;
1465                   BUF_APPEND ("??", 2);
1466
1467                   /* ??/ followed by newline gets two line notes, one for
1468                      the trigraph and one for the backslash/newline.  */
1469                   if (type == '/' && note[1].pos == cur)
1470                     {
1471                       if (note[1].type != '\\'
1472                           && note[1].type != ' ')
1473                         abort ();
1474                       BUF_APPEND ("/", 1);
1475                       ++note;
1476                       goto after_backslash;
1477                     }
1478                   /* The ) from ??) could be part of the suffix.  */
1479                   else if (type == ')'
1480                            && strncmp ((const char *) cur+1,
1481                                        (const char *) raw_prefix,
1482                                        raw_prefix_len) == 0
1483                            && cur[raw_prefix_len+1] == '"')
1484                     {
1485                       BUF_APPEND (")", 1);
1486                       base++;
1487                       cur += raw_prefix_len + 2;
1488                       goto break_outer_loop;
1489                     }
1490                   else
1491                     {
1492                       /* Skip the replacement character.  */
1493                       base = ++cur;
1494                       BUF_APPEND (&type, 1);
1495                     }
1496                 }
1497               else
1498                 abort ();
1499               break;
1500             }
1501         }
1502       c = *cur++;
1503
1504       if (c == ')'
1505           && strncmp ((const char *) cur, (const char *) raw_prefix,
1506                       raw_prefix_len) == 0
1507           && cur[raw_prefix_len] == '"')
1508         {
1509           cur += raw_prefix_len + 1;
1510           break;
1511         }
1512       else if (c == '\n')
1513         {
1514           if (pfile->state.in_directive
1515               || pfile->state.parsing_args
1516               || pfile->state.in_deferred_pragma)
1517             {
1518               cur--;
1519               type = CPP_OTHER;
1520               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1521                                    "unterminated raw string");
1522               break;
1523             }
1524
1525           BUF_APPEND (base, cur - base);
1526
1527           if (pfile->buffer->cur < pfile->buffer->rlimit)
1528             CPP_INCREMENT_LINE (pfile, 0);
1529           pfile->buffer->need_line = true;
1530
1531           pfile->buffer->cur = cur-1;
1532           _cpp_process_line_notes (pfile, false);
1533           if (!_cpp_get_fresh_line (pfile))
1534             {
1535               source_location src_loc = token->src_loc;
1536               token->type = CPP_EOF;
1537               /* Tell the compiler the line number of the EOF token.  */
1538               token->src_loc = pfile->line_table->highest_line;
1539               token->flags = BOL;
1540               if (first_buff != NULL)
1541                 _cpp_release_buff (pfile, first_buff);
1542               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1543                                    "unterminated raw string");
1544               return;
1545             }
1546
1547           cur = base = pfile->buffer->cur;
1548           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1549         }
1550     }
1551  break_outer_loop:
1552
1553   if (CPP_OPTION (pfile, user_literals))
1554     {
1555       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1556          underscore is ill-formed.  Since this breaks programs using macros
1557          from inttypes.h, we generate a warning and treat the ud-suffix as a
1558          separate preprocessing token.  This approach is under discussion by
1559          the standards committee, and has been adopted as a conforming
1560          extension by other front ends such as clang. */
1561       if (ISALPHA (*cur))
1562         {
1563           /* Raise a warning, but do not consume subsequent tokens.  */
1564           if (CPP_OPTION (pfile, warn_literal_suffix))
1565             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1566                                    token->src_loc, 0,
1567                                    "invalid suffix on literal; C++11 requires "
1568                                    "a space between literal and identifier");
1569         }
1570       /* Grab user defined literal suffix.  */
1571       else if (*cur == '_')
1572         {
1573           type = cpp_userdef_string_add_type (type);
1574           ++cur;
1575
1576           while (ISIDNUM (*cur))
1577             ++cur;
1578         }
1579     }
1580
1581   pfile->buffer->cur = cur;
1582   if (first_buff == NULL)
1583     create_literal (pfile, token, base, cur - base, type);
1584   else
1585     {
1586       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1587
1588       token->type = type;
1589       token->val.str.len = total_len + (cur - base);
1590       token->val.str.text = dest;
1591       last_buff = first_buff;
1592       while (last_buff != NULL)
1593         {
1594           memcpy (dest, last_buff->base,
1595                   BUFF_FRONT (last_buff) - last_buff->base);
1596           dest += BUFF_FRONT (last_buff) - last_buff->base;
1597           last_buff = last_buff->next;
1598         }
1599       _cpp_release_buff (pfile, first_buff);
1600       memcpy (dest, base, cur - base);
1601       dest[cur - base] = '\0';
1602     }
1603 }
1604
1605 /* Lexes a string, character constant, or angle-bracketed header file
1606    name.  The stored string contains the spelling, including opening
1607    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1608    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1609    if it was not properly terminated, or CPP_LESS for an unterminated
1610    header name which must be relexed as normal tokens.
1611
1612    The spelling is NUL-terminated, but it is not guaranteed that this
1613    is the first NUL since embedded NULs are preserved.  */
1614 static void
1615 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1616 {
1617   bool saw_NUL = false;
1618   const uchar *cur;
1619   cppchar_t terminator;
1620   enum cpp_ttype type;
1621
1622   cur = base;
1623   terminator = *cur++;
1624   if (terminator == 'L' || terminator == 'U')
1625     terminator = *cur++;
1626   else if (terminator == 'u')
1627     {
1628       terminator = *cur++;
1629       if (terminator == '8')
1630         terminator = *cur++;
1631     }
1632   if (terminator == 'R')
1633     {
1634       lex_raw_string (pfile, token, base, cur);
1635       return;
1636     }
1637   if (terminator == '"')
1638     type = (*base == 'L' ? CPP_WSTRING :
1639             *base == 'U' ? CPP_STRING32 :
1640             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1641                          : CPP_STRING);
1642   else if (terminator == '\'')
1643     type = (*base == 'L' ? CPP_WCHAR :
1644             *base == 'U' ? CPP_CHAR32 :
1645             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1646   else
1647     terminator = '>', type = CPP_HEADER_NAME;
1648
1649   for (;;)
1650     {
1651       cppchar_t c = *cur++;
1652
1653       /* In #include-style directives, terminators are not escapable.  */
1654       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1655         cur++;
1656       else if (c == terminator)
1657         break;
1658       else if (c == '\n')
1659         {
1660           cur--;
1661           /* Unmatched quotes always yield undefined behavior, but
1662              greedy lexing means that what appears to be an unterminated
1663              header name may actually be a legitimate sequence of tokens.  */
1664           if (terminator == '>')
1665             {
1666               token->type = CPP_LESS;
1667               return;
1668             }
1669           type = CPP_OTHER;
1670           break;
1671         }
1672       else if (c == '\0')
1673         saw_NUL = true;
1674     }
1675
1676   if (saw_NUL && !pfile->state.skipping)
1677     cpp_error (pfile, CPP_DL_WARNING,
1678                "null character(s) preserved in literal");
1679
1680   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1681     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1682                (int) terminator);
1683
1684   if (CPP_OPTION (pfile, user_literals))
1685     {
1686       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1687          underscore is ill-formed.  Since this breaks programs using macros
1688          from inttypes.h, we generate a warning and treat the ud-suffix as a
1689          separate preprocessing token.  This approach is under discussion by
1690          the standards committee, and has been adopted as a conforming
1691          extension by other front ends such as clang. */
1692       if (ISALPHA (*cur))
1693         {
1694           /* Raise a warning, but do not consume subsequent tokens.  */
1695           if (CPP_OPTION (pfile, warn_literal_suffix))
1696             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1697                                    token->src_loc, 0,
1698                                    "invalid suffix on literal; C++11 requires "
1699                                    "a space between literal and identifier");
1700         }
1701       /* Grab user defined literal suffix.  */
1702       else if (*cur == '_')
1703         {
1704           type = cpp_userdef_char_add_type (type);
1705           type = cpp_userdef_string_add_type (type);
1706           ++cur;
1707
1708           while (ISIDNUM (*cur))
1709             ++cur;
1710         }
1711     }
1712
1713   pfile->buffer->cur = cur;
1714   create_literal (pfile, token, base, cur - base, type);
1715 }
1716
1717 /* Return the comment table. The client may not make any assumption
1718    about the ordering of the table.  */
1719 cpp_comment_table *
1720 cpp_get_comments (cpp_reader *pfile)
1721 {
1722   return &pfile->comments;
1723 }
1724
1725 /* Append a comment to the end of the comment table. */
1726 static void
1727 store_comment (cpp_reader *pfile, cpp_token *token)
1728 {
1729   int len;
1730
1731   if (pfile->comments.allocated == 0)
1732     {
1733       pfile->comments.allocated = 256;
1734       pfile->comments.entries = (cpp_comment *) xmalloc
1735         (pfile->comments.allocated * sizeof (cpp_comment));
1736     }
1737
1738   if (pfile->comments.count == pfile->comments.allocated)
1739     {
1740       pfile->comments.allocated *= 2;
1741       pfile->comments.entries = (cpp_comment *) xrealloc
1742         (pfile->comments.entries,
1743          pfile->comments.allocated * sizeof (cpp_comment));
1744     }
1745
1746   len = token->val.str.len;
1747
1748   /* Copy comment. Note, token may not be NULL terminated. */
1749   pfile->comments.entries[pfile->comments.count].comment =
1750     (char *) xmalloc (sizeof (char) * (len + 1));
1751   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1752           token->val.str.text, len);
1753   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1754
1755   /* Set source location. */
1756   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1757
1758   /* Increment the count of entries in the comment table. */
1759   pfile->comments.count++;
1760 }
1761
1762 /* The stored comment includes the comment start and any terminator.  */
1763 static void
1764 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1765               cppchar_t type)
1766 {
1767   unsigned char *buffer;
1768   unsigned int len, clen, i;
1769
1770   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1771
1772   /* C++ comments probably (not definitely) have moved past a new
1773      line, which we don't want to save in the comment.  */
1774   if (is_vspace (pfile->buffer->cur[-1]))
1775     len--;
1776
1777   /* If we are currently in a directive or in argument parsing, then
1778      we need to store all C++ comments as C comments internally, and
1779      so we need to allocate a little extra space in that case.
1780
1781      Note that the only time we encounter a directive here is
1782      when we are saving comments in a "#define".  */
1783   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1784           && type == '/') ? len + 2 : len;
1785
1786   buffer = _cpp_unaligned_alloc (pfile, clen);
1787
1788   token->type = CPP_COMMENT;
1789   token->val.str.len = clen;
1790   token->val.str.text = buffer;
1791
1792   buffer[0] = '/';
1793   memcpy (buffer + 1, from, len - 1);
1794
1795   /* Finish conversion to a C comment, if necessary.  */
1796   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1797     {
1798       buffer[1] = '*';
1799       buffer[clen - 2] = '*';
1800       buffer[clen - 1] = '/';
1801       /* As there can be in a C++ comments illegal sequences for C comments
1802          we need to filter them out.  */
1803       for (i = 2; i < (clen - 2); i++)
1804         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1805           buffer[i] = '|';
1806     }
1807
1808   /* Finally store this comment for use by clients of libcpp. */
1809   store_comment (pfile, token);
1810 }
1811
1812 /* Allocate COUNT tokens for RUN.  */
1813 void
1814 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1815 {
1816   run->base = XNEWVEC (cpp_token, count);
1817   run->limit = run->base + count;
1818   run->next = NULL;
1819 }
1820
1821 /* Returns the next tokenrun, or creates one if there is none.  */
1822 static tokenrun *
1823 next_tokenrun (tokenrun *run)
1824 {
1825   if (run->next == NULL)
1826     {
1827       run->next = XNEW (tokenrun);
1828       run->next->prev = run;
1829       _cpp_init_tokenrun (run->next, 250);
1830     }
1831
1832   return run->next;
1833 }
1834
1835 /* Return the number of not yet processed token in a given
1836    context.  */
1837 int
1838 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1839 {
1840   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1841     return (LAST (context).token - FIRST (context).token);
1842   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1843            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1844     return (LAST (context).ptoken - FIRST (context).ptoken);
1845   else
1846       abort ();
1847 }
1848
1849 /* Returns the token present at index INDEX in a given context.  If
1850    INDEX is zero, the next token to be processed is returned.  */
1851 static const cpp_token*
1852 _cpp_token_from_context_at (cpp_context *context, int index)
1853 {
1854   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1855     return &(FIRST (context).token[index]);
1856   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1857            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1858     return FIRST (context).ptoken[index];
1859  else
1860    abort ();
1861 }
1862
1863 /* Look ahead in the input stream.  */
1864 const cpp_token *
1865 cpp_peek_token (cpp_reader *pfile, int index)
1866 {
1867   cpp_context *context = pfile->context;
1868   const cpp_token *peektok;
1869   int count;
1870
1871   /* First, scan through any pending cpp_context objects.  */
1872   while (context->prev)
1873     {
1874       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1875
1876       if (index < (int) sz)
1877         return _cpp_token_from_context_at (context, index);
1878       index -= (int) sz;
1879       context = context->prev;
1880     }
1881
1882   /* We will have to read some new tokens after all (and do so
1883      without invalidating preceding tokens).  */
1884   count = index;
1885   pfile->keep_tokens++;
1886
1887   do
1888     {
1889       peektok = _cpp_lex_token (pfile);
1890       if (peektok->type == CPP_EOF)
1891         return peektok;
1892     }
1893   while (index--);
1894
1895   _cpp_backup_tokens_direct (pfile, count + 1);
1896   pfile->keep_tokens--;
1897
1898   return peektok;
1899 }
1900
1901 /* Allocate a single token that is invalidated at the same time as the
1902    rest of the tokens on the line.  Has its line and col set to the
1903    same as the last lexed token, so that diagnostics appear in the
1904    right place.
     
        The new token takes the slot at pfile->cur_token, which is then
        advanced.  Any lookahead tokens stored from that slot onwards are
        first moved up by one position so that they are preserved.  */
1905 cpp_token *
1906 _cpp_temp_token (cpp_reader *pfile)
1907 {
1908   cpp_token *old, *result;
       /* SZ: slots left between cur_token and the end of the current run.
          LA: number of pending lookahead tokens.  */
1909   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1910   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1911
       /* The most recently lexed token supplies the source location.  */
1912   old = pfile->cur_token - 1;
1913   /* Any pre-existing lookaheads must not be clobbered.  */
1914   if (la)
1915     {
1916       if (sz <= la)
1917         {
               /* Moving up by one overflows the current run.  Open a slot
                  at the base of the next run (shifting the LA - SZ tokens
                  already there) and copy the last token of the current
                  run into it.  */
1918           tokenrun *next = next_tokenrun (pfile->cur_run);
1919
1920           if (sz < la)
1921             memmove (next->base + 1, next->base,
1922                      (la - sz) * sizeof (cpp_token));
1923
1924           next->base[0] = pfile->cur_run->limit[-1];
1925         }
1926
           /* Move the tokens that stay within the current run up by one
              slot.  */
1927       if (sz > 1)
1928         memmove (pfile->cur_token + 1, pfile->cur_token,
1929                  MIN (la, sz - 1) * sizeof (cpp_token));
1930     }
1931
       /* The current run is full: continue in the next one.  */
1932   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1933     {
1934       pfile->cur_run = next_tokenrun (pfile->cur_run);
1935       pfile->cur_token = pfile->cur_run->base;
1936     }
1937
1938   result = pfile->cur_token++;
1939   result->src_loc = old->src_loc;
1940   return result;
1941 }
1942
1943 /* Lex a token into RESULT (external interface).  Takes care of issues
1944    like directive handling, token lookahead, multiple include
1945    optimization and skipping.
     
        Returns the next token: either a previously backed-up lookahead, a
        token freshly produced by _cpp_lex_direct, or
        pfile->directive_result when a directive or deferred pragma
        supplied one.  While pfile->state.skipping is set and we are not
        inside a directive, tokens other than CPP_EOF are discarded and
        lexing continues.  */
1946 const cpp_token *
1947 _cpp_lex_token (cpp_reader *pfile)
1948 {
1949   cpp_token *result;
1950
1951   for (;;)
1952     {
           /* Step into the next token run when the current one is
              exhausted.  */
1953       if (pfile->cur_token == pfile->cur_run->limit)
1954         {
1955           pfile->cur_run = next_tokenrun (pfile->cur_run);
1956           pfile->cur_token = pfile->cur_run->base;
1957         }
1958       /* We assume that the current token is somewhere in the current
1959          run.  */
1960       if (pfile->cur_token < pfile->cur_run->base
1961           || pfile->cur_token >= pfile->cur_run->limit)
1962         abort ();
1963
           /* Hand back a backed-up token if there is one; otherwise lex a
              new one.  */
1964       if (pfile->lookaheads)
1965         {
1966           pfile->lookaheads--;
1967           result = pfile->cur_token++;
1968         }
1969       else
1970         result = _cpp_lex_direct (pfile);
1971
           /* BOL marks the first token of a line, the only place where a
              directive can begin.  */
1972       if (result->flags & BOL)
1973         {
1974           /* Is this a directive.  If _cpp_handle_directive returns
1975              false, it is an assembler #.  */
1976           if (result->type == CPP_HASH
1977               /* 6.10.3 p 11: Directives in a list of macro arguments
1978                  gives undefined behavior.  This implementation
1979                  handles the directive as normal.  */
1980               && pfile->state.parsing_args != 1)
1981             {
1982               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1983                 {
                       /* A CPP_PADDING result means the directive produced
                          no token: go round again for the next one.  */
1984                   if (pfile->directive_result.type == CPP_PADDING)
1985                     continue;
1986                   result = &pfile->directive_result;
1987                 }
1988             }
1989           else if (pfile->state.in_deferred_pragma)
1990             result = &pfile->directive_result;
1991
               /* Notify the client of the new line unless tokens are being
                  skipped.  */
1992           if (pfile->cb.line_change && !pfile->state.skipping)
1993             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1994         }
1995
1996       /* We don't skip tokens in directives.  */
1997       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1998         break;
1999
2000       /* Outside a directive, invalidate controlling macros.  At file
2001          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2002          get here and MI optimization works.  */
2003       pfile->mi_valid = false;
2004
           /* Deliver the token unless it is being skipped; CPP_EOF is
              always delivered.  */
2005       if (!pfile->state.skipping || result->type == CPP_EOF)
2006         break;
2007     }
2008
2009   return result;
2010 }
2011
2012 /* Returns true if a fresh line has been loaded.  */
2013 bool
2014 _cpp_get_fresh_line (cpp_reader *pfile)
2015 {
2016   int return_at_eof;
2017
2018   /* We can't get a new line until we leave the current directive.  */
2019   if (pfile->state.in_directive)
2020     return false;
2021
2022   for (;;)
2023     {
2024       cpp_buffer *buffer = pfile->buffer;
2025
2026       if (!buffer->need_line)
2027         return true;
2028
2029       if (buffer->next_line < buffer->rlimit)
2030         {
2031           _cpp_clean_line (pfile);
2032           return true;
2033         }
2034
2035       /* First, get out of parsing arguments state.  */
2036       if (pfile->state.parsing_args)
2037         return false;
2038
2039       /* End of buffer.  Non-empty files should end in a newline.  */
2040       if (buffer->buf != buffer->rlimit
2041           && buffer->next_line > buffer->rlimit
2042           && !buffer->from_stage3)
2043         {
2044           /* Clip to buffer size.  */
2045           buffer->next_line = buffer->rlimit;
2046         }
2047
2048       return_at_eof = buffer->return_at_eof;
2049       _cpp_pop_buffer (pfile);
2050       if (pfile->buffer == NULL || return_at_eof)
2051         return false;
2052     }
2053 }
2054
/* Helper for two-character operators.  If the next input character is
   CHAR, consume it and give the token type THEN_TYPE; otherwise the
   token gets ELSE_TYPE.  Relies on `buffer' and `result' being in
   scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2063
2064 /* Lex a token into pfile->cur_token, which is also incremented, to
2065    get diagnostics pointing to the correct location.
2066
2067    Does not handle issues such as token lookahead, multiple-include
2068    optimization, directives, skipping etc.  This function is only
2069    suitable for use by _cpp_lex_token, and in special cases like
2070    lex_expansion_token which doesn't care for any of these issues.
2071
2072    When meeting a newline, returns CPP_EOF if parsing a directive,
2073    otherwise returns to the start of the token buffer if permissible.
2074    Returns the location of the lexed token.
     
        The returned pointer is the token slot that was filled in.  It is
        the slot taken from pfile->cur_token on entry, unless a fresh line
        was started with pfile->keep_tokens zero, in which case it is the
        first slot of pfile->base_run.  */
2075 cpp_token *
2076 _cpp_lex_direct (cpp_reader *pfile)
2077 {
2078   cppchar_t c;
2079   cpp_buffer *buffer;
2080   const unsigned char *comment_start;
2081   cpp_token *result = pfile->cur_token++;
2082
       /* Entered initially and again after each newline character.  */
2083  fresh_line:
2084   result->flags = 0;
2085   buffer = pfile->buffer;
2086   if (buffer->need_line)
2087     {
           /* The end of a line ends a deferred pragma.  */
2088       if (pfile->state.in_deferred_pragma)
2089         {
2090           result->type = CPP_PRAGMA_EOL;
2091           pfile->state.in_deferred_pragma = false;
2092           if (!pfile->state.pragma_allow_expansion)
2093             pfile->state.prevent_expansion--;
2094           return result;
2095         }
2096       if (!_cpp_get_fresh_line (pfile))
2097         {
2098           result->type = CPP_EOF;
2099           if (!pfile->state.in_directive)
2100             {
2101               /* Tell the compiler the line number of the EOF token.  */
2102               result->src_loc = pfile->line_table->highest_line;
2103               result->flags = BOL;
2104             }
2105           return result;
2106         }
           /* Nothing asked for earlier tokens to be kept, so start again
              at the first slot of the base token run.  */
2107       if (!pfile->keep_tokens)
2108         {
2109           pfile->cur_run = &pfile->base_run;
2110           result = pfile->base_run.base;
2111           pfile->cur_token = result + 1;
2112         }
2113       result->flags = BOL;
2114       if (pfile->state.parsing_args == 2)
2115         result->flags |= PREV_WHITE;
2116     }
       /* _cpp_get_fresh_line may have popped buffers; reload.  */
2117   buffer = pfile->buffer;
2118  update_tokens_line:
2119   result->src_loc = pfile->line_table->highest_line;
2120
2121  skipped_white:
2122   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2123       && !pfile->overlaid_buffer)
2124     {
2125       _cpp_process_line_notes (pfile, false);
2126       result->src_loc = pfile->line_table->highest_line;
2127     }
2128   c = *buffer->cur++;
2129
       /* A forced location, when set, overrides the position computed
          from the buffer column.  */
2130   if (pfile->forced_token_location_p)
2131     result->src_loc = *pfile->forced_token_location_p;
2132   else
2133     result->src_loc = linemap_position_for_column (pfile->line_table,
2134                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2135
2136   switch (c)
2137     {
2138     case ' ': case '\t': case '\f': case '\v': case '\0':
2139       result->flags |= PREV_WHITE;
2140       skip_whitespace (pfile, c);
2141       goto skipped_white;
2142
2143     case '\n':
2144       if (buffer->cur < buffer->rlimit)
2145         CPP_INCREMENT_LINE (pfile, 0);
2146       buffer->need_line = true;
2147       goto fresh_line;
2148
2149     case '0': case '1': case '2': case '3': case '4':
2150     case '5': case '6': case '7': case '8': case '9':
2151       {
2152         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2153         result->type = CPP_NUMBER;
2154         lex_number (pfile, &result->val.str, &nst);
2155         warn_about_normalization (pfile, result, &nst);
2156         break;
2157       }
2158
2159     case 'L':
2160     case 'u':
2161     case 'U':
2162     case 'R':
2163       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2164          wide strings or raw strings.  */
2165       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2166           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2167         {
2168           if ((*buffer->cur == '\'' && c != 'R')
2169               || *buffer->cur == '"'
2170               || (*buffer->cur == 'R'
2171                   && c != 'R'
2172                   && buffer->cur[1] == '"'
2173                   && CPP_OPTION (pfile, rliterals))
2174               || (*buffer->cur == '8'
2175                   && c == 'u'
2176                   && (buffer->cur[1] == '"'
2177                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2178                           && CPP_OPTION (pfile, rliterals)))))
2179             {
2180               lex_string (pfile, result, buffer->cur - 1);
2181               break;
2182             }
2183         }
2184       /* Fall through.  */
2185
2186     case '_':
2187     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2188     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2189     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2190     case 's': case 't':           case 'v': case 'w': case 'x':
2191     case 'y': case 'z':
2192     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2193     case 'G': case 'H': case 'I': case 'J': case 'K':
2194     case 'M': case 'N': case 'O': case 'P': case 'Q':
2195     case 'S': case 'T':           case 'V': case 'W': case 'X':
2196     case 'Y': case 'Z':
2197       result->type = CPP_NAME;
2198       {
2199         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2200         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2201                                                 &nst);
2202         warn_about_normalization (pfile, result, &nst);
2203       }
2204
2205       /* Convert named operators to their proper types.  */
2206       if (result->val.node.node->flags & NODE_OPERATOR)
2207         {
2208           result->flags |= NAMED_OP;
2209           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2210         }
2211       break;
2212
2213     case '\'':
2214     case '"':
2215       lex_string (pfile, result, buffer->cur - 1);
2216       break;
2217
2218     case '/':
2219       /* A potential block or line comment.  */
2220       comment_start = buffer->cur;
2221       c = *buffer->cur;
2222
2223       if (c == '*')
2224         {
2225           if (_cpp_skip_block_comment (pfile))
2226             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2227         }
2228       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2229                             || cpp_in_system_header (pfile)))
2230         {
2231           /* Warn about comments only if pedantically GNUC89, and not
2232              in system headers.  */
2233           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2234               && ! buffer->warned_cplusplus_comments)
2235             {
2236               cpp_error (pfile, CPP_DL_PEDWARN,
2237                          "C++ style comments are not allowed in ISO C90");
2238               cpp_error (pfile, CPP_DL_PEDWARN,
2239                          "(this will be reported only once per input file)");
2240               buffer->warned_cplusplus_comments = 1;
2241             }
2242
2243           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2244             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2245         }
2246       else if (c == '=')
2247         {
2248           buffer->cur++;
2249           result->type = CPP_DIV_EQ;
2250           break;
2251         }
2252       else
2253         {
2254           result->type = CPP_DIV;
2255           break;
2256         }
2257
           /* Only a skipped comment reaches this point.  When comments
              are not being saved it counts as white space and the next
              token is lexed into the same RESULT.  */
2258       if (!pfile->state.save_comments)
2259         {
2260           result->flags |= PREV_WHITE;
2261           goto update_tokens_line;
2262         }
2263
2264       /* Save the comment as a token in its own right.  */
2265       save_comment (pfile, result, comment_start, c);
2266       break;
2267
2268     case '<':
           /* With angled_headers set, try to lex a header name first;
              lex_string leaves the type as CPP_LESS when it did not
              produce one.  */
2269       if (pfile->state.angled_headers)
2270         {
2271           lex_string (pfile, result, buffer->cur - 1);
2272           if (result->type != CPP_LESS)
2273             break;
2274         }
2275
2276       result->type = CPP_LESS;
2277       if (*buffer->cur == '=')
2278         buffer->cur++, result->type = CPP_LESS_EQ;
2279       else if (*buffer->cur == '<')
2280         {
2281           buffer->cur++;
2282           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2283         }
2284       else if (CPP_OPTION (pfile, digraphs))
2285         {
2286           if (*buffer->cur == ':')
2287             {
2288               buffer->cur++;
2289               result->flags |= DIGRAPH;
2290               result->type = CPP_OPEN_SQUARE;
2291             }
2292           else if (*buffer->cur == '%')
2293             {
2294               buffer->cur++;
2295               result->flags |= DIGRAPH;
2296               result->type = CPP_OPEN_BRACE;
2297             }
2298         }
2299       break;
2300
2301     case '>':
2302       result->type = CPP_GREATER;
2303       if (*buffer->cur == '=')
2304         buffer->cur++, result->type = CPP_GREATER_EQ;
2305       else if (*buffer->cur == '>')
2306         {
2307           buffer->cur++;
2308           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2309         }
2310       break;
2311
2312     case '%':
2313       result->type = CPP_MOD;
2314       if (*buffer->cur == '=')
2315         buffer->cur++, result->type = CPP_MOD_EQ;
2316       else if (CPP_OPTION (pfile, digraphs))
2317         {
2318           if (*buffer->cur == ':')
2319             {
                   /* "%:" is the digraph for '#', "%:%:" the one for
                      "##".  */
2320               buffer->cur++;
2321               result->flags |= DIGRAPH;
2322               result->type = CPP_HASH;
2323               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2324                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2325             }
2326           else if (*buffer->cur == '>')
2327             {
2328               buffer->cur++;
2329               result->flags |= DIGRAPH;
2330               result->type = CPP_CLOSE_BRACE;
2331             }
2332         }
2333       break;
2334
2335     case '.':
2336       result->type = CPP_DOT;
2337       if (ISDIGIT (*buffer->cur))
2338         {
2339           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2340           result->type = CPP_NUMBER;
2341           lex_number (pfile, &result->val.str, &nst);
2342           warn_about_normalization (pfile, result, &nst);
2343         }
2344       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2345         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2346       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2347         buffer->cur++, result->type = CPP_DOT_STAR;
2348       break;
2349
2350     case '+':
2351       result->type = CPP_PLUS;
2352       if (*buffer->cur == '+')
2353         buffer->cur++, result->type = CPP_PLUS_PLUS;
2354       else if (*buffer->cur == '=')
2355         buffer->cur++, result->type = CPP_PLUS_EQ;
2356       break;
2357
2358     case '-':
2359       result->type = CPP_MINUS;
2360       if (*buffer->cur == '>')
2361         {
2362           buffer->cur++;
2363           result->type = CPP_DEREF;
2364           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2365             buffer->cur++, result->type = CPP_DEREF_STAR;
2366         }
2367       else if (*buffer->cur == '-')
2368         buffer->cur++, result->type = CPP_MINUS_MINUS;
2369       else if (*buffer->cur == '=')
2370         buffer->cur++, result->type = CPP_MINUS_EQ;
2371       break;
2372
2373     case '&':
2374       result->type = CPP_AND;
2375       if (*buffer->cur == '&')
2376         buffer->cur++, result->type = CPP_AND_AND;
2377       else if (*buffer->cur == '=')
2378         buffer->cur++, result->type = CPP_AND_EQ;
2379       break;
2380
2381     case '|':
2382       result->type = CPP_OR;
2383       if (*buffer->cur == '|')
2384         buffer->cur++, result->type = CPP_OR_OR;
2385       else if (*buffer->cur == '=')
2386         buffer->cur++, result->type = CPP_OR_EQ;
2387       break;
2388
2389     case ':':
2390       result->type = CPP_COLON;
2391       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2392         buffer->cur++, result->type = CPP_SCOPE;
2393       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2394         {
2395           buffer->cur++;
2396           result->flags |= DIGRAPH;
2397           result->type = CPP_CLOSE_SQUARE;
2398         }
2399       break;
2400
2401     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2402     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2403     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2404     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2405     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2406
2407     case '?': result->type = CPP_QUERY; break;
2408     case '~': result->type = CPP_COMPL; break;
2409     case ',': result->type = CPP_COMMA; break;
2410     case '(': result->type = CPP_OPEN_PAREN; break;
2411     case ')': result->type = CPP_CLOSE_PAREN; break;
2412     case '[': result->type = CPP_OPEN_SQUARE; break;
2413     case ']': result->type = CPP_CLOSE_SQUARE; break;
2414     case '{': result->type = CPP_OPEN_BRACE; break;
2415     case '}': result->type = CPP_CLOSE_BRACE; break;
2416     case ';': result->type = CPP_SEMICOLON; break;
2417
2418       /* @ is a punctuator in Objective-C.  */
2419     case '@': result->type = CPP_ATSIGN; break;
2420
2421     case '$':
2422     case '\\':
2423       {
             /* Step back onto the character so forms_identifier_p can
                examine it.  */
2424         const uchar *base = --buffer->cur;
2425         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2426
2427         if (forms_identifier_p (pfile, true, &nst))
2428           {
2429             result->type = CPP_NAME;
2430             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2431             warn_about_normalization (pfile, result, &nst);
2432             break;
2433           }
2434         buffer->cur++;
2435       }
           /* Fall through: a '$' or '\\' that does not start an identifier
              becomes a one-character CPP_OTHER token.  */
2436
2437     default:
2438       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2439       break;
2440     }
2441
2442   return result;
2443 }
2444
2445 /* An upper bound on the number of bytes needed to spell TOKEN.
2446    Does not include preceding whitespace.  */
2447 unsigned int
2448 cpp_token_len (const cpp_token *token)
2449 {
2450   unsigned int len;
2451
2452   switch (TOKEN_SPELL (token))
2453     {
2454     default:            len = 6;                                break;
2455     case SPELL_LITERAL: len = token->val.str.len;               break;
2456     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2457     }
2458
2459   return len;
2460 }
2461
/* Decode the UTF-8 sequence starting at NAME and write it to BUFFER
   as a universal character name of the form \UXXXXXXXX (lower-case
   hex digits).  Exactly 10 bytes are written and no terminating NUL
   is added.  Returns the number of bytes of NAME that made up the
   sequence.  Aborts if a continuation byte is malformed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *p = name;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int k;

  /* Each leading one bit of the first byte accounts for one byte of
     the sequence.  */
  lead = *p;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The first byte contributes the bits below its length prefix;
     every continuation byte contributes six more.  */
  code_point = *p & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      p++;
      code_point = (code_point << 6) | (*p & 0x3F);

      /* Ill-formed UTF-8: a continuation byte must look like
         10xxxxxx.  */
      if ((*p & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Eight hex digits, most significant nibble first.  */
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
2495
2496 /* Given a token TYPE corresponding to a digraph, return a pointer to
2497    the spelling of the digraph.  */
2498 static const unsigned char *
2499 cpp_digraph2name (enum cpp_ttype type)
2500 {
2501   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2502 }
2503
2504 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2505    already contain the enough space to hold the token's spelling.
2506    Returns a pointer to the character after the last character written.
2507    FORSTRING is true if this is to be the spelling after translation
2508    phase 1 (this is different for UCNs).
2509    FIXME: Would be nice if we didn't need the PFILE argument.  */
2510 unsigned char *
2511 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2512                  unsigned char *buffer, bool forstring)
2513 {
2514   switch (TOKEN_SPELL (token))
2515     {
2516     case SPELL_OPERATOR:
2517       {
2518         const unsigned char *spelling;
2519         unsigned char c;
2520
2521         if (token->flags & DIGRAPH)
2522           spelling = cpp_digraph2name (token->type);
2523         else if (token->flags & NAMED_OP)
2524           goto spell_ident;
2525         else
2526           spelling = TOKEN_NAME (token);
2527
2528         while ((c = *spelling++) != '\0')
2529           *buffer++ = c;
2530       }
2531       break;
2532
2533     spell_ident:
2534     case SPELL_IDENT:
2535       if (forstring)
2536         {
2537           memcpy (buffer, NODE_NAME (token->val.node.node),
2538                   NODE_LEN (token->val.node.node));
2539           buffer += NODE_LEN (token->val.node.node);
2540         }
2541       else
2542         {
2543           size_t i;
2544           const unsigned char * name = NODE_NAME (token->val.node.node);
2545
2546           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2547             if (name[i] & ~0x7F)
2548               {
2549                 i += utf8_to_ucn (buffer, name + i) - 1;
2550                 buffer += 10;
2551               }
2552             else
2553               *buffer++ = NODE_NAME (token->val.node.node)[i];
2554         }
2555       break;
2556
2557     case SPELL_LITERAL:
2558       memcpy (buffer, token->val.str.text, token->val.str.len);
2559       buffer += token->val.str.len;
2560       break;
2561
2562     case SPELL_NONE:
2563       cpp_error (pfile, CPP_DL_ICE,
2564                  "unspellable token %s", TOKEN_NAME (token));
2565       break;
2566     }
2567
2568   return buffer;
2569 }
2570
2571 /* Returns TOKEN spelt as a null-terminated string.  The string is
2572    freed when the reader is destroyed.  Useful for diagnostics.  */
2573 unsigned char *
2574 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2575 {
2576   unsigned int len = cpp_token_len (token) + 1;
2577   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2578
2579   end = cpp_spell_token (pfile, token, start, false);
2580   end[0] = '\0';
2581
2582   return start;
2583 }
2584
2585 /* Returns a pointer to a string which spells the token defined by
2586    TYPE and FLAGS.  Used by C front ends, which really should move to
2587    using cpp_token_as_text.  */
2588 const char *
2589 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2590 {
2591   if (flags & DIGRAPH)
2592     return (const char *) cpp_digraph2name (type);
2593   else if (flags & NAMED_OP)
2594     return cpp_named_operator2name (type);
2595
2596   return (const char *) token_spellings[type].name;
2597 }
2598
2599 /* Writes the spelling of token to FP, without any preceding space.
2600    Separated from cpp_spell_token for efficiency - to avoid stdio
2601    double-buffering.  */
2602 void
2603 cpp_output_token (const cpp_token *token, FILE *fp)
2604 {
2605   switch (TOKEN_SPELL (token))
2606     {
2607     case SPELL_OPERATOR:
2608       {
2609         const unsigned char *spelling;
2610         int c;
2611
2612         if (token->flags & DIGRAPH)
2613           spelling = cpp_digraph2name (token->type);
2614         else if (token->flags & NAMED_OP)
2615           goto spell_ident;
2616         else
2617           spelling = TOKEN_NAME (token);
2618
2619         c = *spelling;
2620         do
2621           putc (c, fp);
2622         while ((c = *++spelling) != '\0');
2623       }
2624       break;
2625
2626     spell_ident:
2627     case SPELL_IDENT:
2628       {
2629         size_t i;
2630         const unsigned char * name = NODE_NAME (token->val.node.node);
2631
2632         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2633           if (name[i] & ~0x7F)
2634             {
2635               unsigned char buffer[10];
2636               i += utf8_to_ucn (buffer, name + i) - 1;
2637               fwrite (buffer, 1, 10, fp);
2638             }
2639           else
2640             fputc (NODE_NAME (token->val.node.node)[i], fp);
2641       }
2642       break;
2643
2644     case SPELL_LITERAL:
2645       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2646       break;
2647
2648     case SPELL_NONE:
2649       /* An error, most probably.  */
2650       break;
2651     }
2652 }
2653
2654 /* Compare two tokens.  */
2655 int
2656 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2657 {
2658   if (a->type == b->type && a->flags == b->flags)
2659     switch (TOKEN_SPELL (a))
2660       {
2661       default:                  /* Keep compiler happy.  */
2662       case SPELL_OPERATOR:
2663         /* token_no is used to track where multiple consecutive ##
2664            tokens were originally located.  */
2665         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2666       case SPELL_NONE:
2667         return (a->type != CPP_MACRO_ARG
2668                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2669       case SPELL_IDENT:
2670         return a->val.node.node == b->val.node.node;
2671       case SPELL_LITERAL:
2672         return (a->val.str.len == b->val.str.len
2673                 && !memcmp (a->val.str.text, b->val.str.text,
2674                             a->val.str.len));
2675       }
2676
2677   return 0;
2678 }
2679
2680 /* Returns nonzero if a space should be inserted to avoid an
2681    accidental token paste for output.  For simplicity, it is
2682    conservative, and occasionally advises a space where one is not
2683    needed, e.g. "." and ".2".  */
2684 int
2685 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2686                  const cpp_token *token2)
2687 {
2688   enum cpp_ttype a = token1->type, b = token2->type;
2689   cppchar_t c;
2690
2691   if (token1->flags & NAMED_OP)
2692     a = CPP_NAME;
2693   if (token2->flags & NAMED_OP)
2694     b = CPP_NAME;
2695
2696   c = EOF;
2697   if (token2->flags & DIGRAPH)
2698     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2699   else if (token_spellings[b].category == SPELL_OPERATOR)
2700     c = token_spellings[b].name[0];
2701
2702   /* Quickly get everything that can paste with an '='.  */
2703   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2704     return 1;
2705
2706   switch (a)
2707     {
2708     case CPP_GREATER:   return c == '>';
2709     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2710     case CPP_PLUS:      return c == '+';
2711     case CPP_MINUS:     return c == '-' || c == '>';
2712     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2713     case CPP_MOD:       return c == ':' || c == '>';
2714     case CPP_AND:       return c == '&';
2715     case CPP_OR:        return c == '|';
2716     case CPP_COLON:     return c == ':' || c == '>';
2717     case CPP_DEREF:     return c == '*';
2718     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2719     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2720     case CPP_NAME:      return ((b == CPP_NUMBER
2721                                  && name_p (pfile, &token2->val.str))
2722                                 || b == CPP_NAME
2723                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2724     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2725                                 || c == '.' || c == '+' || c == '-');
2726                                       /* UCNs */
2727     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2728                                  && b == CPP_NAME)
2729                                 || (CPP_OPTION (pfile, objc)
2730                                     && token1->val.str.text[0] == '@'
2731                                     && (b == CPP_NAME || b == CPP_STRING)));
2732     default:            break;
2733     }
2734
2735   return 0;
2736 }
2737
2738 /* Output all the remaining tokens on the current line, and a newline
2739    character, to FP.  Leading whitespace is removed.  If there are
2740    macros, special token padding is not performed.  */
2741 void
2742 cpp_output_line (cpp_reader *pfile, FILE *fp)
2743 {
2744   const cpp_token *token;
2745
2746   token = cpp_get_token (pfile);
2747   while (token->type != CPP_EOF)
2748     {
2749       cpp_output_token (token, fp);
2750       token = cpp_get_token (pfile);
2751       if (token->flags & PREV_WHITE)
2752         putc (' ', fp);
2753     }
2754
2755   putc ('\n', fp);
2756 }
2757
2758 /* Return a string representation of all the remaining tokens on the
2759    current line.  The result is allocated using xmalloc and must be
2760    freed by the caller.  */
2761 unsigned char *
2762 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2763 {
2764   const cpp_token *token;
2765   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2766   unsigned int alloced = 120 + out;
2767   unsigned char *result = (unsigned char *) xmalloc (alloced);
2768
2769   /* If DIR_NAME is empty, there are no initial contents.  */
2770   if (dir_name)
2771     {
2772       sprintf ((char *) result, "#%s ", dir_name);
2773       out += 2;
2774     }
2775
2776   token = cpp_get_token (pfile);
2777   while (token->type != CPP_EOF)
2778     {
2779       unsigned char *last;
2780       /* Include room for a possible space and the terminating nul.  */
2781       unsigned int len = cpp_token_len (token) + 2;
2782
2783       if (out + len > alloced)
2784         {
2785           alloced *= 2;
2786           if (out + len > alloced)
2787             alloced = out + len;
2788           result = (unsigned char *) xrealloc (result, alloced);
2789         }
2790
2791       last = cpp_spell_token (pfile, token, &result[out], 0);
2792       out = last - result;
2793
2794       token = cpp_get_token (pfile);
2795       if (token->flags & PREV_WHITE)
2796         result[out++] = ' ';
2797     }
2798
2799   result[out] = '\0';
2800   return result;
2801 }
2802
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be reused for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is MIN_EXTRA plus twice the
   distance from BUFF->cur to BUFF->limit.  Every macro argument is
   parenthesized so that an argument which is itself an expression
   expands with the intended precedence.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2817
2818 /* Create a new allocation buffer.  Place the control block at the end
2819    of the buffer, so that buffer overflows will cause immediate chaos.  */
2820 static _cpp_buff *
2821 new_buff (size_t len)
2822 {
2823   _cpp_buff *result;
2824   unsigned char *base;
2825
2826   if (len < MIN_BUFF_SIZE)
2827     len = MIN_BUFF_SIZE;
2828   len = CPP_ALIGN (len);
2829
2830   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2831   result = (_cpp_buff *) (base + len);
2832   result->base = base;
2833   result->cur = base;
2834   result->limit = base + len;
2835   result->next = NULL;
2836   return result;
2837 }
2838
2839 /* Place a chain of unwanted allocation buffers on the free list.  */
2840 void
2841 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2842 {
2843   _cpp_buff *end = buff;
2844
2845   while (end->next)
2846     end = end->next;
2847   end->next = pfile->free_buffs;
2848   pfile->free_buffs = buff;
2849 }
2850
2851 /* Return a free buffer of size at least MIN_SIZE.  */
2852 _cpp_buff *
2853 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2854 {
2855   _cpp_buff *result, **p;
2856
2857   for (p = &pfile->free_buffs;; p = &(*p)->next)
2858     {
2859       size_t size;
2860
2861       if (*p == NULL)
2862         return new_buff (min_size);
2863       result = *p;
2864       size = result->limit - result->base;
2865       /* Return a buffer that's big enough, but don't waste one that's
2866          way too big.  */
2867       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2868         break;
2869     }
2870
2871   *p = result->next;
2872   result->next = NULL;
2873   result->cur = result->base;
2874   return result;
2875 }
2876
2877 /* Creates a new buffer with enough space to hold the uncommitted
2878    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2879    the excess bytes to the new buffer.  Chains the new buffer after
2880    BUFF, and returns the new buffer.  */
2881 _cpp_buff *
2882 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2883 {
2884   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2885   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2886
2887   buff->next = new_buff;
2888   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2889   return new_buff;
2890 }
2891
2892 /* Creates a new buffer with enough space to hold the uncommitted
2893    remaining bytes of the buffer pointed to by BUFF, and at least
2894    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2895    Chains the new buffer before the buffer pointed to by BUFF, and
2896    updates the pointer to point to the new buffer.  */
2897 void
2898 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2899 {
2900   _cpp_buff *new_buff, *old_buff = *pbuff;
2901   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2902
2903   new_buff = _cpp_get_buff (pfile, size);
2904   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2905   new_buff->next = old_buff;
2906   *pbuff = new_buff;
2907 }
2908
2909 /* Free a chain of buffers starting at BUFF.  */
2910 void
2911 _cpp_free_buff (_cpp_buff *buff)
2912 {
2913   _cpp_buff *next;
2914
2915   for (; buff; buff = next)
2916     {
2917       next = buff->next;
2918       free (buff->base);
2919     }
2920 }
2921
2922 /* Allocate permanent, unaligned storage of length LEN.  */
2923 unsigned char *
2924 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2925 {
2926   _cpp_buff *buff = pfile->u_buff;
2927   unsigned char *result = buff->cur;
2928
2929   if (len > (size_t) (buff->limit - result))
2930     {
2931       buff = _cpp_get_buff (pfile, len);
2932       buff->next = pfile->u_buff;
2933       pfile->u_buff = buff;
2934       result = buff->cur;
2935     }
2936
2937   buff->cur = result + len;
2938   return result;
2939 }
2940
2941 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2942    That buffer is used for growing allocations when saving macro
2943    replacement lists in a #define, and when parsing an answer to an
2944    assertion in #assert, #unassert or #if (and therefore possibly
2945    whilst expanding macros).  It therefore must not be used by any
2946    code that they might call: specifically the lexer and the guts of
2947    the macro expander.
2948
2949    All existing other uses clearly fit this restriction: storing
2950    registered pragmas during initialization.  */
2951 unsigned char *
2952 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2953 {
2954   _cpp_buff *buff = pfile->a_buff;
2955   unsigned char *result = buff->cur;
2956
2957   if (len > (size_t) (buff->limit - result))
2958     {
2959       buff = _cpp_get_buff (pfile, len);
2960       buff->next = pfile->a_buff;
2961       pfile->a_buff = buff;
2962       result = buff->cur;
2963     }
2964
2965   buff->cur = result + len;
2966   return result;
2967 }
2968
2969 /* Say which field of TOK is in use.  */
2970
2971 enum cpp_token_fld_kind
2972 cpp_token_val_index (cpp_token *tok)
2973 {
2974   switch (TOKEN_SPELL (tok))
2975     {
2976     case SPELL_IDENT:
2977       return CPP_TOKEN_FLD_NODE;
2978     case SPELL_LITERAL:
2979       return CPP_TOKEN_FLD_STR;
2980     case SPELL_OPERATOR:
2981       if (tok->type == CPP_PASTE)
2982         return CPP_TOKEN_FLD_TOKEN_NO;
2983       else
2984         return CPP_TOKEN_FLD_NONE;
2985     case SPELL_NONE:
2986       if (tok->type == CPP_MACRO_ARG)
2987         return CPP_TOKEN_FLD_ARG_NO;
2988       else if (tok->type == CPP_PADDING)
2989         return CPP_TOKEN_FLD_SOURCE;
2990       else if (tok->type == CPP_PRAGMA)
2991         return CPP_TOKEN_FLD_PRAGMA;
2992       /* else fall through */
2993     default:
2994       return CPP_TOKEN_FLD_NONE;
2995     }
2996 }
2997
2998 /* All tokens lexed in R after calling this function will be forced to have
2999    their source_location the same as the location referenced by P, until
3000    cpp_stop_forcing_token_locations is called for R.  */
3001
3002 void
3003 cpp_force_token_locations (cpp_reader *r, source_location *p)
3004 {
3005   r->forced_token_location_p = p;
3006 }
3007
3008 /* Go back to assigning locations naturally for lexed tokens.  */
3009
3010 void
3011 cpp_stop_forcing_token_locations (cpp_reader *r)
3012 {
3013   r->forced_token_location_p = NULL;
3014 }