X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=libcpp%2Flex.c;h=07d5a4ff4668853a230c00f53dd48b0691276b09;hb=6a6926635c36e0ef2598b5399afdbfc2dbd4bf1f;hp=6fca257710570780db810ceb56dd2e41e7048c9a;hpb=35c4515b8b8e306684a3837d40ffa2c9fcdd9899;p=gcc.git diff --git a/libcpp/lex.c b/libcpp/lex.c index 6fca2577105..07d5a4ff466 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -1,5 +1,5 @@ /* CPP Library - lexical analysis. - Copyright (C) 2000-2017 Free Software Foundation, Inc. + Copyright (C) 2000-2020 Free Software Foundation, Inc. Contributed by Per Bothner, 1994-95. Based on CCCP program by Paul Rubin, June 1986 Adapted to ANSI C, Richard Stallman, Jan 1987 @@ -531,11 +531,11 @@ init_vectorized_lexer (void) search_line_fast = impl; } -#elif defined(_ARCH_PWR8) && defined(__ALTIVEC__) +#elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__) /* A vection of the fast scanner using AltiVec vectorized byte compares and VSX unaligned loads (when VSX is available). This is otherwise - the same as the pre-GCC 5 version. */ + the same as the AltiVec version. */ ATTRIBUTE_NO_SANITIZE_UNDEFINED static const uchar * @@ -568,7 +568,7 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) { vc m_nl, m_cr, m_bs, m_qm; - data = *((const vc *)s); + data = __builtin_vec_vsx_ld (0, s); s += 16; m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl); @@ -1062,7 +1062,7 @@ _cpp_clean_line (cpp_reader *pfile) d = (uchar *) s; /* Handle DOS line endings. 
*/ - if (*s == '\r' && s != buffer->rlimit && s[1] == '\n') + if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n') s++; } @@ -1232,7 +1232,7 @@ static int skip_line_comment (cpp_reader *pfile) { cpp_buffer *buffer = pfile->buffer; - source_location orig_line = pfile->line_table->highest_line; + location_t orig_line = pfile->line_table->highest_line; while (*buffer->cur != '\n') buffer->cur++; @@ -1313,7 +1313,9 @@ warn_about_normalization (cpp_reader *pfile, } } -/* Returns TRUE if the sequence starting at buffer->cur is invalid in +static const cppchar_t utf8_signifier = 0xC0; + +/* Returns TRUE if the sequence starting at buffer->cur is valid in an identifier. FIRST is TRUE if this starts an identifier. */ static bool forms_identifier_p (cpp_reader *pfile, int first, @@ -1336,17 +1338,25 @@ forms_identifier_p (cpp_reader *pfile, int first, return true; } - /* Is this a syntactically valid UCN? */ - if (CPP_OPTION (pfile, extended_identifiers) - && *buffer->cur == '\\' - && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) + /* Is this a syntactically valid UCN or a valid UTF-8 char? */ + if (CPP_OPTION (pfile, extended_identifiers)) { cppchar_t s; - buffer->cur += 2; - if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, - state, &s, NULL, NULL)) - return true; - buffer->cur -= 2; + if (*buffer->cur >= utf8_signifier) + { + if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first, + state, &s)) + return true; + } + else if (*buffer->cur == '\\' + && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) + { + buffer->cur += 2; + if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, + state, &s, NULL, NULL)) + return true; + buffer->cur -= 2; + } } return false; @@ -1360,9 +1370,9 @@ maybe_va_opt_error (cpp_reader *pfile) { /* __VA_OPT__ should not be accepted at all, but allow it in system headers. 
*/ - if (!cpp_in_system_header (pfile)) + if (!_cpp_in_system_header (pfile)) cpp_error (pfile, CPP_DL_PEDWARN, - "__VA_OPT__ is not available until C++2a"); + "__VA_OPT__ is not available until C++20"); } else if (!pfile->state.va_args_ok) { @@ -1370,7 +1380,7 @@ maybe_va_opt_error (cpp_reader *pfile) variadic macro. */ cpp_error (pfile, CPP_DL_PEDWARN, "__VA_OPT__ can only appear in the expansion" - " of a C++2a variadic macro"); + " of a C++20 variadic macro"); } } @@ -1464,7 +1474,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn, pfile->buffer->cur = cur; if (starts_ucn || forms_identifier_p (pfile, false, nst)) { - /* Slower version for identifiers containing UCNs (or $). */ + /* Slower version for identifiers containing UCNs + or extended chars (including $). */ do { while (ISIDNUM (*pfile->buffer->cur)) { @@ -1566,44 +1577,90 @@ static void create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base, unsigned int len, enum cpp_ttype type) { - uchar *dest = _cpp_unaligned_alloc (pfile, len + 1); - - memcpy (dest, base, len); - dest[len] = '\0'; token->type = type; token->val.str.len = len; - token->val.str.text = dest; + token->val.str.text = cpp_alloc_token_string (pfile, base, len); +} + +const uchar * +cpp_alloc_token_string (cpp_reader *pfile, + const unsigned char *ptr, unsigned len) +{ + uchar *dest = _cpp_unaligned_alloc (pfile, len + 1); + + dest[len] = 0; + memcpy (dest, ptr, len); + return dest; } +/* A pair of raw buffer pointers. The currently open one is [1], the + first one is [0]. Used for string literal lexing. 
*/ +struct lit_accum { + _cpp_buff *first; + _cpp_buff *last; + const uchar *rpos; + size_t accum; + + lit_accum () + : first (NULL), last (NULL), rpos (0), accum (0) + { + } + + void append (cpp_reader *, const uchar *, size_t); + + void read_begin (cpp_reader *); + bool reading_p () const + { + return rpos != NULL; + } + char read_char () + { + char c = *rpos++; + if (rpos == BUFF_FRONT (last)) + rpos = NULL; + return c; + } +}; + /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer sequence from *FIRST_BUFF_P to LAST_BUFF_P. */ -static void -bufring_append (cpp_reader *pfile, const uchar *base, size_t len, - _cpp_buff **first_buff_p, _cpp_buff **last_buff_p) +void +lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len) { - _cpp_buff *first_buff = *first_buff_p; - _cpp_buff *last_buff = *last_buff_p; - - if (first_buff == NULL) - first_buff = last_buff = _cpp_get_buff (pfile, len); - else if (len > BUFF_ROOM (last_buff)) + if (!last) + /* Starting. */ + first = last = _cpp_get_buff (pfile, len); + else if (len > BUFF_ROOM (last)) { - size_t room = BUFF_ROOM (last_buff); - memcpy (BUFF_FRONT (last_buff), base, room); - BUFF_FRONT (last_buff) += room; + /* There is insufficient room in the buffer. Copy what we can, + and then either extend or create a new one. */ + size_t room = BUFF_ROOM (last); + memcpy (BUFF_FRONT (last), base, room); + BUFF_FRONT (last) += room; base += room; len -= room; - last_buff = _cpp_append_extend_buff (pfile, last_buff, len); - } + accum += room; - memcpy (BUFF_FRONT (last_buff), base, len); - BUFF_FRONT (last_buff) += len; + gcc_checking_assert (!rpos); - *first_buff_p = first_buff; - *last_buff_p = last_buff; + last = _cpp_append_extend_buff (pfile, last, len); + } + + memcpy (BUFF_FRONT (last), base, len); + BUFF_FRONT (last) += len; + accum += len; } +void +lit_accum::read_begin (cpp_reader *pfile) +{ + /* We never accumulate more than 4 chars to read. 
*/ + if (BUFF_ROOM (last) < 4) + + last = _cpp_append_extend_buff (pfile, last, 4); + rpos = BUFF_FRONT (last); +} /* Returns true if a macro has been defined. This might not work if compile with -save-temps, @@ -1627,271 +1684,275 @@ is_macro(cpp_reader *pfile, const uchar *base) cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table, base, cur - base, hash, HT_NO_INSERT)); - return !result ? false : (result->type == NT_MACRO); + return result && cpp_macro_p (result); } +/* Returns true if a literal suffix does not have the expected form + and is defined as a macro. */ -/* Lexes a raw string. The stored string contains the spelling, including - double quotes, delimiter string, '(' and ')', any leading - 'L', 'u', 'U' or 'u8' and 'R' modifier. It returns the type of the - literal, or CPP_OTHER if it was not properly terminated. +static bool +is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base) +{ + /* User-defined literals outside of namespace std must start with a single + underscore, so assume anything of that form really is a UDL suffix. + We don't need to worry about UDLs defined inside namespace std because + their names are reserved, so cannot be used as macro names in valid + programs. */ + if (base[0] == '_' && base[1] != '_') + return false; + return is_macro (pfile, base); +} + +/* Lexes a raw string. The stored string contains the spelling, + including double quotes, delimiter string, '(' and ')', any leading + 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains + the type of the literal, or CPP_OTHER if it was not properly + terminated. + + BASE is the start of the token. Updates pfile->buffer->cur to just + after the lexed string. The spelling is NUL-terminated, but it is not guaranteed that this is the first NUL since embedded NULs are preserved. 
*/ static void -lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base, - const uchar *cur) -{ - uchar raw_prefix[17]; - uchar temp_buffer[18]; - const uchar *orig_base; - unsigned int raw_prefix_len = 0, raw_suffix_len = 0; - enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX }; - raw_str_phase phase = RAW_STR_PREFIX; - enum cpp_ttype type; - size_t total_len = 0; - /* Index into temp_buffer during phases other than RAW_STR, - during RAW_STR phase 17 to tell BUF_APPEND that nothing should - be appended to temp_buffer. */ - size_t temp_buffer_len = 0; - _cpp_buff *first_buff = NULL, *last_buff = NULL; - size_t raw_prefix_start; +lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base) +{ + const uchar *pos = base; + + /* 'tis a pity this information isn't passed down from the lexer's + initial categorization of the token. */ + enum cpp_ttype type = CPP_STRING; + + if (*pos == 'L') + { + type = CPP_WSTRING; + pos++; + } + else if (*pos == 'U') + { + type = CPP_STRING32; + pos++; + } + else if (*pos == 'u') + { + if (pos[1] == '8') + { + type = CPP_UTF8STRING; + pos++; + } + else + type = CPP_STRING16; + pos++; + } + + gcc_checking_assert (pos[0] == 'R' && pos[1] == '"'); + pos += 2; + _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note]; - type = (*base == 'L' ? CPP_WSTRING : - *base == 'U' ? CPP_STRING32 : - *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16) - : CPP_STRING); - -#define BUF_APPEND(STR,LEN) \ - do { \ - bufring_append (pfile, (const uchar *)(STR), (LEN), \ - &first_buff, &last_buff); \ - total_len += (LEN); \ - if (__builtin_expect (temp_buffer_len < 17, 0) \ - && (const uchar *)(STR) != base \ - && (LEN) <= 2) \ - { \ - memcpy (temp_buffer + temp_buffer_len, \ - (const uchar *)(STR), (LEN)); \ - temp_buffer_len += (LEN); \ - } \ - } while (0) - - orig_base = base; - ++cur; - raw_prefix_start = cur - base; + /* Skip notes before the ". 
*/ + while (note->pos < pos) + ++note; + + lit_accum accum; + + uchar prefix[17]; + unsigned prefix_len = 0; + enum Phase + { + PHASE_PREFIX = -2, + PHASE_NONE = -1, + PHASE_SUFFIX = 0 + } phase = PHASE_PREFIX; + for (;;) { - cppchar_t c; + gcc_checking_assert (note->pos >= pos); - /* If we previously performed any trigraph or line splicing - transformations, undo them in between the opening and closing - double quote. */ - while (note->pos < cur) - ++note; - for (; note->pos == cur; ++note) - { - switch (note->type) - { - case '\\': - case ' ': - /* Restore backslash followed by newline. */ - BUF_APPEND (base, cur - base); - base = cur; - BUF_APPEND ("\\", 1); - after_backslash: - if (note->type == ' ') - { - /* GNU backslash whitespace newline extension. FIXME - could be any sequence of non-vertical space. When we - can properly restore any such sequence, we should mark - this note as handled so _cpp_process_line_notes - doesn't warn. */ - BUF_APPEND (" ", 1); - } - - BUF_APPEND ("\n", 1); - break; + /* Undo any escaped newlines and trigraphs. */ + if (!accum.reading_p () && note->pos == pos) + switch (note->type) + { + case '\\': + case ' ': + /* Restore backslash followed by newline. */ + accum.append (pfile, base, pos - base); + base = pos; + accum.read_begin (pfile); + accum.append (pfile, UC"\\", 1); + + after_backslash: + if (note->type == ' ') + /* GNU backslash whitespace newline extension. FIXME + could be any sequence of non-vertical space. When we + can properly restore any such sequence, we should + mark this note as handled so _cpp_process_line_notes + doesn't warn. */ + accum.append (pfile, UC" ", 1); + + accum.append (pfile, UC"\n", 1); + note++; + break; - case 0: - /* Already handled. */ - break; + case '\n': + /* This can happen for ??/ when trigraphs are not + being interpretted. 
*/ + gcc_checking_assert (!CPP_OPTION (pfile, trigraphs)); + note->type = 0; + note++; + break; - default: - if (_cpp_trigraph_map[note->type]) - { - /* Don't warn about this trigraph in - _cpp_process_line_notes, since trigraphs show up as - trigraphs in raw strings. */ - uchar type = note->type; - note->type = 0; - - if (!CPP_OPTION (pfile, trigraphs)) - /* If we didn't convert the trigraph in the first - place, don't do anything now either. */ - break; + default: + gcc_checking_assert (_cpp_trigraph_map[note->type]); + + /* Don't warn about this trigraph in + _cpp_process_line_notes, since trigraphs show up as + trigraphs in raw strings. */ + uchar type = note->type; + note->type = 0; + + if (CPP_OPTION (pfile, trigraphs)) + { + accum.append (pfile, base, pos - base); + base = pos; + accum.read_begin (pfile); + accum.append (pfile, UC"??", 2); + accum.append (pfile, &type, 1); + + /* ??/ followed by newline gets two line notes, one for + the trigraph and one for the backslash/newline. */ + if (type == '/' && note[1].pos == pos) + { + note++; + gcc_assert (note->type == '\\' || note->type == ' '); + goto after_backslash; + } + /* Skip the replacement character. */ + base = ++pos; + } + + note++; + break; + } - BUF_APPEND (base, cur - base); - base = cur; - BUF_APPEND ("??", 2); + /* Now get a char to process. Either from an expanded note, or + from the line buffer. */ + bool read_note = accum.reading_p (); + char c = read_note ? accum.read_char () : *pos++; - /* ??/ followed by newline gets two line notes, one for - the trigraph and one for the backslash/newline. */ - if (type == '/' && note[1].pos == cur) - { - if (note[1].type != '\\' - && note[1].type != ' ') - abort (); - BUF_APPEND ("/", 1); - ++note; - goto after_backslash; - } - else - { - /* Skip the replacement character. */ - base = ++cur; - BUF_APPEND (&type, 1); - c = type; - goto check_c; - } - } + if (phase == PHASE_PREFIX) + { + if (c == '(') + { + /* Done. 
*/ + phase = PHASE_NONE; + prefix[prefix_len++] = '"'; + } + else if (prefix_len < 16 + /* Prefix chars are any of the basic character set, + [lex.charset] except for ' + ()\\\t\v\f\n'. Optimized for a contiguous + alphabet. */ + /* Unlike a switch, this collapses down to one or + two shift and bitmask operations on an ASCII + system, with an outlier or two. */ + && (('Z' - 'A' == 25 + ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) + : ISIDST (c)) + || (c >= '0' && c <= '9') + || c == '_' || c == '{' || c == '}' + || c == '[' || c == ']' || c == '#' + || c == '<' || c == '>' || c == '%' + || c == ':' || c == ';' || c == '.' || c == '?' + || c == '*' || c == '+' || c == '-' || c == '/' + || c == '^' || c == '&' || c == '|' || c == '~' + || c == '!' || c == '=' || c == ',' + || c == '"' || c == '\'')) + prefix[prefix_len++] = c; + else + { + /* Something is wrong. */ + int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note; + if (prefix_len == 16) + cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, + col, "raw string delimiter longer " + "than 16 characters"); + else if (c == '\n') + cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, + col, "invalid new-line in raw " + "string delimiter"); else - abort (); - break; + cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, + col, "invalid character '%c' in " + "raw string delimiter", c); + type = CPP_OTHER; + phase = PHASE_NONE; + /* Continue until we get a close quote, that's probably + the best failure mode. 
*/ + prefix_len = 0; } + if (c != '\n') + continue; } - c = *cur++; - if (__builtin_expect (temp_buffer_len < 17, 0)) - temp_buffer[temp_buffer_len++] = c; - check_c: - if (phase == RAW_STR_PREFIX) + if (phase != PHASE_NONE) { - while (raw_prefix_len < temp_buffer_len) + if (prefix[phase] != c) + phase = PHASE_NONE; + else if (unsigned (phase + 1) == prefix_len) + break; + else { - raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len]; - switch (raw_prefix[raw_prefix_len]) - { - case ' ': case '(': case ')': case '\\': case '\t': - case '\v': case '\f': case '\n': default: - break; - /* Basic source charset except the above chars. */ - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': - case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': - case 's': case 't': case 'u': case 'v': case 'w': case 'x': - case 'y': case 'z': - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': - case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': - case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': - case 'Y': case 'Z': - case '0': case '1': case '2': case '3': case '4': case '5': - case '6': case '7': case '8': case '9': - case '_': case '{': case '}': case '#': case '[': case ']': - case '<': case '>': case '%': case ':': case ';': case '.': - case '?': case '*': case '+': case '-': case '/': case '^': - case '&': case '|': case '~': case '!': case '=': case ',': - case '"': case '\'': - if (raw_prefix_len < 16) - { - raw_prefix_len++; - continue; - } - break; - } - - if (raw_prefix[raw_prefix_len] != '(') - { - int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1; - if (raw_prefix_len == 16) - cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, - col, "raw string delimiter longer " - "than 16 characters"); - else if (raw_prefix[raw_prefix_len] == '\n') - cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, - 
col, "invalid new-line in raw " - "string delimiter"); - else - cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, - col, "invalid character '%c' in " - "raw string delimiter", - (int) raw_prefix[raw_prefix_len]); - pfile->buffer->cur = orig_base + raw_prefix_start - 1; - create_literal (pfile, token, orig_base, - raw_prefix_start - 1, CPP_OTHER); - if (first_buff) - _cpp_release_buff (pfile, first_buff); - return; - } - raw_prefix[raw_prefix_len] = '"'; - phase = RAW_STR; - /* Nothing should be appended to temp_buffer during - RAW_STR phase. */ - temp_buffer_len = 17; - break; + phase = Phase (phase + 1); + continue; } - continue; } - else if (phase == RAW_STR_SUFFIX) - { - while (raw_suffix_len <= raw_prefix_len - && raw_suffix_len < temp_buffer_len - && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len]) - raw_suffix_len++; - if (raw_suffix_len > raw_prefix_len) - break; - if (raw_suffix_len == temp_buffer_len) - continue; - phase = RAW_STR; - /* Nothing should be appended to temp_buffer during - RAW_STR phase. */ - temp_buffer_len = 17; - } - if (c == ')') - { - phase = RAW_STR_SUFFIX; - raw_suffix_len = 0; - temp_buffer_len = 0; - } - else if (c == '\n') + + if (!prefix_len && c == '"') + /* Failure mode lexing. 
*/ + goto out; + else if (prefix_len && c == ')') + phase = PHASE_SUFFIX; + else if (!read_note && c == '\n') { + pos--; + pfile->buffer->cur = pos; if (pfile->state.in_directive || (pfile->state.parsing_args && pfile->buffer->next_line >= pfile->buffer->rlimit)) { - cur--; - type = CPP_OTHER; cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0, "unterminated raw string"); - break; + type = CPP_OTHER; + goto out; } - BUF_APPEND (base, cur - base); + accum.append (pfile, base, pos - base + 1); + _cpp_process_line_notes (pfile, false); - if (pfile->buffer->cur < pfile->buffer->rlimit) + if (pfile->buffer->next_line < pfile->buffer->rlimit) CPP_INCREMENT_LINE (pfile, 0); pfile->buffer->need_line = true; - pfile->buffer->cur = cur-1; - _cpp_process_line_notes (pfile, false); if (!_cpp_get_fresh_line (pfile)) { - source_location src_loc = token->src_loc; + /* We ran out of file and failed to get a line. */ + location_t src_loc = token->src_loc; token->type = CPP_EOF; /* Tell the compiler the line number of the EOF token. */ token->src_loc = pfile->line_table->highest_line; token->flags = BOL; - if (first_buff != NULL) - _cpp_release_buff (pfile, first_buff); + if (accum.first) + _cpp_release_buff (pfile, accum.first); cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0, "unterminated raw string"); + /* Now pop the buffer that _cpp_get_fresh_line did not. */ + _cpp_pop_buffer (pfile); return; } - cur = base = pfile->buffer->cur; + pos = base = pfile->buffer->cur; note = &pfile->buffer->notes[pfile->buffer->cur_note]; } } @@ -1900,10 +1961,8 @@ lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base, { /* If a string format macro, say from inttypes.h, is placed touching a string literal it could be parsed as a C++11 user-defined string - literal thus breaking the program. - Try to identify macros with is_macro. A warning is issued. - The macro name should not start with '_' for this warning. 
*/ - if ((*cur != '_') && is_macro (pfile, cur)) + literal thus breaking the program. */ + if (is_macro_not_literal_suffix (pfile, pos)) { /* Raise a warning, but do not consume subsequent tokens. */ if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping) @@ -1913,37 +1972,37 @@ lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base, "a space between literal and string macro"); } /* Grab user defined literal suffix. */ - else if (ISIDST (*cur)) + else if (ISIDST (*pos)) { type = cpp_userdef_string_add_type (type); - ++cur; + ++pos; - while (ISIDNUM (*cur)) - ++cur; + while (ISIDNUM (*pos)) + ++pos; } } - pfile->buffer->cur = cur; - if (first_buff == NULL) - create_literal (pfile, token, base, cur - base, type); + out: + pfile->buffer->cur = pos; + if (!accum.accum) + create_literal (pfile, token, base, pos - base, type); else { - uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1); + size_t extra_len = pos - base; + uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1); token->type = type; - token->val.str.len = total_len + (cur - base); + token->val.str.len = accum.accum + extra_len; token->val.str.text = dest; - last_buff = first_buff; - while (last_buff != NULL) + for (_cpp_buff *buf = accum.first; buf; buf = buf->next) { - memcpy (dest, last_buff->base, - BUFF_FRONT (last_buff) - last_buff->base); - dest += BUFF_FRONT (last_buff) - last_buff->base; - last_buff = last_buff->next; + size_t len = BUFF_FRONT (buf) - buf->base; + memcpy (dest, buf->base, len); + dest += len; } - _cpp_release_buff (pfile, first_buff); - memcpy (dest, base, cur - base); - dest[cur - base] = '\0'; + _cpp_release_buff (pfile, accum.first); + memcpy (dest, base, extra_len); + dest[extra_len] = '\0'; } } @@ -1976,7 +2035,7 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base) } if (terminator == 'R') { - lex_raw_string (pfile, token, base, cur); + lex_raw_string (pfile, token, base); return; } if 
(terminator == '"') @@ -2031,10 +2090,8 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base) { /* If a string format macro, say from inttypes.h, is placed touching a string literal it could be parsed as a C++11 user-defined string - literal thus breaking the program. - Try to identify macros with is_macro. A warning is issued. - The macro name should not start with '_' for this warning. */ - if ((*cur != '_') && is_macro (pfile, cur)) + literal thus breaking the program. */ + if (is_macro_not_literal_suffix (pfile, cur)) { /* Raise a warning, but do not consume subsequent tokens. */ if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping) @@ -2504,6 +2561,15 @@ cpp_peek_token (cpp_reader *pfile, int index) index--; break; } + else if (peektok->type == CPP_PRAGMA) + { + /* Don't peek past a pragma. */ + if (peektok == &pfile->directive_result) + /* Save the pragma in the buffer. */ + *pfile->cur_token++ = *peektok; + index--; + break; + } } while (index--); @@ -2556,6 +2622,151 @@ _cpp_temp_token (cpp_reader *pfile) return result; } +/* We're at the beginning of a logical line (so not in + directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See + if we should enter deferred_pragma mode to tokenize the rest of the + line as a module control-line. */ + +static void +cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result) +{ + unsigned backup = 0; /* Tokens we peeked. */ + cpp_hashnode *node = result->val.node.node; + cpp_token *peek = result; + cpp_token *keyword = peek; + cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules; + int header_count = 0; + + /* Make sure the incoming state is as we expect it. This way we + can restore it using constants. 
*/ + gcc_checking_assert (!pfile->state.in_deferred_pragma + && !pfile->state.skipping + && !pfile->state.parsing_args + && !pfile->state.angled_headers + && (pfile->state.save_comments + == !CPP_OPTION (pfile, discard_comments))); + + /* Enter directives mode sufficiently for peeking. We don't have + to actually set in_directive. */ + pfile->state.in_deferred_pragma = true; + + /* These two fields are needed to process tokenization in deferred + pragma mode. They are not used outside deferred pragma mode or + directives mode. */ + pfile->state.pragma_allow_expansion = true; + pfile->directive_line = result->src_loc; + + /* Saving comments is incompatible with directives mode. */ + pfile->state.save_comments = 0; + + if (node == n_modules[spec_nodes::M_EXPORT][0]) + { + peek = _cpp_lex_direct (pfile); + keyword = peek; + backup++; + if (keyword->type != CPP_NAME) + goto not_module; + node = keyword->val.node.node; + if (!(node->flags & NODE_MODULE)) + goto not_module; + } + + if (node == n_modules[spec_nodes::M__IMPORT][0]) + /* __import */ + header_count = backup + 2 + 16; + else if (node == n_modules[spec_nodes::M_IMPORT][0]) + /* import */ + header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0); + else if (node == n_modules[spec_nodes::M_MODULE][0]) + ; /* module */ + else + goto not_module; + + /* We've seen [export] {module|import|__import}. Check the next token. */ + if (header_count) + /* After '{,__}import' a header name may appear. */ + pfile->state.angled_headers = true; + peek = _cpp_lex_direct (pfile); + backup++; + + /* ... import followed by identifier, ':', '<' or + header-name preprocessing tokens, or module + followed by cpp-identifier, ':' or ';' preprocessing + tokens. C++ keywords are not yet relevant. */ + if (peek->type == CPP_NAME + || peek->type == CPP_COLON + || (header_count + ? 
(peek->type == CPP_LESS + || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R') + || peek->type == CPP_HEADER_NAME) + : peek->type == CPP_SEMICOLON)) + { + pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed); + if (!pfile->state.pragma_allow_expansion) + pfile->state.prevent_expansion++; + + if (!header_count && linemap_included_from + (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table))) + cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0, + "module control-line cannot be in included file"); + + /* The first one or two tokens cannot be macro names. */ + for (int ix = backup; ix--;) + { + cpp_token *tok = ix ? keyword : result; + cpp_hashnode *node = tok->val.node.node; + + /* Don't attempt to expand the token. */ + tok->flags |= NO_EXPAND; + if (_cpp_defined_macro_p (node) + && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc) + && !cpp_fun_like_macro_p (node)) + cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0, + "module control-line \"%s\" cannot be" + " an object-like macro", + NODE_NAME (node)); + } + + /* Map to underbar variants. */ + keyword->val.node.node = n_modules[header_count + ? spec_nodes::M_IMPORT + : spec_nodes::M_MODULE][1]; + if (backup != 1) + result->val.node.node = n_modules[spec_nodes::M_EXPORT][1]; + + /* Maybe tell the tokenizer we expect a header-name down the + road. */ + pfile->state.directive_file_token = header_count; + } + else + { + not_module: + /* Drop out of directive mode. */ + /* We aaserted save_comments had this value upon entry. */ + pfile->state.save_comments + = !CPP_OPTION (pfile, discard_comments); + pfile->state.in_deferred_pragma = false; + /* Do not let this remain on. */ + pfile->state.angled_headers = false; + } + + /* In either case we want to backup the peeked tokens. */ + if (backup) + { + /* If we saw EOL, we should drop it, because this isn't a module + control-line after all. 
*/ + bool eol = peek->type == CPP_PRAGMA_EOL; + if (!eol || backup > 1) + { + /* Put put the peeked tokens back */ + _cpp_backup_tokens_direct (pfile, backup); + /* But if the last one was an EOL, forget it. */ + if (eol) + pfile->lookaheads--; + } + } +} + /* Lex a token into RESULT (external interface). Takes care of issues like directive handling, token lookahead, multiple include optimization and skipping. */ @@ -2604,6 +2815,21 @@ _cpp_lex_token (cpp_reader *pfile) } else if (pfile->state.in_deferred_pragma) result = &pfile->directive_result; + else if (result->type == CPP_NAME + && (result->val.node.node->flags & NODE_MODULE) + && !pfile->state.skipping + /* Unlike regular directives, we do not deal with + tokenizing module directives as macro arguments. + That's not permitted. */ + && !pfile->state.parsing_args) + { + /* P1857. Before macro expansion, At start of logical + line ... */ + /* We don't have to consider lookaheads at this point. */ + gcc_checking_assert (!pfile->lookaheads); + + cpp_maybe_module_directive (pfile, result); + } if (pfile->cb.line_change && !pfile->state.skipping) pfile->cb.line_change (pfile, result, pfile->state.parsing_args); @@ -2629,8 +2855,6 @@ _cpp_lex_token (cpp_reader *pfile) bool _cpp_get_fresh_line (cpp_reader *pfile) { - int return_at_eof; - /* We can't get a new line until we leave the current directive. */ if (pfile->state.in_directive) return false; @@ -2661,10 +2885,17 @@ _cpp_get_fresh_line (cpp_reader *pfile) buffer->next_line = buffer->rlimit; } - return_at_eof = buffer->return_at_eof; - _cpp_pop_buffer (pfile); - if (pfile->buffer == NULL || return_at_eof) - return false; + if (buffer->prev && !buffer->return_at_eof) + _cpp_pop_buffer (pfile); + else + { + /* End of translation. Do not pop the buffer yet. Increment + line number so that the EOF token is on a line of its own + (_cpp_lex_direct doesn't increment in that case, because + it's hard for it to distinguish this special case). 
*/ + CPP_INCREMENT_LINE (pfile, 0); + return false; + } } } @@ -2702,22 +2933,20 @@ _cpp_lex_direct (cpp_reader *pfile) buffer = pfile->buffer; if (buffer->need_line) { - if (pfile->state.in_deferred_pragma) - { - result->type = CPP_PRAGMA_EOL; - pfile->state.in_deferred_pragma = false; - if (!pfile->state.pragma_allow_expansion) - pfile->state.prevent_expansion--; - return result; - } + gcc_assert (!pfile->state.in_deferred_pragma); if (!_cpp_get_fresh_line (pfile)) { result->type = CPP_EOF; - if (!pfile->state.in_directive) + /* Not a real EOF in a directive or arg parsing -- we refuse + to advance to the next file now, and will once we're out + of those modes. */ + if (!pfile->state.in_directive && !pfile->state.parsing_args) { /* Tell the compiler the line number of the EOF token. */ result->src_loc = pfile->line_table->highest_line; result->flags = BOL; + /* Now pop the buffer that _cpp_get_fresh_line did not. */ + _cpp_pop_buffer (pfile); } return result; } @@ -2746,8 +2975,8 @@ _cpp_lex_direct (cpp_reader *pfile) } c = *buffer->cur++; - if (pfile->forced_token_location_p) - result->src_loc = *pfile->forced_token_location_p; + if (pfile->forced_token_location) + result->src_loc = pfile->forced_token_location; else result->src_loc = linemap_position_for_column (pfile->line_table, CPP_BUF_COLUMN (buffer, buffer->cur)); @@ -2760,9 +2989,28 @@ _cpp_lex_direct (cpp_reader *pfile) goto skipped_white; case '\n': - if (buffer->cur < buffer->rlimit) + /* Increment the line, unless this is the last line ... */ + if (buffer->cur < buffer->rlimit + /* ... or this is a #include, (where _cpp_stack_file needs to + unwind by one line) ... */ + || (pfile->state.in_directive > 1 + /* ... except traditional-cpp increments this elsewhere. */ + && !CPP_OPTION (pfile, traditional))) CPP_INCREMENT_LINE (pfile, 0); buffer->need_line = true; + if (pfile->state.in_deferred_pragma) + { + /* Produce the PRAGMA_EOL on this line. 
File reading + ensures there is always a \n at end of the buffer, thus + in a deferred pragma we always see CPP_PRAGMA_EOL before + any CPP_EOF. */ + result->type = CPP_PRAGMA_EOL; + result->flags &= ~PREV_WHITE; + pfile->state.in_deferred_pragma = false; + if (!pfile->state.pragma_allow_expansion) + pfile->state.prevent_expansion--; + return result; + } goto fresh_line; case '0': case '1': case '2': case '3': case '4': @@ -2853,7 +3101,7 @@ _cpp_lex_direct (cpp_reader *pfile) else if (c == '/' && ! CPP_OPTION (pfile, traditional)) { /* Don't warn for system headers. */ - if (cpp_in_system_header (pfile)) + if (_cpp_in_system_header (pfile)) ; /* Warn about comments if pedantically GNUC89, and not in system headers. */ @@ -2861,10 +3109,10 @@ _cpp_lex_direct (cpp_reader *pfile) && CPP_PEDANTIC (pfile) && ! buffer->warned_cplusplus_comments) { - cpp_error (pfile, CPP_DL_PEDWARN, - "C++ style comments are not allowed in ISO C90"); - cpp_error (pfile, CPP_DL_PEDWARN, - "(this will be reported only once per input file)"); + if (cpp_error (pfile, CPP_DL_PEDWARN, + "C++ style comments are not allowed in ISO C90")) + cpp_error (pfile, CPP_DL_NOTE, + "(this will be reported only once per input file)"); buffer->warned_cplusplus_comments = 1; } /* Or if specifically desired via -Wc90-c99-compat. */ @@ -2872,10 +3120,10 @@ _cpp_lex_direct (cpp_reader *pfile) && ! CPP_OPTION (pfile, cplusplus) && ! buffer->warned_cplusplus_comments) { - cpp_error (pfile, CPP_DL_WARNING, - "C++ style comments are incompatible with C90"); - cpp_error (pfile, CPP_DL_WARNING, - "(this will be reported only once per input file)"); + if (cpp_error (pfile, CPP_DL_WARNING, + "C++ style comments are incompatible with C90")) + cpp_error (pfile, CPP_DL_NOTE, + "(this will be reported only once per input file)"); buffer->warned_cplusplus_comments = 1; } /* In C89/C94, C++ style comments are forbidden. */ @@ -2895,11 +3143,12 @@ _cpp_lex_direct (cpp_reader *pfile) } else if (! 
buffer->warned_cplusplus_comments) { - cpp_error (pfile, CPP_DL_ERROR, - "C++ style comments are not allowed in ISO C90"); - cpp_error (pfile, CPP_DL_ERROR, - "(this will be reported only once per input " - "file)"); + if (cpp_error (pfile, CPP_DL_ERROR, + "C++ style comments are not allowed in " + "ISO C90")) + cpp_error (pfile, CPP_DL_NOTE, + "(this will be reported only once per input " + "file)"); buffer->warned_cplusplus_comments = 1; } } @@ -2951,7 +3200,13 @@ _cpp_lex_direct (cpp_reader *pfile) result->type = CPP_LESS; if (*buffer->cur == '=') - buffer->cur++, result->type = CPP_LESS_EQ; + { + buffer->cur++, result->type = CPP_LESS_EQ; + if (*buffer->cur == '>' + && CPP_OPTION (pfile, cplusplus) + && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20) + buffer->cur++, result->type = CPP_SPACESHIP; + } else if (*buffer->cur == '<') { buffer->cur++; @@ -3075,7 +3330,7 @@ _cpp_lex_direct (cpp_reader *pfile) case ':': result->type = CPP_COLON; - if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus)) + if (*buffer->cur == ':' && CPP_OPTION (pfile, scope)) buffer->cur++, result->type = CPP_SCOPE; else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs)) { @@ -3105,12 +3360,12 @@ _cpp_lex_direct (cpp_reader *pfile) /* @ is a punctuator in Objective-C. */ case '@': result->type = CPP_ATSIGN; break; - case '$': - case '\\': + default: { const uchar *base = --buffer->cur; - struct normalize_state nst = INITIAL_NORMALIZE_STATE; + /* Check for an extended identifier ($ or UCN or UTF-8). */ + struct normalize_state nst = INITIAL_NORMALIZE_STATE; if (forms_identifier_p (pfile, true, &nst)) { result->type = CPP_NAME; @@ -3119,13 +3374,21 @@ _cpp_lex_direct (cpp_reader *pfile) warn_about_normalization (pfile, result, &nst); break; } + + /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a + single token. 
*/ buffer->cur++; + if (c >= utf8_signifier) + { + const uchar *pstr = base; + cppchar_t s; + if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s)) + buffer->cur = pstr; + } + create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER); + break; } - /* FALLTHRU */ - default: - create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER); - break; } /* Potentially convert the location of the token to a range. */ @@ -3365,7 +3628,11 @@ cpp_output_token (const cpp_token *token, FILE *fp) break; case SPELL_LITERAL: + if (token->type == CPP_HEADER_NAME) + fputc ('"', fp); fwrite (token->val.str.text, 1, token->val.str.len, fp); + if (token->type == CPP_HEADER_NAME) + fputc ('"', fp); break; case SPELL_NONE: @@ -3454,6 +3721,7 @@ cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1, || (CPP_OPTION (pfile, objc) && token1->val.str.text[0] == '@' && (b == CPP_NAME || b == CPP_STRING))); + case CPP_LESS_EQ: return c == '>'; case CPP_STRING: case CPP_WSTRING: case CPP_UTF8STRING: @@ -3713,6 +3981,25 @@ _cpp_aligned_alloc (cpp_reader *pfile, size_t len) return result; } +/* Commit or allocate storage from a buffer. */ + +void * +_cpp_commit_buff (cpp_reader *pfile, size_t size) +{ + void *ptr = BUFF_FRONT (pfile->a_buff); + + if (pfile->hash_table->alloc_subobject) + { + void *copy = pfile->hash_table->alloc_subobject (size); + memcpy (copy, ptr, size); + ptr = copy; + } + else + BUFF_FRONT (pfile->a_buff) += size; + + return ptr; +} + /* Say which field of TOK is in use. */ enum cpp_token_fld_kind @@ -3725,7 +4012,11 @@ cpp_token_val_index (const cpp_token *tok) case SPELL_LITERAL: return CPP_TOKEN_FLD_STR; case SPELL_OPERATOR: - if (tok->type == CPP_PASTE) + /* Operands which were originally spelled as ident keep around + the node for the exact spelling. 
*/ + if (tok->flags & NAMED_OP) + return CPP_TOKEN_FLD_NODE; + else if (tok->type == CPP_PASTE) return CPP_TOKEN_FLD_TOKEN_NO; else return CPP_TOKEN_FLD_NONE; @@ -3742,14 +4033,14 @@ cpp_token_val_index (const cpp_token *tok) } } -/* All tokens lexed in R after calling this function will be forced to have - their source_location the same as the location referenced by P, until +/* All tokens lexed in R after calling this function will be forced to + have their location_t to be P, until cpp_stop_forcing_token_locations is called for R. */ void -cpp_force_token_locations (cpp_reader *r, source_location *p) +cpp_force_token_locations (cpp_reader *r, location_t loc) { - r->forced_token_location_p = p; + r->forced_token_location = loc; } /* Go back to assigning locations naturally for lexed tokens. */ @@ -3757,5 +4048,717 @@ cpp_force_token_locations (cpp_reader *r, source_location *p) void cpp_stop_forcing_token_locations (cpp_reader *r) { - r->forced_token_location_p = NULL; + r->forced_token_location = 0; +} + +/* We're looking at \, if it's escaping EOL, look past it. If at + LIMIT, don't advance. */ + +static const unsigned char * +do_peek_backslash (const unsigned char *peek, const unsigned char *limit) +{ + const unsigned char *probe = peek; + + if (__builtin_expect (peek[1] == '\n', true)) + { + eol: + probe += 2; + if (__builtin_expect (probe < limit, true)) + { + peek = probe; + if (*peek == '\\') + /* The user might be perverse. 
*/ + return do_peek_backslash (peek, limit); + } + } + else if (__builtin_expect (peek[1] == '\r', false)) + { + if (probe[2] == '\n') + probe++; + goto eol; + } + + return peek; +} + +static const unsigned char * +do_peek_next (const unsigned char *peek, const unsigned char *limit) +{ + if (__builtin_expect (*peek == '\\', false)) + peek = do_peek_backslash (peek, limit); + return peek; +} + +static const unsigned char * +do_peek_prev (const unsigned char *peek, const unsigned char *bound) +{ + if (peek == bound) + return NULL; + + unsigned char c = *--peek; + if (__builtin_expect (c == '\n', false) + || __builtin_expect (c == '\r', false)) /* Lone CR line ending. */ + { + if (peek == bound) + return peek; + int ix = -1; + if (c == '\n' && peek[ix] == '\r') + { + if (peek + ix == bound) + return peek; + ix--; + } + + if (peek[ix] == '\\') + return do_peek_prev (peek + ix, bound); + + return peek; + } + else + return peek; +} + +/* If PEEK[-1] is identifier MATCH, scan past it and trailing white + space. Otherwise return NULL. */ + +static const unsigned char * +do_peek_ident (const char *match, const unsigned char *peek, + const unsigned char *limit) +{ + for (; *++match; peek++) + if (*peek != *match) + { + peek = do_peek_next (peek, limit); + if (*peek != *match) + return NULL; + } + + /* Must now not be looking at an identifier char. */ + peek = do_peek_next (peek, limit); + if (ISIDNUM (*peek)) + return NULL; + + /* Skip control-line whitespace. */ + ws: + while (*peek == ' ' || *peek == '\t') + peek++; + if (__builtin_expect (*peek == '\\', false)) + { + peek = do_peek_backslash (peek, limit); + if (*peek != '\\') + goto ws; + } + + return peek; +} + +/* Are we looking at a module control line starting as PEEK - 1? 
*/ + +static bool +do_peek_module (cpp_reader *pfile, unsigned char c, + const unsigned char *peek, const unsigned char *limit) +{ + bool import = false; + + if (__builtin_expect (c == 'e', false)) + { + if (!((peek[0] == 'x' || peek[0] == '\\') + && (peek = do_peek_ident ("export", peek, limit)))) + return false; + + /* export, peek for import or module. No need to peek __import + here. */ + if (peek[0] == 'i') + { + if (!((peek[1] == 'm' || peek[1] == '\\') + && (peek = do_peek_ident ("import", peek + 1, limit)))) + return false; + import = true; + } + else if (peek[0] == 'm') + { + if (!((peek[1] == 'o' || peek[1] == '\\') + && (peek = do_peek_ident ("module", peek + 1, limit)))) + return false; + } + else + return false; + } + else if (__builtin_expect (c == 'i', false)) + { + if (!((peek[0] == 'm' || peek[0] == '\\') + && (peek = do_peek_ident ("import", peek, limit)))) + return false; + import = true; + } + else if (__builtin_expect (c == '_', false)) + { + /* Needed for translated includes. */ + if (!((peek[0] == '_' || peek[0] == '\\') + && (peek = do_peek_ident ("__import", peek, limit)))) + return false; + import = true; + } + else if (__builtin_expect (c == 'm', false)) + { + if (!((peek[0] == 'o' || peek[0] == '\\') + && (peek = do_peek_ident ("module", peek, limit)))) + return false; + } + else + return false; + + /* Peek the next character to see if it's good enough. We'll be at + the first non-whitespace char, including skipping an escaped + newline. */ + /* ... import followed by identifier, ':', '<' or header-name + preprocessing tokens, or module followed by identifier, ':' or + ';' preprocessing tokens. */ + unsigned char p = *peek++; + + /* A character literal is ... single quotes, ... optionally preceded + by u8, u, U, or L */ + /* A string-literal is a ... 
double quotes, optionally prefixed by + R, u8, u8R, u, uR, U, UR, L, or LR */ + if (p == 'u') + { + peek = do_peek_next (peek, limit); + if (*peek == '8') + { + peek++; + goto peek_u8; + } + goto peek_u; + } + else if (p == 'U' || p == 'L') + { + peek_u8: + peek = do_peek_next (peek, limit); + peek_u: + if (*peek == '\"' || *peek == '\'') + return false; + + if (*peek == 'R') + goto peek_R; + /* Identifier. Ok. */ + } + else if (p == 'R') + { + peek_R: + if (CPP_OPTION (pfile, rliterals)) + { + peek = do_peek_next (peek, limit); + if (*peek == '\"') + return false; + } + /* Identifier. Ok. */ + } + else if ('Z' - 'A' == 25 + ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_') + : ISIDST (p)) + { + /* Identifier. Ok. */ + } + else if (p == '<') + { + /* Maybe angle header, ok for import. Reject + '<=', '<<' digraph:'<:'. */ + if (!import) + return false; + peek = do_peek_next (peek, limit); + if (*peek == '=' || *peek == '<' + || (*peek == ':' && CPP_OPTION (pfile, digraphs))) + return false; + } + else if (p == ';') + { + /* SEMICOLON, ok for module. */ + if (import) + return false; + } + else if (p == '"') + { + /* STRING, ok for import. */ + if (!import) + return false; + } + else if (p == ':') + { + /* Maybe COLON, ok. Reject '::', digraph:':>'. */ + peek = do_peek_next (peek, limit); + if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs))) + return false; + } + else + /* FIXME: Detect a unicode character, excluding those not + permitted as the initial character. [lex.name]/1. I presume + we need to check the \[uU] spellings, and directly using + Unicode in say UTF8 form? Or perhaps we do the phase-1 + conversion of UTF8 to universal-character-names? */ + return false; + + return true; +} + +/* Directives-only scanning. Somewhat more relaxed than correct + parsing -- some ill-formed programs will not be rejected. 
*/ + +void +cpp_directive_only_process (cpp_reader *pfile, + void *data, + void (*cb) (cpp_reader *, CPP_DO_task, void *, ...)) +{ + bool module_p = CPP_OPTION (pfile, module_directives); + + do + { + restart: + /* Buffer initialization, but no line cleaning. */ + cpp_buffer *buffer = pfile->buffer; + buffer->cur_note = buffer->notes_used = 0; + buffer->cur = buffer->line_base = buffer->next_line; + buffer->need_line = false; + /* Files always end in a newline. We rely on this for + character peeking safety. */ + gcc_assert (buffer->rlimit[-1] == '\n'); + + const unsigned char *base = buffer->cur; + unsigned line_count = 0; + const unsigned char *line_start = base; + + bool bol = true; + bool raw = false; + + const unsigned char *lwm = base; + for (const unsigned char *pos = base, *limit = buffer->rlimit; + pos < limit;) + { + unsigned char c = *pos++; + /* This matches the switch in _cpp_lex_direct. */ + switch (c) + { + case ' ': case '\t': case '\f': case '\v': + /* Whitespace, do nothing. */ + break; + + case '\r': /* MAC line ending, or Windows \r\n */ + if (*pos == '\n') + pos++; + /* FALLTHROUGH */ + + case '\n': + bol = true; + + next_line: + CPP_INCREMENT_LINE (pfile, 0); + line_count++; + line_start = pos; + break; + + case '\\': + /* is removed, and doesn't undo any + preceeding escape or whatnot. */ + if (*pos == '\n') + { + pos++; + goto next_line; + } + else if (*pos == '\r') + { + if (pos[1] == '\n') + pos++; + pos++; + goto next_line; + } + goto dflt; + + case '#': + if (bol) + { + /* Line directive. */ + if (pos - 1 > base && !pfile->state.skipping) + cb (pfile, CPP_DO_print, data, + line_count, base, pos - 1 - base); + + /* Prep things for directive handling. */ + buffer->next_line = pos; + buffer->need_line = true; + bool ok = _cpp_get_fresh_line (pfile); + gcc_checking_assert (ok); + + /* Ensure proper column numbering for generated + error messages. 
*/ + buffer->line_base -= pos - line_start; + + _cpp_handle_directive (pfile, line_start + 1 != pos); + + /* Sanitize the line settings. Duplicate #include's can + mess things up. */ + // FIXME: Necessary? + pfile->line_table->highest_location + = pfile->line_table->highest_line; + + if (!pfile->state.skipping + && pfile->buffer->next_line < pfile->buffer->rlimit) + cb (pfile, CPP_DO_location, data, + pfile->line_table->highest_line); + + goto restart; + } + goto dflt; + + case '/': + { + const unsigned char *peek = do_peek_next (pos, limit); + if (!(*peek == '/' || *peek == '*')) + goto dflt; + + /* Line or block comment */ + bool is_block = *peek == '*'; + bool star = false; + bool esc = false; + location_t sloc + = linemap_position_for_column (pfile->line_table, + pos - line_start); + + while (pos < limit) + { + char c = *pos++; + switch (c) + { + case '\\': + esc = true; + break; + + case '\r': + if (*pos == '\n') + pos++; + /* FALLTHROUGH */ + + case '\n': + { + CPP_INCREMENT_LINE (pfile, 0); + line_count++; + line_start = pos; + if (!esc && !is_block) + { + bol = true; + goto done_comment; + } + } + if (!esc) + star = false; + esc = false; + break; + + case '*': + if (pos > peek && !esc) + star = is_block; + esc = false; + break; + + case '/': + if (star) + goto done_comment; + /* FALLTHROUGH */ + + default: + star = false; + esc = false; + break; + } + } + cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0, + "unterminated comment"); + done_comment: + lwm = pos; + break; + } + + case '\'': + if (!CPP_OPTION (pfile, digit_separators)) + goto delimited_string; + + /* Possibly a number punctuator. */ + if (!ISIDNUM (*do_peek_next (pos, limit))) + goto delimited_string; + + goto quote_peek; + + case '\"': + if (!CPP_OPTION (pfile, rliterals)) + goto delimited_string; + + quote_peek: + { + /* For ' see if it's a number punctuator + \.?(| + |'|'|[eEpP]|\.)* */ + /* For " see if it's a raw string + {U,L,u,u8}R. 
This includes CPP_NUMBER detection, + because that could be 0e+R. */ + const unsigned char *peek = pos - 1; + bool quote_first = c == '"'; + bool quote_eight = false; + bool maybe_number_start = false; + bool want_number = false; + + while ((peek = do_peek_prev (peek, lwm))) + { + unsigned char p = *peek; + if (quote_first) + { + if (!raw) + { + if (p != 'R') + break; + raw = true; + continue; + } + + quote_first = false; + if (p == 'L' || p == 'U' || p == 'u') + ; + else if (p == '8') + quote_eight = true; + else + goto second_raw; + } + else if (quote_eight) + { + if (p != 'u') + { + raw = false; + break; + } + quote_eight = false; + } + else if (c == '"') + { + second_raw:; + if (!want_number && ISIDNUM (p)) + { + raw = false; + break; + } + } + + if (ISDIGIT (p)) + maybe_number_start = true; + else if (p == '.') + want_number = true; + else if (ISIDNUM (p)) + maybe_number_start = false; + else if (p == '+' || p == '-') + { + if (const unsigned char *peek_prev + = do_peek_prev (peek, lwm)) + { + p = *peek_prev; + if (p == 'e' || p == 'E' + || p == 'p' || p == 'P') + { + want_number = true; + maybe_number_start = false; + } + else + break; + } + else + break; + } + else if (p == '\'' || p == '\"') + { + /* If this is lwm, this must be the end of a + previous string. So this is a trailing + literal type, (a) if those are allowed, + and (b) maybe_start is false. Otherwise + this must be a CPP_NUMBER because we've + met another ', and we'd have checked that + in its own right. */ + if (peek == lwm && CPP_OPTION (pfile, uliterals)) + { + if (!maybe_number_start && !want_number) + /* Must be a literal type. */ + raw = false; + } + else if (p == '\'' + && CPP_OPTION (pfile, digit_separators)) + maybe_number_start = true; + break; + } + else if (c == '\'') + break; + else if (!quote_first && !quote_eight) + break; + } + + if (maybe_number_start) + { + if (c == '\'') + /* A CPP NUMBER. 
*/ + goto dflt; + raw = false; + } + + goto delimited_string; + } + + delimited_string: + { + /* (Possibly raw) string or char literal. */ + unsigned char end = c; + int delim_len = -1; + const unsigned char *delim = NULL; + location_t sloc = linemap_position_for_column (pfile->line_table, + pos - line_start); + int esc = 0; + + if (raw) + { + /* There can be no line breaks in the delimiter. */ + delim = pos; + for (delim_len = 0; (c = *pos++) != '('; delim_len++) + { + if (delim_len == 16) + { + cpp_error_with_line (pfile, CPP_DL_ERROR, + sloc, 0, + "raw string delimiter" + " longer than %d" + " characters", + delim_len); + raw = false; + pos = delim; + break; + } + if (strchr (") \\\t\v\f\n", c)) + { + cpp_error_with_line (pfile, CPP_DL_ERROR, + sloc, 0, + "invalid character '%c'" + " in raw string" + " delimiter", c); + raw = false; + pos = delim; + break; + } + if (pos >= limit) + goto bad_string; + } + } + + while (pos < limit) + { + char c = *pos++; + switch (c) + { + case '\\': + if (!raw) + esc++; + break; + + case '\r': + if (*pos == '\n') + pos++; + /* FALLTHROUGH */ + + case '\n': + { + CPP_INCREMENT_LINE (pfile, 0); + line_count++; + line_start = pos; + } + if (esc) + esc--; + break; + + case ')': + if (raw + && pos + delim_len + 1 < limit + && pos[delim_len] == end + && !memcmp (delim, pos, delim_len)) + { + pos += delim_len + 1; + raw = false; + goto done_string; + } + break; + + default: + if (!raw && !(esc & 1) && c == end) + goto done_string; + esc = 0; + break; + } + } + bad_string: + cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0, + "unterminated literal"); + + done_string: + raw = false; + lwm = pos - 1; + } + goto dflt; + + case '_': + case 'e': + case 'i': + case 'm': + if (bol && module_p && !pfile->state.skipping + && do_peek_module (pfile, c, pos, limit)) + { + /* We've seen the start of a module control line. + Start up the tokenizer. */ + pos--; /* Backup over the first character. */ + + /* Backup over whitespace to start of line. 
*/ + while (pos > line_start + && (pos[-1] == ' ' || pos[-1] == '\t')) + pos--; + + if (pos > base) + cb (pfile, CPP_DO_print, data, line_count, base, pos - base); + + /* Prep things for directive handling. */ + buffer->next_line = pos; + buffer->need_line = true; + + /* Now get tokens until the PRAGMA_EOL. */ + do + { + location_t spelling; + const cpp_token *tok + = cpp_get_token_with_location (pfile, &spelling); + + gcc_assert (pfile->state.in_deferred_pragma + || tok->type == CPP_PRAGMA_EOL); + cb (pfile, CPP_DO_token, data, tok, spelling); + } + while (pfile->state.in_deferred_pragma); + + if (pfile->buffer->next_line < pfile->buffer->rlimit) + cb (pfile, CPP_DO_location, data, + pfile->line_table->highest_line); + + pfile->mi_valid = false; + goto restart; + } + goto dflt; + + default: + dflt: + bol = false; + pfile->mi_valid = false; + break; + } + } + + if (buffer->rlimit > base && !pfile->state.skipping) + cb (pfile, CPP_DO_print, data, line_count, base, buffer->rlimit - base); + + _cpp_pop_buffer (pfile); + } + while (pfile->buffer); }