From c9c3d5f28a589cd00be5748010783657189e9855 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Wed, 18 Nov 2020 10:24:12 -0800 Subject: [PATCH] preprocessor: C++ module-directives C++20 modules introduces a new kind of preprocessor directive -- a module directive. These are directives but without the leading '#'. We have to detect them by sniffing the start of a logical line. When detected we replace the initial identifiers with unspellable tokens and pass them through to the language parser the same way deferred pragmas are. There's a PRAGMA_EOL at the logical end of line too. One additional complication is that we have to do header-name lexing after the initial tokens, and that requires changes in the macro-aware piece of the preprocessor. The above sniffer sets a counter in the lexer state, and that triggers at the appropriate point. We then do the same header-name lexing that occurs on a #include directive or has_include pseudo-macro. Except that the header name ends up in the token stream. A couple of token emitters need to deal with the new token possibility. gcc/c-family/ * c-lex.c (c_lex_with_flags): CPP_HEADER_NAMEs can now be seen. libcpp/ * include/cpplib.h (struct cpp_options): Add module_directives option. (NODE_MODULE): New node flag. (struct cpp_hashnode): Make rid-code a bitfield, increase bits in flags and swap with type field. * init.c (post_options): Create module-directive identifier nodes. * internal.h (struct lexer_state): Add directive_file_token & n_modules fields. Add module node enumerator. * lex.c (cpp_maybe_module_directive): New. (_cpp_lex_token): Call it. (cpp_output_token): Add '"' around CPP_HEADER_NAME token. (do_peek_ident, do_peek_module): New. (cpp_directives_only): Detect module-directive lines. * macro.c (cpp_get_token_1): Deal with directive_file_token triggering. --- gcc/c-family/c-lex.c | 5 +- libcpp/include/cpplib.h | 10 +- libcpp/init.c | 23 +++ libcpp/internal.h | 9 + libcpp/lex.c | 392 ++++++++++++++++++++++++++++++++++++++++ libcpp/macro.c | 79 ++++++++ 6 files changed, 514 insertions(+), 4 deletions(-) diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c index 8dd1420d10d..c8d33d0c9d1 100644 --- a/gcc/c-family/c-lex.c +++ b/gcc/c-family/c-lex.c @@ -667,8 +667,11 @@ c_lex_with_flags (tree *value, location_t *loc, unsigned char *cpp_flags, *value = build_int_cst (integer_type_node, tok->val.pragma); break; - /* These tokens should not be visible outside cpplib. */ case CPP_HEADER_NAME: + *value = build_string (tok->val.str.len, (const char *)tok->val.str.text); + break; + + /* This token should not be visible outside cpplib. */ case CPP_MACRO_ARG: gcc_unreachable (); diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 389af32bc5c..630f2e055d1 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -487,6 +487,9 @@ struct cpp_options /* Nonzero for the '::' token. */ unsigned char scope; + /* Nonzero means tokenize C++20 module directives. */ + unsigned char module_directives; + /* Holds the name of the target (execution) character set. */ const char *narrow_charset; @@ -842,6 +845,7 @@ struct GTY(()) cpp_macro { #define NODE_USED (1 << 5) /* Dumped with -dU. */ #define NODE_CONDITIONAL (1 << 6) /* Conditional macro */ #define NODE_WARN_OPERATOR (1 << 7) /* Warn about C++ named operator. */ +#define NODE_MODULE (1 << 8) /* C++-20 module-related name. */ /* Different flavors of hash node. */ enum node_type @@ -900,11 +904,11 @@ struct GTY(()) cpp_hashnode { unsigned int directive_index : 7; /* If is_directive, then index into directive table. Otherwise, a NODE_OPERATOR. */ - unsigned char rid_code; /* Rid code - for front ends. */ + unsigned int rid_code : 8; /* Rid code - for front ends. */ + unsigned int flags : 9; /* CPP flags. */ ENUM_BITFIELD(node_type) type : 2; /* CPP node type. */ - unsigned int flags : 8; /* CPP flags. */ - /* 6 bits spare (plus another 32 on 64-bit hosts). */ + /* 5 bits spare (plus another 32 on 64-bit hosts). */ union _cpp_hashnode_value GTY ((desc ("%1.type"))) value; }; diff --git a/libcpp/init.c b/libcpp/init.c index 76882bc5f1c..fc826583d3a 100644 --- a/libcpp/init.c +++ b/libcpp/init.c @@ -843,4 +843,27 @@ post_options (cpp_reader *pfile) CPP_OPTION (pfile, trigraphs) = 0; CPP_OPTION (pfile, warn_trigraphs) = 0; } + + if (CPP_OPTION (pfile, module_directives)) + { + /* These unspellable tokens have a leading space. */ + const char *const inits[spec_nodes::M_HWM] + = {"export ", "module ", "import ", "__import"}; + + for (int ix = 0; ix != spec_nodes::M_HWM; ix++) + { + cpp_hashnode *node = cpp_lookup (pfile, UC (inits[ix]), + strlen (inits[ix])); + + /* Token we pass to the compiler. */ + pfile->spec_nodes.n_modules[ix][1] = node; + + if (ix != spec_nodes::M__IMPORT) + /* Token we recognize when lexing, drop the trailing ' '. */ + node = cpp_lookup (pfile, NODE_NAME (node), NODE_LEN (node) - 1); + + node->flags |= NODE_MODULE; + pfile->spec_nodes.n_modules[ix][0] = node; + } + } } diff --git a/libcpp/internal.h b/libcpp/internal.h index 3f5bafbccde..4c1100bbdeb 100644 --- a/libcpp/internal.h +++ b/libcpp/internal.h @@ -280,6 +280,9 @@ struct lexer_state /* Nonzero when tokenizing a deferred pragma. */ unsigned char in_deferred_pragma; + /* Count to token that is a header-name. */ + unsigned char directive_file_token; + /* Nonzero if the deferred pragma being handled allows macro expansion. */ unsigned char pragma_allow_expansion; }; @@ -292,6 +295,12 @@ struct spec_nodes cpp_hashnode *n_false; /* C++ keyword false */ cpp_hashnode *n__VA_ARGS__; /* C99 vararg macros */ cpp_hashnode *n__VA_OPT__; /* C++ vararg macros */ + + enum {M_EXPORT, M_MODULE, M_IMPORT, M__IMPORT, M_HWM}; + + /* C++20 modules, only set when module_directives is in effect. + incoming variants [0], outgoing ones [1] */ + cpp_hashnode *n_modules[M_HWM][2]; }; typedef struct _cpp_line_note _cpp_line_note; diff --git a/libcpp/lex.c b/libcpp/lex.c index f58a8828124..2343ed5aa00 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -2615,6 +2615,150 @@ _cpp_temp_token (cpp_reader *pfile) return result; } +/* We're at the beginning of a logical line (so not in + directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See + if we should enter deferred_pragma mode to tokenize the rest of the + line as a module control-line. */ + +static void +cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result) +{ + unsigned backup = 0; /* Tokens we peeked. */ + cpp_hashnode *node = result->val.node.node; + cpp_token *peek = result; + cpp_token *keyword = peek; + cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules; + int header_count = 0; + + /* Make sure the incoming state is as we expect it. This way we + can restore it using constants. */ + gcc_checking_assert (!pfile->state.in_deferred_pragma + && !pfile->state.skipping + && !pfile->state.parsing_args + && !pfile->state.angled_headers + && (pfile->state.save_comments + == !CPP_OPTION (pfile, discard_comments))); + + /* Enter directives mode sufficiently for peeking. We don't have + to actually set in_directive. */ + pfile->state.in_deferred_pragma = true; + + /* These two fields are needed to process tokenization in deferred + pragma mode. They are not used outside deferred pragma mode or + directives mode. */ + pfile->state.pragma_allow_expansion = true; + pfile->directive_line = result->src_loc; + + /* Saving comments is incompatible with directives mode. */ + pfile->state.save_comments = 0; + + if (node == n_modules[spec_nodes::M_EXPORT][0]) + { + peek = _cpp_lex_direct (pfile); + keyword = peek; + backup++; + if (keyword->type != CPP_NAME) + goto not_module; + node = keyword->val.node.node; + if (!(node->flags & NODE_MODULE)) + goto not_module; + } + + if (node == n_modules[spec_nodes::M__IMPORT][0]) + /* __import */ + header_count = backup + 2 + 16; + else if (node == n_modules[spec_nodes::M_IMPORT][0]) + /* import */ + header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0); + else if (node == n_modules[spec_nodes::M_MODULE][0]) + ; /* module */ + else + goto not_module; + + /* We've seen [export] {module|import|__import}. Check the next token. */ + if (header_count) + /* After '{,__}import' a header name may appear. */ + pfile->state.angled_headers = true; + peek = _cpp_lex_direct (pfile); + backup++; + + /* ... import followed by identifier, ':', '<' or + header-name preprocessing tokens, or module + followed by cpp-identifier, ':' or ';' preprocessing + tokens. C++ keywords are not yet relevant. */ + if (peek->type == CPP_NAME + || peek->type == CPP_COLON + || (header_count + ? (peek->type == CPP_LESS + || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R') + || peek->type == CPP_HEADER_NAME) + : peek->type == CPP_SEMICOLON)) + { + pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed); + if (!pfile->state.pragma_allow_expansion) + pfile->state.prevent_expansion++; + + if (!header_count && linemap_included_from + (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table))) + cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0, + "module control-line cannot be in included file"); + + /* The first one or two tokens cannot be macro names. */ + for (int ix = backup; ix--;) + { + cpp_token *tok = ix ? keyword : result; + cpp_hashnode *node = tok->val.node.node; + + /* Don't attempt to expand the token. */ + tok->flags |= NO_EXPAND; + if (_cpp_defined_macro_p (node) + && !cpp_fun_like_macro_p (node)) + cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0, + "module control-line \"%s\" cannot be" + " an object-like macro", + NODE_NAME (node)); + } + + /* Map to underbar variants. */ + keyword->val.node.node = n_modules[header_count + ? spec_nodes::M_IMPORT + : spec_nodes::M_MODULE][1]; + if (backup != 1) + result->val.node.node = n_modules[spec_nodes::M_EXPORT][1]; + + /* Maybe tell the tokenizer we expect a header-name down the + road. */ + pfile->state.directive_file_token = header_count; + } + else + { + not_module: + /* Drop out of directive mode. */ + /* We aaserted save_comments had this value upon entry. */ + pfile->state.save_comments + = !CPP_OPTION (pfile, discard_comments); + pfile->state.in_deferred_pragma = false; + /* Do not let this remain on. */ + pfile->state.angled_headers = false; + } + + /* In either case we want to backup the peeked tokens. */ + if (backup) + { + /* If we saw EOL, we should drop it, because this isn't a module + control-line after all. */ + bool eol = peek->type == CPP_PRAGMA_EOL; + if (!eol || backup > 1) + { + /* Put put the peeked tokens back */ + _cpp_backup_tokens_direct (pfile, backup); + /* But if the last one was an EOL, forget it. */ + if (eol) + pfile->lookaheads--; + } + } +} + /* Lex a token into RESULT (external interface). Takes care of issues like directive handling, token lookahead, multiple include optimization and skipping. */ @@ -2663,6 +2807,21 @@ _cpp_lex_token (cpp_reader *pfile) } else if (pfile->state.in_deferred_pragma) result = &pfile->directive_result; + else if (result->type == CPP_NAME + && (result->val.node.node->flags & NODE_MODULE) + && !pfile->state.skipping + /* Unlike regular directives, we do not deal with + tokenizing module directives as macro arguments. + That's not permitted. */ + && !pfile->state.parsing_args) + { + /* P1857. Before macro expansion, At start of logical + line ... */ + /* We don't have to consider lookaheads at this point. */ + gcc_checking_assert (!pfile->lookaheads); + + cpp_maybe_module_directive (pfile, result); + } if (pfile->cb.line_change && !pfile->state.skipping) pfile->cb.line_change (pfile, result, pfile->state.parsing_args); @@ -3461,7 +3620,11 @@ cpp_output_token (const cpp_token *token, FILE *fp) break; case SPELL_LITERAL: + if (token->type == CPP_HEADER_NAME) + fputc ('"', fp); fwrite (token->val.str.text, 1, token->val.str.len, fp); + if (token->type == CPP_HEADER_NAME) + fputc ('"', fp); break; case SPELL_NONE: @@ -3947,6 +4110,188 @@ do_peek_prev (const unsigned char *peek, const unsigned char *bound) return peek; } +/* If PEEK[-1] is identifier MATCH, scan past it and trailing white + space. Otherwise return NULL. */ + +static const unsigned char * +do_peek_ident (const char *match, const unsigned char *peek, + const unsigned char *limit) +{ + for (; *++match; peek++) + if (*peek != *match) + { + peek = do_peek_next (peek, limit); + if (*peek != *match) + return NULL; + } + + /* Must now not be looking at an identifier char. */ + peek = do_peek_next (peek, limit); + if (ISIDNUM (*peek)) + return NULL; + + /* Skip control-line whitespace. */ + ws: + while (*peek == ' ' || *peek == '\t') + peek++; + if (__builtin_expect (*peek == '\\', false)) + { + peek = do_peek_backslash (peek, limit); + if (*peek != '\\') + goto ws; + } + + return peek; +} + +/* Are we looking at a module control line starting as PEEK - 1? */ + +static bool +do_peek_module (cpp_reader *pfile, unsigned char c, + const unsigned char *peek, const unsigned char *limit) +{ + bool import = false; + + if (__builtin_expect (c == 'e', false)) + { + if (!((peek[0] == 'x' || peek[0] == '\\') + && (peek = do_peek_ident ("export", peek, limit)))) + return false; + + /* export, peek for import or module. No need to peek __import + here. */ + if (peek[0] == 'i') + { + if (!((peek[1] == 'm' || peek[1] == '\\') + && (peek = do_peek_ident ("import", peek + 1, limit)))) + return false; + import = true; + } + else if (peek[0] == 'm') + { + if (!((peek[1] == 'o' || peek[1] == '\\') + && (peek = do_peek_ident ("module", peek + 1, limit)))) + return false; + } + else + return false; + } + else if (__builtin_expect (c == 'i', false)) + { + if (!((peek[0] == 'm' || peek[0] == '\\') + && (peek = do_peek_ident ("import", peek, limit)))) + return false; + import = true; + } + else if (__builtin_expect (c == '_', false)) + { + /* Needed for translated includes. */ + if (!((peek[0] == '_' || peek[0] == '\\') + && (peek = do_peek_ident ("__import", peek, limit)))) + return false; + import = true; + } + else if (__builtin_expect (c == 'm', false)) + { + if (!((peek[0] == 'o' || peek[0] == '\\') + && (peek = do_peek_ident ("module", peek, limit)))) + return false; + } + else + return false; + + /* Peek the next character to see if it's good enough. We'll be at + the first non-whitespace char, including skipping an escaped + newline. */ + /* ... import followed by identifier, ':', '<' or header-name + preprocessing tokens, or module followed by identifier, ':' or + ';' preprocessing tokens. */ + unsigned char p = *peek++; + + /* A character literal is ... single quotes, ... optionally preceded + by u8, u, U, or L */ + /* A string-literal is a ... double quotes, optionally prefixed by + R, u8, u8R, u, uR, U, UR, L, or LR */ + if (p == 'u') + { + peek = do_peek_next (peek, limit); + if (*peek == '8') + { + peek++; + goto peek_u8; + } + goto peek_u; + } + else if (p == 'U' || p == 'L') + { + peek_u8: + peek = do_peek_next (peek, limit); + peek_u: + if (*peek == '\"' || *peek == '\'') + return false; + + if (*peek == 'R') + goto peek_R; + /* Identifier. Ok. */ + } + else if (p == 'R') + { + peek_R: + if (CPP_OPTION (pfile, rliterals)) + { + peek = do_peek_next (peek, limit); + if (*peek == '\"') + return false; + } + /* Identifier. Ok. */ + } + else if ('Z' - 'A' == 25 + ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_') + : ISIDST (p)) + { + /* Identifier. Ok. */ + } + else if (p == '<') + { + /* Maybe angle header, ok for import. Reject + '<=', '<<' digraph:'<:'. */ + if (!import) + return false; + peek = do_peek_next (peek, limit); + if (*peek == '=' || *peek == '<' + || (*peek == ':' && CPP_OPTION (pfile, digraphs))) + return false; + } + else if (p == ';') + { + /* SEMICOLON, ok for module. */ + if (import) + return false; + } + else if (p == '"') + { + /* STRING, ok for import. */ + if (!import) + return false; + } + else if (p == ':') + { + /* Maybe COLON, ok. Reject '::', digraph:':>'. */ + peek = do_peek_next (peek, limit); + if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs))) + return false; + } + else + /* FIXME: Detect a unicode character, excluding those not + permitted as the initial character. [lex.name]/1. I presume + we need to check the \[uU] spellings, and directly using + Unicode in say UTF8 form? Or perhaps we do the phase-1 + conversion of UTF8 to universal-character-names? */ + return false; + + return true; +} + /* Directives-only scanning. Somewhat more relaxed than correct parsing -- some ill-formed programs will not be rejected. */ @@ -3955,6 +4300,8 @@ cpp_directive_only_process (cpp_reader *pfile, void *data, void (*cb) (cpp_reader *, CPP_DO_task, void *, ...)) { + bool module_p = CPP_OPTION (pfile, module_directives); + do { restart: @@ -4347,6 +4694,51 @@ cpp_directive_only_process (cpp_reader *pfile, } goto dflt; + case '_': + case 'e': + case 'i': + case 'm': + if (bol && module_p && !pfile->state.skipping + && do_peek_module (pfile, c, pos, limit)) + { + /* We've seen the start of a module control line. + Start up the tokenizer. */ + pos--; /* Backup over the first character. */ + + /* Backup over whitespace to start of line. */ + while (pos > line_start + && (pos[-1] == ' ' || pos[-1] == '\t')) + pos--; + + if (pos > base) + cb (pfile, CPP_DO_print, data, line_count, base, pos - base); + + /* Prep things for directive handling. */ + buffer->next_line = pos; + buffer->need_line = true; + + /* Now get tokens until the PRAGMA_EOL. */ + do + { + location_t spelling; + const cpp_token *tok + = cpp_get_token_with_location (pfile, &spelling); + + gcc_assert (pfile->state.in_deferred_pragma + || tok->type == CPP_PRAGMA_EOL); + cb (pfile, CPP_DO_token, data, tok, spelling); + } + while (pfile->state.in_deferred_pragma); + + if (pfile->buffer->next_line < pfile->buffer->rlimit) + cb (pfile, CPP_DO_location, data, + pfile->line_table->highest_line); + + pfile->mi_valid = false; + goto restart; + } + goto dflt; + default: dflt: bol = false; diff --git a/libcpp/macro.c b/libcpp/macro.c index aa16752e2b2..ddcf3b4f63a 100644 --- a/libcpp/macro.c +++ b/libcpp/macro.c @@ -2963,6 +2963,85 @@ cpp_get_token_1 (cpp_reader *pfile, location_t *location) } pfile->about_to_expand_macro_p = saved_about_to_expand_macro; + + if (pfile->state.directive_file_token + && !pfile->state.parsing_args + && !(result->type == CPP_PADDING || result->type == CPP_COMMENT) + && !(15 & --pfile->state.directive_file_token)) + { + /* Do header-name frobbery. Concatenate < ... > as approprate. + Do header search if needed, and finally drop the outer <> or + "". */ + pfile->state.angled_headers = false; + + /* Do angle-header reconstitution. Then do include searching. + We'll always end up with a ""-quoted header-name in that + case. If searching finds nothing, we emit a diagnostic and + an empty string. */ + size_t len = 0; + char *fname = NULL; + + cpp_token *tmp = _cpp_temp_token (pfile); + *tmp = *result; + + tmp->type = CPP_HEADER_NAME; + bool need_search = !pfile->state.directive_file_token; + pfile->state.directive_file_token = 0; + + bool angle = result->type != CPP_STRING; + if (result->type == CPP_HEADER_NAME + || (result->type == CPP_STRING && result->val.str.text[0] != 'R')) + { + len = result->val.str.len - 2; + fname = XNEWVEC (char, len + 1); + memcpy (fname, result->val.str.text + 1, len); + fname[len] = 0; + } + else if (result->type == CPP_LESS) + fname = _cpp_bracket_include (pfile); + + if (fname) + { + /* We have a header-name. Look it up. This will emit an + unfound diagnostic. Canonicalize the found name. */ + const char *found = fname; + + if (need_search) + { + found = cpp_find_header_unit (pfile, fname, angle, tmp->src_loc); + if (!found) + found = ""; + len = strlen (found); + } + /* Force a leading './' if it's not absolute. */ + bool dotme = (found[0] == '.' ? !IS_DIR_SEPARATOR (found[1]) + : found[0] && !IS_ABSOLUTE_PATH (found)); + + if (BUFF_ROOM (pfile->u_buff) < len + 1 + dotme * 2) + _cpp_extend_buff (pfile, &pfile->u_buff, len + 1 + dotme * 2); + unsigned char *buf = BUFF_FRONT (pfile->u_buff); + size_t pos = 0; + + if (dotme) + { + buf[pos++] = '.'; + /* Apparently '/' is unconditional. */ + buf[pos++] = '/'; + } + memcpy (&buf[pos], found, len); + pos += len; + buf[pos] = 0; + + tmp->val.str.len = pos; + tmp->val.str.text = buf; + + tmp->type = CPP_HEADER_NAME; + XDELETEVEC (fname); + + result = tmp; + } + } + return result; } -- 2.30.2