From: Alan Modra Date: Fri, 13 Aug 2021 07:50:10 +0000 (+0930) Subject: ld lexer tidy, possibly break the world X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=f400c8d27e11477d79ba67ca930ca7e7511b9ee5;p=binutils-gdb.git ld lexer tidy, possibly break the world This tidies the states in which ld lexer rules are enabled. This change will quite likely trip over issues similar to those mentioned in the new ldlex.l comments, so please test it out. * ldgram.y (wildcard_name): Remove now unnecessary components. * ldlex.l: Restrict many rules' states. Remove -l expression state rule. Comment on lookahead state madness and need for /DISCARD/ in expression state. --- diff --git a/ld/ldgram.y b/ld/ldgram.y index 24979deebbe..1f6c44a073c 100644 --- a/ld/ldgram.y +++ b/ld/ldgram.y @@ -421,21 +421,11 @@ statement_anywhere: lang_add_assignment (exp_assert ($4, $6)); } ; -/* The '*' and '?' cases are there because the lexer returns them as - separate tokens rather than as NAME. */ wildcard_name: NAME { $$ = $1; } - | '*' - { - $$ = "*"; - } - | '?' - { - $$ = "?"; - } ; wildcard_maybe_exclude: diff --git a/ld/ldlex.l b/ld/ldlex.l index b0861d78e49..6aeba6de656 100644 --- a/ld/ldlex.l +++ b/ld/ldlex.l @@ -192,132 +192,155 @@ V_IDENTIFIER [*?.$_a-zA-Z\[\]\-\!\^\\]([*?.$_a-zA-Z0-9\[\]\-\!\^\\]|::)* } return INT; } -"]" { RTOKEN(']');} -"[" { RTOKEN('[');} -"<<=" { RTOKEN(LSHIFTEQ);} -">>=" { RTOKEN(RSHIFTEQ);} -"||" { RTOKEN(OROR);} -"==" { RTOKEN(EQ);} -"!=" { RTOKEN(NE);} -">=" { RTOKEN(GE);} -"<=" { RTOKEN(LE);} -"<<" { RTOKEN(LSHIFT);} -">>" { RTOKEN(RSHIFT);} -"+=" { RTOKEN(PLUSEQ);} -"-=" { RTOKEN(MINUSEQ);} -"*=" { RTOKEN(MULTEQ);} -"/=" { RTOKEN(DIVEQ);} -"&=" { RTOKEN(ANDEQ);} -"|=" { RTOKEN(OREQ);} -"&&" { RTOKEN(ANDAND);} -">" { RTOKEN('>');} -"," { RTOKEN(',');} -"&" { RTOKEN('&');} -"|" { RTOKEN('|');} -"~" { RTOKEN('~');} -"!" { RTOKEN('!');} -"?" { RTOKEN('?');} -"*" { RTOKEN('*');} -"+" { RTOKEN('+');} -"-" { RTOKEN('-');} -"/" { RTOKEN('/');} -"%" { RTOKEN('%');} -"<" { RTOKEN('<');} -"=" { RTOKEN('=');} + + /* Some tokens that only appear in expressions must be enabled for + states other than EXPRESSION, since parser lookahead means they + must be recognised before the parser switches the lexer out of + SCRIPT or WILD state into EXPRESSION state. + + This sort of thing happens for example with NAME in ldgram.y + "section" rule, which is immediately followed by ldlex_expression. + However, if you follow the grammar from "sec_or_group_p1" you see + "assignment" appearing in "statement_anywhere". Now, + "assignment" also has NAME as its first token, just like + "section". So the parser can't know whether it is in the + "section" or the "assignment" rule until it has scanned the next + token to find an assignment operator. Thus the next token after + NAME in the "section" rule may be lexed before the lexer is + switched to EXPRESSION state, and there are quite a number of + optional components. The first token in all those components + must be able to be lexed in SCRIPT state, as well as the + assignment operators. In fact, due to "opt_exp_with_type", + anything that can appear on the left hand side of "exp" might + need to be lexed in SCRIPT state. + + MRI mode tends to cover everything in MRI scripts. + */ +"]" { RTOKEN(']'); } +"[" { RTOKEN('['); } +"<<=" { RTOKEN(LSHIFTEQ); } +">>=" { RTOKEN(RSHIFTEQ); } +"||" { RTOKEN(OROR); } +"==" { RTOKEN(EQ); } +"!=" { RTOKEN(NE); } +">=" { RTOKEN(GE); } +"<=" { RTOKEN(LE); } +"<<" { RTOKEN(LSHIFT); } +">>" { RTOKEN(RSHIFT); } +"+=" { RTOKEN(PLUSEQ); } +"-=" { RTOKEN(MINUSEQ); } +"*=" { RTOKEN(MULTEQ); } +"/=" { RTOKEN(DIVEQ); } +"&=" { RTOKEN(ANDEQ); } +"|=" { RTOKEN(OREQ); } +"&&" { RTOKEN(ANDAND); } +">" { RTOKEN('>'); } +"," { RTOKEN(','); } +"&" { RTOKEN('&'); } +"|" { RTOKEN('|'); } +"~" { RTOKEN('~'); } +"!" { RTOKEN('!'); } +"?" { RTOKEN('?'); } +"*" { RTOKEN('*'); } +"+" { RTOKEN('+'); } +"-" { RTOKEN('-'); } +"/" { RTOKEN('/'); } +"%" { RTOKEN('%'); } +"<" { RTOKEN('<'); } +"=" { RTOKEN('='); } "}" { RTOKEN('}'); } "{" { RTOKEN('{'); } -")" { RTOKEN(')');} -"(" { RTOKEN('(');} +")" { RTOKEN(')'); } +"(" { RTOKEN('('); } ":" { RTOKEN(':'); } -";" { RTOKEN(';');} -