From 3ff81670848abb29b92e78f45080ad36cc85001c Mon Sep 17 00:00:00 2001 From: Carl Worth Date: Tue, 25 May 2010 13:09:03 -0700 Subject: [PATCH] Starting over with the C99 grammar for the preprocessor. This is a fresh start with a much simpler approach for the flex/bison portions of the preprocessor. This isn't functional yet, (produces no output), but can at least read all of our test cases without any parse errors. The grammar here is based on the grammar provided for the preprocessor in the C99 specification. --- glcpp-lex.l | 197 ++++------------------- glcpp-parse.y | 401 ++++++++--------------------------------------- tests/glcpp-test | 5 +- 3 files changed, 98 insertions(+), 505 deletions(-) diff --git a/glcpp-lex.l b/glcpp-lex.l index ee1f6e3aeea..f1dd11ea9bd 100644 --- a/glcpp-lex.l +++ b/glcpp-lex.l @@ -32,21 +32,14 @@ %option reentrant noyywrap %option extra-type="glcpp_parser_t *" -%x ST_DEFINE -%x ST_DEFINE_OBJ_OR_FUNC -%x ST_DEFINE_PARAMETER -%x ST_DEFINE_VALUE -%x ST_IF -%x ST_UNDEF -%x ST_UNDEF_END - SPACE [[:space:]] NONSPACE [^[:space:]] NEWLINE [\n] HSPACE [ \t] HASH ^{HSPACE}*#{HSPACE}* IDENTIFIER [_a-zA-Z][_a-zA-Z0-9]* -TOKEN [^[:space:](),]+ +PUNCTUATION [][(){}.&*~!/%<>^|;,+-] +OTHER [^][(){}.&*~!/%<>^|;,=#[:space:]+-]+ DECIMAL_INTEGER [1-9][0-9]*[uU]? OCTAL_INTEGER 0[0-7]*[uU]? @@ -54,208 +47,74 @@ HEXADECIMAL_INTEGER 0[xX][0-9a-fA-F]+[uU]? 
%% -{HASH}if{HSPACE}* { - BEGIN ST_IF; - return IF; -} - -{HASH}elif{HSPACE}* { - BEGIN ST_IF; - return ELIF; +{HASH}define{HSPACE}+/{IDENTIFIER}"(" { + return HASH_DEFINE_FUNC; } -{DECIMAL_INTEGER} { - yylval.ival = strtoll (yytext, NULL, 10); - return INTEGER; +{HASH}define { + return HASH_DEFINE_OBJ; } -{OCTAL_INTEGER} { - yylval.ival = strtoll (yytext + 1, NULL, 8); - return INTEGER; +{HASH}undef { + return HASH_UNDEF; } -{HEXADECIMAL_INTEGER} { - yylval.ival = strtoll (yytext + 2, NULL, 16); - return INTEGER; +{HASH} { + return HASH; } -"defined" { - return DEFINED; +{IDENTIFIER} { + yylval.str = xtalloc_strdup (yyextra, yytext); + return IDENTIFIER; } -"<<" { +"<<" { return LEFT_SHIFT; } -">>" { +">>" { return RIGHT_SHIFT; } -"<=" { +"<=" { return LESS_OR_EQUAL; } -">=" { +">=" { return GREATER_OR_EQUAL; } -"==" { +"==" { return EQUAL; } -"!=" { +"!=" { return NOT_EQUAL; } -"&&" { +"&&" { return AND; } -"||" { +"||" { return OR; } -[-+*/%<>&^|()~] { - return yytext[0]; -} - -{IDENTIFIER} { - yylval.str = xtalloc_strdup (yyextra, yytext); - return IDENTIFIER; -} - -{HSPACE}+ - -\n { - BEGIN INITIAL; - return NEWLINE; -} - -{HASH}endif{HSPACE}* { - return ENDIF; +"##" { + return PASTE; } -{HASH}else{HSPACE}* { - return ELSE; -} - -{HASH}undef{HSPACE}* { - BEGIN ST_UNDEF; - return UNDEF; -} - -{IDENTIFIER} { - BEGIN ST_UNDEF_END; - yylval.str = xtalloc_strdup (yyextra, yytext); - return IDENTIFIER; -} - -{HSPACE}* - -\n { - BEGIN INITIAL; -} - - /* We use the ST_DEFINE and ST_DEFVAL states so that we can - * pass a space token, (yes, a token for whitespace!), since - * the preprocessor specification requires distinguishing - * "#define foo()" from "#define foo ()". 
- */ -{HASH}define{HSPACE}* { - BEGIN ST_DEFINE; - return DEFINE; -} - -{IDENTIFIER} { - BEGIN ST_DEFINE_OBJ_OR_FUNC; - yylval.str = xtalloc_strdup (yyextra, yytext); - return IDENTIFIER; -} - -\n { - BEGIN INITIAL; - return NEWLINE; -} - -{HSPACE}+ { - BEGIN ST_DEFINE_VALUE; - return SPACE; -} - -"(" { - BEGIN ST_DEFINE_PARAMETER; - return '('; -} - -{IDENTIFIER} { - yylval.str = xtalloc_strdup (yyextra, yytext); - return IDENTIFIER; -} - -"," { - return ','; -} - -")" { - BEGIN ST_DEFINE_VALUE; - return ')'; -} - -{HSPACE}+ - -{TOKEN} { - yylval.token.type = TOKEN; - yylval.token.value = xtalloc_strdup (yyextra, yytext); - return TOKEN; -} - -[(),] { - yylval.token.type = TOKEN; - yylval.token.value = xtalloc_strdup (yyextra, yytext); - return TOKEN; +{PUNCTUATION} { + return yytext[0]; } -{HSPACE}+ - -\n { - BEGIN INITIAL; +\n { return NEWLINE; } -{IDENTIFIER} { - int parameter_index; +{OTHER} { yylval.str = xtalloc_strdup (yyextra, yytext); - switch (glcpp_parser_classify_token (yyextra, yylval.str, - &parameter_index)) - { - case TOKEN_CLASS_IDENTIFIER: - return IDENTIFIER; - break; - case TOKEN_CLASS_IDENTIFIER_FINALIZED: - return IDENTIFIER_FINALIZED; - break; - case TOKEN_CLASS_FUNC_MACRO: - return FUNC_MACRO; - break; - case TOKEN_CLASS_OBJ_MACRO: - return OBJ_MACRO; - break; - - } -} - -[(),] { - return yytext[0]; -} - -{TOKEN} { - yylval.token.type = TOKEN; - yylval.token.value = xtalloc_strdup (yyextra, yytext); - return TOKEN; -} - -\n { - yyextra->need_newline = 1; + return OTHER; } {HSPACE}+ diff --git a/glcpp-parse.y b/glcpp-parse.y index 2c0fe9a6af9..ebb28ed1965 100644 --- a/glcpp-parse.y +++ b/glcpp-parse.y @@ -119,366 +119,97 @@ glcpp_parser_lex (glcpp_parser_t *parser); %parse-param {glcpp_parser_t *parser} %lex-param {glcpp_parser_t *parser} -%token DEFINE DEFINED ELIF ELSE ENDIF FUNC_MACRO IDENTIFIER IDENTIFIER_FINALIZED IF IFDEF IFNDEF INTEGER OBJ_MACRO NEWLINE SPACE TOKEN UNDEF -%type punctuator -%type expression INTEGER -%type content 
FUNC_MACRO IDENTIFIER IDENTIFIER_FINALIZED OBJ_MACRO -%type argument_list -%type macro parameter_list -%type TOKEN argument_word argument_word_or_comma -%type argument argument_or_comma replacement_list pp_tokens -%left OR -%left AND -%left '|' -%left '^' -%left '&' -%left EQUAL NOT_EQUAL -%left '<' '>' LESS_OR_EQUAL GREATER_OR_EQUAL -%left LEFT_SHIFT RIGHT_SHIFT -%left '+' '-' -%left '*' '/' '%' -%right UNARY - -/* Hard to remove shift/reduce conflicts documented as follows: - * - * 1. '(' after FUNC_MACRO name which is correctly resolved to shift - * to form macro invocation rather than reducing directly to - * content. - * - * 2. Similarly, '(' after FUNC_MACRO which is correctly resolved to - * shift to form macro invocation rather than reducing directly to - * argument. - * - * 3. Similarly again now that we added argument_or_comma as well. - */ -%expect 3 +%token HASH_DEFINE_FUNC HASH_DEFINE_OBJ HASH IDENTIFIER NEWLINE OTHER HASH_UNDEF +%token LEFT_SHIFT RIGHT_SHIFT LESS_OR_EQUAL GREATER_OR_EQUAL EQUAL NOT_EQUAL AND OR PASTE + + /* Stale stuff just to allow code to compile. */ +%token IDENTIFIER_FINALIZED FUNC_MACRO OBJ_MACRO %% - /* We do all printing at the input level. */ input: - /* empty */ { - parser->just_printed_separator = 1; - } -| input content { - int is_token; - int skipping = 0; - - if (parser->skip_stack && parser->skip_stack->type != SKIP_NO_SKIP) - skipping = 1; - - if ($2 && strlen ($2) && ! skipping) { - int c = $2[0]; - int is_not_separator = ((c >= 'a' && c <= 'z') || - (c >= 'A' && c <= 'Z') || - (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || - (c == '_')); - - if (! 
parser->just_printed_separator && is_not_separator) - { - printf (" "); - } - printf ("%s", $2); - - if (is_not_separator) - parser->just_printed_separator = 0; - else - parser->just_printed_separator = 1; - } - - if ($2) - talloc_free ($2); - - if (parser->need_newline) { - printf ("\n"); - parser->just_printed_separator = 1; - parser->need_newline = 0; - } - } -; - -content: - IDENTIFIER { - $$ = $1; - } -| IDENTIFIER_FINALIZED { - $$ = $1; - } -| TOKEN { - $$ = $1.value; - } -| FUNC_MACRO { - $$ = $1; - } -| directive { - $$ = talloc_strdup (parser, "\n"); - } -| punctuator { - $$ = talloc_asprintf (parser, "%c", $1); - } -| macro { - $$ = NULL; - } + /* empty */ +| input line ; -punctuator: - '(' { $$ = '('; } -| ')' { $$ = ')'; } -| ',' { $$ = ','; } - ; - -macro: - FUNC_MACRO '(' argument_list ')' { - _expand_function_macro (parser, $1, $3); - } -| OBJ_MACRO { - _expand_object_macro (parser, $1); - talloc_free ($1); - } +line: + control_line +| text_line +| HASH non_directive ; -argument_list: - /* empty */ { - $$ = _argument_list_create (parser); - } -| argument { - $$ = _argument_list_create (parser); - _argument_list_append ($$, $1); - } -| argument_list ',' argument { - _argument_list_append ($1, $3); - $$ = $1; - } -; - -argument: - argument_word { - $$ = _token_list_create (parser); - _token_list_append ($$, $1.type, $1.value); - } -| argument argument_word { - _token_list_append ($1, $2.type, $2.value); - talloc_free ($2.value); - $$ = $1; - } -| argument '(' argument_or_comma ')' { - _token_list_append ($1, '(', "("); - _token_list_append_list ($1, $3); - _token_list_append ($1, ')', ")"); - $$ = $1; - } +control_line: + HASH_DEFINE_OBJ IDENTIFIER replacement_list NEWLINE +| HASH_DEFINE_FUNC IDENTIFIER '(' ')' replacement_list NEWLINE +| HASH_DEFINE_FUNC IDENTIFIER '(' identifier_list ')' replacement_list NEWLINE +| HASH_UNDEF IDENTIFIER NEWLINE +| HASH NEWLINE ; -argument_word: - IDENTIFIER { $$.type = IDENTIFIER; $$.value = $1; } -| 
IDENTIFIER_FINALIZED { $$.type = IDENTIFIER_FINALIZED; $$.value = $1; } -| TOKEN { $$ = $1; } -| FUNC_MACRO { $$.type = FUNC_MACRO; $$.value = $1; } -| macro { $$.type = TOKEN; $$.value = xtalloc_strdup (parser, ""); } +identifier_list: + IDENTIFIER +| identifier_list ',' IDENTIFIER ; - /* XXX: The body of argument_or_comma is the same as the body - * of argument, but with "argument" and "argument_word" - * changed to "argument_or_comma" and - * "argument_word_or_comma". It would be nice to have less - * redundancy here, but I'm not sure how. - * - * It would also be nice to have a less ugly grammar to have - * to implement, but such is the C preprocessor. - */ -argument_or_comma: - argument_word_or_comma { - $$ = _token_list_create (parser); - _token_list_append ($$, $1.type, $1.value); - } -| argument_or_comma argument_word_or_comma { - _token_list_append ($1, $2.type, $2.value); - $$ = $1; - } -| argument_or_comma '(' argument_or_comma ')' { - _token_list_append ($1, '(', "("); - _token_list_append_list ($1, $3); - _token_list_append ($1, ')', ")"); - $$ = $1; - } +text_line: + NEWLINE +| pp_tokens NEWLINE ; -argument_word_or_comma: - IDENTIFIER { $$.type = IDENTIFIER; $$.value = $1; } -| IDENTIFIER_FINALIZED { $$.type = IDENTIFIER_FINALIZED; $$.value = $1; } -| TOKEN { $$ = $1; } -| FUNC_MACRO { $$.type = FUNC_MACRO; $$.value = $1; } -| macro { $$.type = TOKEN; $$.value = xtalloc_strdup (parser, ""); } -| ',' { $$.type = ','; $$.value = xtalloc_strdup (parser, ","); } +non_directive: + pp_tokens NEWLINE ; -directive: - DEFINE IDENTIFIER NEWLINE { - token_list_t *list = _token_list_create (parser); - _define_object_macro (parser, $2, list); - } -| DEFINE IDENTIFIER SPACE replacement_list NEWLINE { - _define_object_macro (parser, $2, $4); - } -| DEFINE IDENTIFIER '(' parameter_list ')' replacement_list NEWLINE { - _define_function_macro (parser, $2, $4, $6); - } -| IF expression NEWLINE { - _glcpp_parser_skip_stack_push_if (parser, $2); - } -| IFDEF IDENTIFIER 
NEWLINE { - string_list_t *macro = hash_table_find (parser->defines, $2); - talloc_free ($2); - _glcpp_parser_skip_stack_push_if (parser, macro != NULL); - } -| IFNDEF IDENTIFIER NEWLINE { - string_list_t *macro = hash_table_find (parser->defines, $2); - talloc_free ($2); - _glcpp_parser_skip_stack_push_if (parser, macro == NULL); - } -| ELIF expression NEWLINE { - _glcpp_parser_skip_stack_change_if (parser, "#elif", $2); - } -| ELSE { - _glcpp_parser_skip_stack_change_if (parser, "else", 1); - } -| ENDIF { - _glcpp_parser_skip_stack_pop (parser); - } -| UNDEF IDENTIFIER { - string_list_t *macro = hash_table_find (parser->defines, $2); - if (macro) { - /* XXX: Need hash table to support a real way - * to remove an element rather than prefixing - * a new node with data of NULL like this. */ - hash_table_insert (parser->defines, NULL, $2); - talloc_free (macro); - } - talloc_free ($2); - } +replacement_list: + /* empty */ +| pp_tokens ; -expression: - INTEGER { - $$ = $1; - } -| expression OR expression { - $$ = $1 || $3; - } -| expression AND expression { - $$ = $1 && $3; - } -| expression '|' expression { - $$ = $1 | $3; - } -| expression '^' expression { - $$ = $1 ^ $3; - } -| expression '&' expression { - $$ = $1 & $3; - } -| expression NOT_EQUAL expression { - $$ = $1 != $3; - } -| expression EQUAL expression { - $$ = $1 == $3; - } -| expression GREATER_OR_EQUAL expression { - $$ = $1 >= $3; - } -| expression LESS_OR_EQUAL expression { - $$ = $1 <= $3; - } -| expression '>' expression { - $$ = $1 > $3; - } -| expression '<' expression { - $$ = $1 < $3; - } -| expression RIGHT_SHIFT expression { - $$ = $1 >> $3; - } -| expression LEFT_SHIFT expression { - $$ = $1 << $3; - } -| expression '-' expression { - $$ = $1 - $3; - } -| expression '+' expression { - $$ = $1 + $3; - } -| expression '%' expression { - $$ = $1 % $3; - } -| expression '/' expression { - $$ = $1 / $3; - } -| expression '*' expression { - $$ = $1 * $3; - } -| '!' 
expression %prec UNARY { - $$ = ! $2; - } -| '~' expression %prec UNARY { - $$ = ~ $2; - } -| '-' expression %prec UNARY { - $$ = - $2; - } -| '+' expression %prec UNARY { - $$ = + $2; - } -| DEFINED IDENTIFIER %prec UNARY { - string_list_t *macro = hash_table_find (parser->defines, $2); - talloc_free ($2); - if (macro) - $$ = 1; - else - $$ = 0; - } -| '(' expression ')' { - $$ = $2; - } +pp_tokens: + preprocessing_token +| pp_tokens preprocessing_token ; -parameter_list: - /* empty */ { - $$ = _string_list_create (parser); - } -| IDENTIFIER { - $$ = _string_list_create (parser); - _string_list_append_item ($$, $1); - talloc_free ($1); - } -| parameter_list ',' IDENTIFIER { - _string_list_append_item ($1, $3); - talloc_free ($3); - $$ = $1; - } +preprocessing_token: + IDENTIFIER +| punctuator +| OTHER ; -replacement_list: - /* empty */ { - $$ = _token_list_create (parser); - } -| pp_tokens { - $$ = $1; - } +punctuator: + '[' +| ']' +| '(' +| ')' +| '{' +| '}' +| '.' +| '&' +| '*' +| '+' +| '-' +| '~' +| '!' +| '/' +| '%' +| LEFT_SHIFT +| RIGHT_SHIFT +| '<' +| '>' +| LESS_OR_EQUAL +| GREATER_OR_EQUAL +| EQUAL +| NOT_EQUAL +| '^' +| '|' +| AND +| OR +| ';' +| ',' +| PASTE ; -pp_tokens: - TOKEN { - $$ = _token_list_create (parser); - _token_list_append ($$, $1.type, $1.value); - } -| pp_tokens TOKEN { - _token_list_append ($1, $2.type, $2.value); - $$ = $1; - } -; - %% string_list_t * diff --git a/tests/glcpp-test b/tests/glcpp-test index 022a2367121..868b03cce83 100755 --- a/tests/glcpp-test +++ b/tests/glcpp-test @@ -1,9 +1,12 @@ #!/bin/sh +set -e + +echo "Caution: These results are just verifying parse-ability, not correctness!" for test in *.c; do echo "Testing $test" ../glcpp < $test > $test.out gcc -E $test -o $test.gcc grep -v '^#' < $test.gcc > $test.expected - diff -B -u $test.expected $test.out +# diff -B -u $test.expected $test.out done -- 2.30.2