From 9fb8b7a495c9dc6f9a62cf82300fae5925af92fc Mon Sep 17 00:00:00 2001
From: Carl Worth <cworth@cworth.org>
Date: Tue, 25 May 2010 15:04:32 -0700
Subject: [PATCH] Make the lexer pass whitespace through (as OTHER tokens) for
 text lines.

With this change, we can recreate the original text-line input
exactly. Previously we were inserting a space between every pair of
tokens so our output had a lot more whitespace than our input.

With this change, we can drop the "-w" option to diff and match the
input exactly.
---
 glcpp-lex.l      | 72 ++++++++++++++++++++++++++++++++++++++----------
 glcpp-parse.y    |  2 --
 tests/glcpp-test |  2 +-
 3 files changed, 59 insertions(+), 17 deletions(-)
diff --git a/glcpp-lex.l b/glcpp-lex.l
index f1dd11ea9bd..7b5cdd57a0f 100644
--- a/glcpp-lex.l
+++ b/glcpp-lex.l
@@ -32,6 +32,21 @@
 %option reentrant noyywrap
 %option extra-type="glcpp_parser_t *"
 
+	/* This lexer has two states:
+	 *
+	 * The CONTROL state is for control lines (directives).
+	 * It lexes exactly as specified in the C99 specification.
+	 *
+	 * The INITIAL state is for input lines. In this state, we
+	 * make the OTHER token much broader in that it now
+	 * includes tokens consisting entirely of whitespace. This
+	 * allows us to pass text through verbatim. It avoids the
+	 * "inadvertent token pasting" problem that would occur if we
+	 * just printed tokens, while also avoiding excess whitespace
+	 * insertion in the output. */
+
+%x CONTROL
+
 SPACE		[[:space:]]
 NONSPACE	[^[:space:]]
 NEWLINE		[\n]
@@ -48,75 +63,104 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 %%
 
 {HASH}define{HSPACE}+/{IDENTIFIER}"(" {
+	BEGIN CONTROL;
 	return HASH_DEFINE_FUNC;
 }
 
 {HASH}define {
+	BEGIN CONTROL;
 	return HASH_DEFINE_OBJ;
 }
 
 {HASH}undef {
+	BEGIN CONTROL;
 	return HASH_UNDEF;
 }
 
 {HASH} {
+	BEGIN CONTROL;
 	return HASH;
 }
 
-{IDENTIFIER} {
+<CONTROL>{IDENTIFIER} {
 	yylval.str = xtalloc_strdup (yyextra, yytext);
 	return IDENTIFIER;
 }
 
-"<<"  {
+<CONTROL>"<<"  {
 	return LEFT_SHIFT;
 }
 
-">>" {
+<CONTROL>">>" {
 	return RIGHT_SHIFT;
 }
 
-"<=" {
+<CONTROL>"<=" {
 	return LESS_OR_EQUAL;
 }
 
-">=" {
+<CONTROL>">=" {
 	return GREATER_OR_EQUAL;
 }
 
-"==" {
+<CONTROL>"==" {
 	return EQUAL;
 }
 
-"!=" {
+<CONTROL>"!=" {
 	return NOT_EQUAL;
 }
 
-"&&" {
+<CONTROL>"&&" {
 	return AND;
 }
 
-"||" {
+<CONTROL>"||" {
 	return OR;
 }
 
-"##" {
+<CONTROL>"##" {
 	return PASTE;
 }
 
-{PUNCTUATION} {
+<CONTROL>{PUNCTUATION} {
 	return yytext[0];
 }
 
-\n {
+<CONTROL>{OTHER} {
+	yylval.str = xtalloc_strdup (yyextra, yytext);
+	return OTHER;
+}
+
+<CONTROL>{HSPACE}+
+
+<CONTROL>\n {
+	BEGIN INITIAL;
 	return NEWLINE;
 }
 
-{OTHER} {
+{IDENTIFIER} {
+	yylval.str = xtalloc_strdup (yyextra, yytext);
+	return IDENTIFIER;
+}
+
+{OTHER}+ {
+	yylval.str = xtalloc_strdup (yyextra, yytext);
+	return OTHER;
+}
+
+{HSPACE}+ {
 	yylval.str = xtalloc_strdup (yyextra, yytext);
 	return OTHER;
 }
 
-{HSPACE}+
+\n {
+	return NEWLINE;
+}
+
+. {
+	yylval.str = xtalloc_strdup (yyextra, yytext);
+	return OTHER;
+}
 
 %%
diff --git a/glcpp-parse.y b/glcpp-parse.y
index 991b8a0b856..957421b864e 100644
--- a/glcpp-parse.y
+++ b/glcpp-parse.y
@@ -517,8 +517,6 @@ _token_list_print (token_list_t *list)
 
 	for (node = list->head; node; node = node->next) {
 		_token_print (node->token);
-		if (node->next)
-			printf (" ");
 	}
 }
 
diff --git a/tests/glcpp-test b/tests/glcpp-test
index 34cca883301..8074e471197 100755
--- a/tests/glcpp-test
+++ b/tests/glcpp-test
@@ -9,5 +9,5 @@ for test in *.c; do
     gcc -E $test -o $test.gcc
 #    grep -v '^#' < $test.gcc > $test.expected
     grep -v '^[ 	]*#' < $test > $test.expected
-    diff -w -u $test.expected $test.out
+    diff -u $test.expected $test.out
 done
-- 
2.30.2