From 7d112d6670a0e0e662f8a7e64c33686e475832c8 Mon Sep 17 00:00:00 2001 From: Lewis Hyatt Date: Thu, 19 Sep 2019 19:56:11 +0000 Subject: [PATCH] Support extended characters in C/C++ identifiers (PR c/67224) libcpp/ChangeLog 2019-09-19 Lewis Hyatt PR c/67224 * charset.c (_cpp_valid_utf8): New function to help lex UTF-8 tokens. * internal.h (_cpp_valid_utf8): Declare. * lex.c (forms_identifier_p): Use it to recognize UTF-8 identifiers. (_cpp_lex_direct): Handle UTF-8 in identifiers and CPP_OTHER tokens. Do all work in "default" case to avoid slowing down typical code paths. Also handle $ and UCN in the default case for consistency. gcc/Changelog 2019-09-19 Lewis Hyatt PR c/67224 * doc/cpp.texi: Document support for extended characters in identifiers. * doc/cppopts.texi: Likewise. gcc/testsuite/ChangeLog 2019-09-19 Lewis Hyatt PR c/67224 * c-c++-common/cpp/ucnid-2011-1-utf8.c: New test. * g++.dg/cpp/ucnid-1-utf8.C: New test. * g++.dg/cpp/ucnid-2-utf8.C: New test. * g++.dg/cpp/ucnid-3-utf8.C: New test. * g++.dg/cpp/ucnid-4-utf8.C: New test. * g++.dg/other/ucnid-1-utf8.C: New test. * gcc.dg/cpp/ucnid-1-utf8.c: New test. * gcc.dg/cpp/ucnid-10-utf8.c: New test. * gcc.dg/cpp/ucnid-11-utf8.c: New test. * gcc.dg/cpp/ucnid-12-utf8.c: New test. * gcc.dg/cpp/ucnid-13-utf8.c: New test. * gcc.dg/cpp/ucnid-14-utf8.c: New test. * gcc.dg/cpp/ucnid-15-utf8.c: New test. * gcc.dg/cpp/ucnid-2-utf8.c: New test. * gcc.dg/cpp/ucnid-3-utf8.c: New test. * gcc.dg/cpp/ucnid-4-utf8.c: New test. * gcc.dg/cpp/ucnid-6-utf8.c: New test. * gcc.dg/cpp/ucnid-7-utf8.c: New test. * gcc.dg/cpp/ucnid-9-utf8.c: New test. * gcc.dg/ucnid-1-utf8.c: New test. * gcc.dg/ucnid-10-utf8.c: New test. * gcc.dg/ucnid-11-utf8.c: New test. * gcc.dg/ucnid-12-utf8.c: New test. * gcc.dg/ucnid-13-utf8.c: New test. * gcc.dg/ucnid-14-utf8.c: New test. * gcc.dg/ucnid-15-utf8.c: New test. * gcc.dg/ucnid-16-utf8.c: New test. * gcc.dg/ucnid-2-utf8.c: New test. * gcc.dg/ucnid-3-utf8.c: New test. * gcc.dg/ucnid-4-utf8.c: New test. 
* gcc.dg/ucnid-5-utf8.c: New test. * gcc.dg/ucnid-6-utf8.c: New test. * gcc.dg/ucnid-7-utf8.c: New test. * gcc.dg/ucnid-8-utf8.c: New test. * gcc.dg/ucnid-9-utf8.c: New test. From-SVN: r275979 --- gcc/ChangeLog | 7 ++ gcc/doc/cpp.texi | 32 +++---- gcc/doc/cppopts.texi | 5 +- gcc/testsuite/ChangeLog | 39 +++++++++ .../c-c++-common/cpp/ucnid-2011-1-utf8.c | 15 ++++ gcc/testsuite/g++.dg/cpp/ucnid-1-utf8.C | 17 ++++ gcc/testsuite/g++.dg/cpp/ucnid-2-utf8.C | 24 ++++++ gcc/testsuite/g++.dg/cpp/ucnid-3-utf8.C | 23 +++++ gcc/testsuite/g++.dg/cpp/ucnid-4-utf8.C | 17 ++++ gcc/testsuite/g++.dg/other/ucnid-1-utf8.C | 28 +++++++ gcc/testsuite/gcc.dg/cpp/ucnid-1-utf8.c | 26 ++++++ gcc/testsuite/gcc.dg/cpp/ucnid-10-utf8.c | 8 ++ gcc/testsuite/gcc.dg/cpp/ucnid-11-utf8.c | 30 +++++++ gcc/testsuite/gcc.dg/cpp/ucnid-12-utf8.c | 13 +++ gcc/testsuite/gcc.dg/cpp/ucnid-13-utf8.c | 5 ++ gcc/testsuite/gcc.dg/cpp/ucnid-14-utf8.c | 6 ++ gcc/testsuite/gcc.dg/cpp/ucnid-15-utf8.c | 6 ++ gcc/testsuite/gcc.dg/cpp/ucnid-2-utf8.c | 16 ++++ gcc/testsuite/gcc.dg/cpp/ucnid-3-utf8.c | 7 ++ gcc/testsuite/gcc.dg/cpp/ucnid-4-utf8.c | 17 ++++ gcc/testsuite/gcc.dg/cpp/ucnid-6-utf8.c | 5 ++ gcc/testsuite/gcc.dg/cpp/ucnid-7-utf8.c | 21 +++++ gcc/testsuite/gcc.dg/cpp/ucnid-9-utf8.c | 8 ++ gcc/testsuite/gcc.dg/ucnid-1-utf8.c | 25 ++++++ gcc/testsuite/gcc.dg/ucnid-10-utf8.c | 11 +++ gcc/testsuite/gcc.dg/ucnid-11-utf8.c | 7 ++ gcc/testsuite/gcc.dg/ucnid-12-utf8.c | 7 ++ gcc/testsuite/gcc.dg/ucnid-13-utf8.c | 15 ++++ gcc/testsuite/gcc.dg/ucnid-14-utf8.c | 23 +++++ gcc/testsuite/gcc.dg/ucnid-15-utf8.c | 38 +++++++++ gcc/testsuite/gcc.dg/ucnid-16-utf8.c | 6 ++ gcc/testsuite/gcc.dg/ucnid-2-utf8.c | 28 +++++++ gcc/testsuite/gcc.dg/ucnid-3-utf8.c | 28 +++++++ gcc/testsuite/gcc.dg/ucnid-4-utf8.c | 28 +++++++ gcc/testsuite/gcc.dg/ucnid-5-utf8.c | 19 +++++ gcc/testsuite/gcc.dg/ucnid-6-utf8.c | 28 +++++++ gcc/testsuite/gcc.dg/ucnid-7-utf8.c | 9 ++ gcc/testsuite/gcc.dg/ucnid-8-utf8.c | 16 ++++ 
gcc/testsuite/gcc.dg/ucnid-9-utf8.c | 25 ++++++ libcpp/ChangeLog | 10 +++ libcpp/charset.c | 83 ++++++++++++++++++- libcpp/internal.h | 8 ++ libcpp/lex.c | 55 ++++++++---- 43 files changed, 807 insertions(+), 37 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/cpp/ucnid-2011-1-utf8.c create mode 100644 gcc/testsuite/g++.dg/cpp/ucnid-1-utf8.C create mode 100644 gcc/testsuite/g++.dg/cpp/ucnid-2-utf8.C create mode 100644 gcc/testsuite/g++.dg/cpp/ucnid-3-utf8.C create mode 100644 gcc/testsuite/g++.dg/cpp/ucnid-4-utf8.C create mode 100644 gcc/testsuite/g++.dg/other/ucnid-1-utf8.C create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-1-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-10-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-11-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-12-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-13-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-14-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-15-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-2-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-3-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-4-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-6-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-7-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/ucnid-9-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-1-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-10-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-11-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-12-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-13-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-14-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-15-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-16-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-2-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-3-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-4-utf8.c create mode 100644 
gcc/testsuite/gcc.dg/ucnid-5-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-6-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-7-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-8-utf8.c create mode 100644 gcc/testsuite/gcc.dg/ucnid-9-utf8.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e7fededd9b9..7f16c16e6a1 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,10 @@ +2019-09-19 Lewis Hyatt + + PR c/67224 + * doc/cpp.texi: Document support for extended characters in + identifiers. + * doc/cppopts.texi: Likewise. + 2019-09-19 Richard Biener * tree-vect-loop.c (vect_is_slp_reduction): Remove. diff --git a/gcc/doc/cpp.texi b/gcc/doc/cpp.texi index e271f5180d8..f2de39a270c 100644 --- a/gcc/doc/cpp.texi +++ b/gcc/doc/cpp.texi @@ -274,11 +274,11 @@ the character in the source character set that they represent, then converted to the execution character set, just like unescaped characters. -In identifiers, characters outside the ASCII range can only be -specified with the @samp{\u} and @samp{\U} escapes, not used -directly. If strict ISO C90 conformance is specified with an option +In identifiers, characters outside the ASCII range can be specified +with the @samp{\u} and @samp{\U} escapes or used directly in the input +encoding. If strict ISO C90 conformance is specified with an option such as @option{-std=c90}, or @option{-fno-extended-identifiers} is -used, then those escapes are not permitted in identifiers. +used, then those constructs are not permitted in identifiers. @node Initial processing @section Initial processing @@ -503,8 +503,7 @@ In the 1999 C standard, identifiers may contain letters which are not part of the ``basic source character set'', at the implementation's discretion (such as accented Latin letters, Greek letters, or Chinese ideograms). This may be done with an extended character set, or the -@samp{\u} and @samp{\U} escape sequences. GCC only accepts such -characters in the @samp{\u} and @samp{\U} forms. 
+@samp{\u} and @samp{\U} escape sequences. As an extension, GCC treats @samp{$} as a letter. This is for compatibility with some systems, such as VMS, where @samp{$} is commonly @@ -584,15 +583,15 @@ Punctuator: @{ @} [ ] # ## @end smallexample @cindex other tokens -Any other single character is considered ``other''. It is passed on to -the preprocessor's output unmolested. The C compiler will almost -certainly reject source code containing ``other'' tokens. In ASCII, the -only other characters are @samp{@@}, @samp{$}, @samp{`}, and control +Any other single byte is considered ``other'' and passed on to the +preprocessor's output unchanged. The C compiler will almost certainly +reject source code containing ``other'' tokens. In ASCII, the only +``other'' characters are @samp{@@}, @samp{$}, @samp{`}, and control characters other than NUL (all bits zero). (Note that @samp{$} is -normally considered a letter.) All characters with the high bit set -(numeric range 0x7F--0xFF) are also ``other'' in the present -implementation. This will change when proper support for international -character sets is added to GCC@. +normally considered a letter.) All bytes with the high bit set +(numeric range 0x7F--0xFF) that were not successfully interpreted as +part of an extended character in the input encoding are also ``other'' +in the present implementation. NUL is a special case because of the high probability that its appearance is accidental, and because it may be invisible to the user @@ -4179,7 +4178,10 @@ be controlled using the @option{-fexec-charset} and The C and C++ standards allow identifiers to be composed of @samp{_} and the alphanumeric characters. C++ also allows universal character names. C99 and later C standards permit both universal character -names and implementation-defined characters. +names and implementation-defined characters. 
In both C and C++ modes, +GCC accepts in identifiers exactly those extended characters that +correspond to universal character names permitted by the chosen +standard. GCC allows the @samp{$} character in identifiers as an extension for most targets. This is true regardless of the @option{std=} switch, diff --git a/gcc/doc/cppopts.texi b/gcc/doc/cppopts.texi index 61e22cd93ae..f4bc3f546f8 100644 --- a/gcc/doc/cppopts.texi +++ b/gcc/doc/cppopts.texi @@ -254,8 +254,9 @@ Accept @samp{$} in identifiers. @item -fextended-identifiers @opindex fextended-identifiers -Accept universal character names in identifiers. This option is -enabled by default for C99 (and later C standard versions) and C++. +Accept universal character names and extended characters in +identifiers. This option is enabled by default for C99 (and later C +standard versions) and C++. @item -fno-canonical-system-headers @opindex fno-canonical-system-headers diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 7efdac9c250..1f9b5ac567b 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,42 @@ +2019-09-19 Lewis Hyatt + + PR c/67224 + * c-c++-common/cpp/ucnid-2011-1-utf8.c: New test. + * g++.dg/cpp/ucnid-1-utf8.C: New test. + * g++.dg/cpp/ucnid-2-utf8.C: New test. + * g++.dg/cpp/ucnid-3-utf8.C: New test. + * g++.dg/cpp/ucnid-4-utf8.C: New test. + * g++.dg/other/ucnid-1-utf8.C: New test. + * gcc.dg/cpp/ucnid-1-utf8.c: New test. + * gcc.dg/cpp/ucnid-10-utf8.c: New test. + * gcc.dg/cpp/ucnid-11-utf8.c: New test. + * gcc.dg/cpp/ucnid-12-utf8.c: New test. + * gcc.dg/cpp/ucnid-13-utf8.c: New test. + * gcc.dg/cpp/ucnid-14-utf8.c: New test. + * gcc.dg/cpp/ucnid-15-utf8.c: New test. + * gcc.dg/cpp/ucnid-2-utf8.c: New test. + * gcc.dg/cpp/ucnid-3-utf8.c: New test. + * gcc.dg/cpp/ucnid-4-utf8.c: New test. + * gcc.dg/cpp/ucnid-6-utf8.c: New test. + * gcc.dg/cpp/ucnid-7-utf8.c: New test. + * gcc.dg/cpp/ucnid-9-utf8.c: New test. + * gcc.dg/ucnid-1-utf8.c: New test. 
+ * gcc.dg/ucnid-10-utf8.c: New test. + * gcc.dg/ucnid-11-utf8.c: New test. + * gcc.dg/ucnid-12-utf8.c: New test. + * gcc.dg/ucnid-13-utf8.c: New test. + * gcc.dg/ucnid-14-utf8.c: New test. + * gcc.dg/ucnid-15-utf8.c: New test. + * gcc.dg/ucnid-16-utf8.c: New test. + * gcc.dg/ucnid-2-utf8.c: New test. + * gcc.dg/ucnid-3-utf8.c: New test. + * gcc.dg/ucnid-4-utf8.c: New test. + * gcc.dg/ucnid-5-utf8.c: New test. + * gcc.dg/ucnid-6-utf8.c: New test. + * gcc.dg/ucnid-7-utf8.c: New test. + * gcc.dg/ucnid-8-utf8.c: New test. + * gcc.dg/ucnid-9-utf8.c: New test. + 2019-09-19 Iain Sandoe * gcc.dg/pr89313.c: Test for __POWERPC__ in addition to diff --git a/gcc/testsuite/c-c++-common/cpp/ucnid-2011-1-utf8.c b/gcc/testsuite/c-c++-common/cpp/ucnid-2011-1-utf8.c new file mode 100644 index 00000000000..02c5fc08f84 --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/ucnid-2011-1-utf8.c @@ -0,0 +1,15 @@ +/* { dg-do preprocess } */ +/* { dg-options "-std=c11 -pedantic" { target c } } */ +/* { dg-options "-std=c++11 -pedantic" { target c++ } } */ + +¨ + +B̀ + +̀ /* { dg-error "not valid at the start of an identifier" } */ + +À /* { dg-warning "not in NFC" } */ + +𐀀 +🿽 +󡈴 diff --git a/gcc/testsuite/g++.dg/cpp/ucnid-1-utf8.C b/gcc/testsuite/g++.dg/cpp/ucnid-1-utf8.C new file mode 100644 index 00000000000..839b1881bf4 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp/ucnid-1-utf8.C @@ -0,0 +1,17 @@ +/* { dg-do preprocess } */ +/* { dg-options "-std=gnu++98 -pedantic" } */ + +ª /* { dg-error "not valid in an identifier" } */ +« /* { dg-error "not valid in an identifier" } */ +¶ /* { dg-error "not valid in an identifier" } */ +º /* { dg-error "not valid in an identifier" } */ +À +Ö +΄ + +Ù© /* { dg-error "not valid in an identifier" } */ +AÙ© /* { dg-error "not valid in an identifier" } */ +0º /* { dg-error "not valid in an identifier" } */ +0Ù© /* { dg-error "not valid in an identifier" } */ +๙ +A๙ diff --git a/gcc/testsuite/g++.dg/cpp/ucnid-2-utf8.C b/gcc/testsuite/g++.dg/cpp/ucnid-2-utf8.C 
new file mode 100644 index 00000000000..0381452d898 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp/ucnid-2-utf8.C @@ -0,0 +1,24 @@ +/* Test stringization of identifiers with extended characters works. */ + +/* Note: The results expected in these tests are what GCC currently +outputs, but they are not technically standard-conforming. If GCC is +changed in the future to produce the standard-conforming output, then +this test will fail and should be adjusted to check for UCNs in the +output rather than UTF-8. See PR 91755 for more details. */ + +/* { dg-do run } */ + +#include +#include + +#define h(s) #s +#define str(s) h(s) + +int +main () +{ + if (strcmp (str (str (Á)), "\"Á\"")) + abort (); + if (strcmp (str (str (Á)), "\"Á\"")) + abort (); +} diff --git a/gcc/testsuite/g++.dg/cpp/ucnid-3-utf8.C b/gcc/testsuite/g++.dg/cpp/ucnid-3-utf8.C new file mode 100644 index 00000000000..5c3044a171d --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp/ucnid-3-utf8.C @@ -0,0 +1,23 @@ +/* Test pasting of identifiers with extended characters works. */ + +/* Note: The results expected in these tests are what GCC currently +outputs, but they are not technically standard-conforming. If GCC is +changed in the future to produce the standard-conforming output, then +this test will fail and should be adjusted to check for UCNs in the +output rather than UTF-8. See PR 91755 for more details. 
*/ + +/* { dg-do run } */ + +#include +#include + +#define c(s1, s2) s1 ## s2 +#define h(s) #s +#define str(s) h(s) + +int +main () +{ + if (strcmp (str (str (c (Á, Á))), "\"ÁÁ\"")) + abort (); +} diff --git a/gcc/testsuite/g++.dg/cpp/ucnid-4-utf8.C b/gcc/testsuite/g++.dg/cpp/ucnid-4-utf8.C new file mode 100644 index 00000000000..de252e87165 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp/ucnid-4-utf8.C @@ -0,0 +1,17 @@ +/* { dg-do preprocess } */ +/* { dg-options "-std=gnu++98"} */ + +ª +« /* { dg-error "not valid in an identifier" } */ +¶ /* { dg-error "not valid in an identifier" } */ +º +À +Ö +΄ + +Ù© /* OK in C++ */ +AÙ© +0º +0Ù© +๙ /* OK in C++ */ +A๙ diff --git a/gcc/testsuite/g++.dg/other/ucnid-1-utf8.C b/gcc/testsuite/g++.dg/other/ucnid-1-utf8.C new file mode 100644 index 00000000000..dab41523595 --- /dev/null +++ b/gcc/testsuite/g++.dg/other/ucnid-1-utf8.C @@ -0,0 +1,28 @@ +/* { dg-do run } */ +/* { dg-options "" } */ +/* { dg-xfail-if "" { powerpc-ibm-aix* } } */ +/* { dg-skip-if "" { ! 
ucn } } */ +#include + +int À(void) { return 1; } +int Á(void) { return 2; } +int Â(void) { return 3; } +int whÿ(void) { return 4; } +int aÄbсδe(void) { return 5; } + +int main (void) +{ + + if (À() != 1) + abort (); + if (Á() != 2) + abort (); + if (Â() != 3) + abort (); + if (whÿ() != 4) + abort (); + if (aÄbсδe() != 5) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-1-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-1-utf8.c new file mode 100644 index 00000000000..9100b980960 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-1-utf8.c @@ -0,0 +1,26 @@ +/* { dg-do run } */ +/* { dg-options "-std=c99 -g3" } */ +void abort (void); + +#define À 1 +#define Á 2 +#define  3 +#define whÿ 4 +#define aÄbсδe 5 + +int main (void) +{ + + if (À != 1) + abort (); + if (Á != 2) + abort (); + if ( != 3) + abort (); + if (whÿ != 4) + abort (); + if (aÄbсδe != 5) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-10-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-10-utf8.c new file mode 100644 index 00000000000..7eeb026ba7c --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-10-utf8.c @@ -0,0 +1,8 @@ +/* Test UTF-8 is allowed in preprocessing numbers. */ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#define a(x) b(x) +#define b(x) 0 +#define p ) +int c = a(0À.p); diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-11-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-11-utf8.c new file mode 100644 index 00000000000..56b88f8be61 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-11-utf8.c @@ -0,0 +1,30 @@ +/* Test spelling differences in UCNs are properly diagnosed for macro + redefinitions. */ +/* { dg-do preprocess } */ +/* { dg-options "-std=c99 -pedantic-errors" } */ + +/* Different spelling of UCN in expansion. */ +#define m1 \u00c1 /* { dg-message "-:previous definition" } */ +#define m1 Á /* { dg-error "-:redefined" } */ + +#define m1ok Á +#define m1ok Á + +/* Different spelling of UCN in argument name. 
*/ +#define m2(\u00c1) /* { dg-message "-:previous definition" } */ +#define m2(Á) /* { dg-error "-:redefined" } */ + +#define m2ok(Á) +#define m2ok(Á) + +/* Same spelling in argument name but different spelling when used in + expansion. */ +#define m3(\u00c1) \u00c1 /* { dg-message "-:previous definition" } */ +#define m3(\u00c1) Á /* { dg-error "-:redefined" } */ + +#define m3ok(\u00c1) Á +#define m3ok(\u00c1) Á + +/* Different spelling of the macro name itself is OK. */ +#define m4ok\u00c1 +#define m4okÁ diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-12-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-12-utf8.c new file mode 100644 index 00000000000..9b54249c6a3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-12-utf8.c @@ -0,0 +1,13 @@ +/* Test spelling differences in UCNs in macro definitions still count + as the same identifier for macro expansion. */ +/* { dg-do compile } */ +/* { dg-options "-std=c99 -pedantic-errors" } */ + +#define m1\u00c1 +#ifndef m1Á +#error not defined +#endif + +#define m2(\u00c1) Á + +int i = m2 (0); diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-13-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-13-utf8.c new file mode 100644 index 00000000000..aff39b635db --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-13-utf8.c @@ -0,0 +1,5 @@ +/* Verify macros named with UTF-8 are output in -dD output with UCNs. */ +/* { dg-do preprocess } */ +/* { dg-options "-std=c99 -dD" } */ +/* { dg-final { scan-file ucnid-13-utf8.i "\\\\U000000c1" } } */ +#define Á 1 diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-14-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-14-utf8.c new file mode 100644 index 00000000000..6ea14ebbaa2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-14-utf8.c @@ -0,0 +1,6 @@ +/* Verify macro definitions with UTF-8 are output in -dD output with + the original spelling. 
*/ +/* { dg-do preprocess } */ +/* { dg-options "-std=c99 -dD" } */ +/* { dg-final { scan-file ucnid-14-utf8.i "Á" } } */ +#define a Á diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-15-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-15-utf8.c new file mode 100644 index 00000000000..cf2289a80aa --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-15-utf8.c @@ -0,0 +1,6 @@ +/* Verify macro definitions with UTF-8 in argument names are output in + -dD output with the original spelling. */ +/* { dg-do preprocess } */ +/* { dg-options "-std=c99 -dD" } */ +/* { dg-final { scan-file ucnid-15-utf8.i "#define a\\(Á\\) x:Á:y:Á:z" } } */ +#define a(Á) x:Á:y:Á:z diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-2-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-2-utf8.c new file mode 100644 index 00000000000..e3730f8641a --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-2-utf8.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-std=c99" } */ +#include +#include + +#define str(t) #t + +int main (void) +{ + const char s[] = str (ゲ); + + if (strcmp (s, "ゲ") != 0) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-3-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-3-utf8.c new file mode 100644 index 00000000000..4c9ed25b590 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-3-utf8.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#define paste(x, y) x ## y + +int paste(ª, Ô±) = 3; + diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-4-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-4-utf8.c new file mode 100644 index 00000000000..ccc7a1e0296 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-4-utf8.c @@ -0,0 +1,17 @@ +/* { dg-do preprocess } */ +/* { dg-options "-std=c99" } */ + +ª +« /* not a preprocessing error because we lex it into its own token */ +¶ /* not a preprocessing error because we lex it into its own token */ +º +À +Ö +΄ + +Ù© /* { dg-error "not valid at the start of an identifier" } */ +AÙ© +0º +0Ù© +๙ /* { dg-error "not valid at the start of an identifier" } 
*/ +A๙ diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-6-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-6-utf8.c new file mode 100644 index 00000000000..b4dd0946142 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-6-utf8.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c89" } */ +#define a b( +#define b(x) q +int aª); diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-7-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-7-utf8.c new file mode 100644 index 00000000000..22aff7eaed5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-7-utf8.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +/* When GCC reads UTF-8-encoded input into its internal UTF-8 +representation, it does not apply any transformation to the data, and +in particular it makes no attempt to verify that the encoding is valid +UTF-8. Historically, if any non-ASCII characters were found outside a +string or comment, they were treated as stray tokens and did not +necessarily produce an error, e.g. if, as in this test, they disappear +in the preprocessor. Now that UTF-8 is also supported in identifiers, +the basic structure of this process has not changed; GCC just treats +invalid UTF-8 as a stray token. This test verifies that the historical +behavior is unchanged. In the future, if GCC were changed, say, to +validate the UTF-8 on input, then this test would no longer be +appropriate. */ + + +#define a b( +#define b(x) q +/* The line below contains invalid UTF-8. 
*/ +int aÏ); diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-9-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-9-utf8.c new file mode 100644 index 00000000000..1558eca8bd0 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/ucnid-9-utf8.c @@ -0,0 +1,8 @@ +/* { dg-do preprocess } */ +/* { dg-options "-std=c99 -pedantic" } */ + +Ⅰ +ↂ +〇 +〡 +〩 diff --git a/gcc/testsuite/gcc.dg/ucnid-1-utf8.c b/gcc/testsuite/gcc.dg/ucnid-1-utf8.c new file mode 100644 index 00000000000..72136737b62 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-1-utf8.c @@ -0,0 +1,25 @@ +/* { dg-do run } */ +/* { dg-options "-std=c99 -g" } */ +void abort (void); + +int main (void) +{ + int À = 1; + int Á = 2; + int  = 3; + int whÿ = 4; + int aÄbсδe = 5; + + if (À != 1) + abort (); + if (Á != 2) + abort (); + if ( != 3) + abort (); + if (whÿ != 4) + abort (); + if (aÄbсδe != 5) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/ucnid-10-utf8.c b/gcc/testsuite/gcc.dg/ucnid-10-utf8.c new file mode 100644 index 00000000000..86830b8b228 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-10-utf8.c @@ -0,0 +1,11 @@ +/* Verify diagnostics for extended identifiers refer to UCNs (in the C + locale). Test #pragma pack diagnostics. */ +/* { dg-do compile } */ +/* { dg-options "-std=gnu99" } */ +/* { dg-require-ascii-locale "" } */ +/* { dg-skip-if "" { powerpc-ibm-aix* } } */ + +#pragma pack(push) +#pragma pack(pop, ó) /* { dg-warning "pop, \\\\U000000f3.*push, \\\\U000000f3" } */ +#pragma pack(ç) /* { dg-warning "unknown action '\\\\U000000e7'" } */ + diff --git a/gcc/testsuite/gcc.dg/ucnid-11-utf8.c b/gcc/testsuite/gcc.dg/ucnid-11-utf8.c new file mode 100644 index 00000000000..c6a89bae774 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-11-utf8.c @@ -0,0 +1,7 @@ +/* { dg-do run } */ +/* { dg-xfail-if "" { powerpc-ibm-aix* } } */ +/* { dg-skip-if "" { ! ucn } } */ +/* { dg-skip-if "-fdata-sections not supported" { { hppa*-*-hpux* } && { ! 
lp64 } } } */ +/* { dg-options "-std=c99 -fdata-sections -g" } */ + +#include "ucnid-3-utf8.c" diff --git a/gcc/testsuite/gcc.dg/ucnid-12-utf8.c b/gcc/testsuite/gcc.dg/ucnid-12-utf8.c new file mode 100644 index 00000000000..cfdffba8f6c --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-12-utf8.c @@ -0,0 +1,7 @@ +/* { dg-do run } */ +/* { dg-xfail-if "" { powerpc-ibm-aix* } } */ +/* { dg-skip-if "" { ! ucn } } */ +/* { dg-skip-if "-ffunction-sections not supported" { { hppa*-*-hpux* } && { ! lp64 } } } */ +/* { dg-options "-std=c99 -ffunction-sections -g" } */ + +#include "ucnid-4-utf8.c" diff --git a/gcc/testsuite/gcc.dg/ucnid-13-utf8.c b/gcc/testsuite/gcc.dg/ucnid-13-utf8.c new file mode 100644 index 00000000000..41536c3a742 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-13-utf8.c @@ -0,0 +1,15 @@ +/* Verify diagnostics for extended identifiers refer to UCNs (in the C + locale). Miscellaneous diagnostics. */ +/* { dg-do compile } */ +/* { dg-options "-std=gnu99 -Wpacked" } */ +/* { dg-require-ascii-locale "" } */ +/* { dg-skip-if "" { powerpc-ibm-aix* } } */ + +int a __attribute__((À)); /* { dg-warning "'\\\\U000000c0' attribute directive ignored" } */ + +extern void Á (void) __attribute__((deprecated)); +void g (void) { Á (); } /* { dg-warning "'\\\\U000000c1' is deprecated" } */ + +struct  { char c; } __attribute__((packed)); /* { dg-warning "'\\\\U000000c2'" } */ + +void h (void) { asm ("%[Ã]" : : ); } /* { dg-error "undefined named operand '\\\\U000000c3'" } */ diff --git a/gcc/testsuite/gcc.dg/ucnid-14-utf8.c b/gcc/testsuite/gcc.dg/ucnid-14-utf8.c new file mode 100644 index 00000000000..e781ed6b4ed --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-14-utf8.c @@ -0,0 +1,23 @@ +/* Test miscellaneous uses of UTF-8 in identifiers compile and run OK, + with debug info enabled. 
*/ +/* { dg-do run } */ +/* { dg-options "-std=c99 -g" } */ + +extern void abort (void); +extern void exit (int); + +int +main (void) +{ + struct À { int Á; } x; + struct À *y = &x; + y->Á = 1; + if (x.Á != 1) + abort (); + goto ÿ; + ÿ: ; + enum e {  = 4 }; + if ( != 4) + abort (); + exit (0); +} diff --git a/gcc/testsuite/gcc.dg/ucnid-15-utf8.c b/gcc/testsuite/gcc.dg/ucnid-15-utf8.c new file mode 100644 index 00000000000..e2336891b0d --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-15-utf8.c @@ -0,0 +1,38 @@ +/* Test combinations of UTF-8 in various parts of identifiers. */ +/* { dg-do run } */ +/* { dg-xfail-if "" { "powerpc-ibm-aix*" } } */ +/* { dg-skip-if "" { ! ucn } } */ +/* { dg-options "-std=c99" } */ + +extern void abort (void); + +int π = 3; +int π² = 9; +int πp1 = 4; +int twoπ = 6; +int four_plus_π_ = 7; +int 😀ÀÁÂÃÄÅßàáâãäaÃ¥bæçèéêcëìígîïð7ñ9__òóô4õöÆ3ÇÈÉÊËabcÌÍÎÏÐÑÒÓÔÕÖ😄😅🤣😂_ÿ = 2; +int π\u03C0 = 9; + +int main() { + if (π != 3) + abort (); + + if (π² != 9) + abort (); + + if (πp1 != 4) + abort (); + + if (twoπ != 6) + abort (); + + if (four_plus_π_ != 7) + abort () ; + + if (😀ÀÁÂÃÄÅßàáâãäaÃ¥bæçèéêcëìígîïð7ñ9__òóô4õöÆ3ÇÈÉÊËabcÌÍÎÏÐÑÒÓÔÕÖ😄😅🤣😂_ÿ != 2) + abort (); + + if(ππ != π²) + abort (); +} diff --git a/gcc/testsuite/gcc.dg/ucnid-16-utf8.c b/gcc/testsuite/gcc.dg/ucnid-16-utf8.c new file mode 100644 index 00000000000..5d000a0758a --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-16-utf8.c @@ -0,0 +1,6 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99 -g -finput-charset=latin1" } */ +/* { dg-final { scan-file ucnid-16-utf8.s "²" } } */ + +/* This superscript is encoded in latin1; verify that we still get UTF-8 in the output. */ +int x² = 9; diff --git a/gcc/testsuite/gcc.dg/ucnid-2-utf8.c b/gcc/testsuite/gcc.dg/ucnid-2-utf8.c new file mode 100644 index 00000000000..70f9464638c --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-2-utf8.c @@ -0,0 +1,28 @@ +/* { dg-do run } */ +/* { dg-xfail-if "" { powerpc-ibm-aix* } } */ +/* { dg-skip-if "" { ! 
ucn } } */ +/* { dg-options "-std=c99 -g" } */ +void abort (void); + +static int À = 1; +static int Á = 2; +static int  = 3; +static int whÿ = 4; +static int aÄbсδe = 5; + +int main (void) +{ + + if (À != 1) + abort (); + if (Á != 2) + abort (); + if ( != 3) + abort (); + if (whÿ != 4) + abort (); + if (aÄbсδe != 5) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/ucnid-3-utf8.c b/gcc/testsuite/gcc.dg/ucnid-3-utf8.c new file mode 100644 index 00000000000..f8509a64323 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-3-utf8.c @@ -0,0 +1,28 @@ +/* { dg-do run } */ +/* { dg-xfail-if "" { powerpc-ibm-aix* } } */ +/* { dg-skip-if "" { ! ucn } } */ +/* { dg-options "-std=c99 -g" } */ +void abort (void); + +int À = 1; +int Á = 2; +int  = 3; +int whÿ = 4; +int aÄbсδe = 5; + +int main (void) +{ + + if (À != 1) + abort (); + if (Á != 2) + abort (); + if ( != 3) + abort (); + if (whÿ != 4) + abort (); + if (aÄbсδe != 5) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/ucnid-4-utf8.c b/gcc/testsuite/gcc.dg/ucnid-4-utf8.c new file mode 100644 index 00000000000..bf1c403b48a --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-4-utf8.c @@ -0,0 +1,28 @@ +/* { dg-do run } */ +/* { dg-xfail-if "" { powerpc-ibm-aix* } } */ +/* { dg-skip-if "" { ! 
ucn } } */ +/* { dg-options "-std=c99 -g" } */ +void abort (void); + +int À(void) { return 1; } +int Á(void) { return 2; } +int Â(void) { return 3; } +int whÿ(void) { return 4; } +int aÄbсδe(void) { return 5; } + +int main (void) +{ + + if (À() != 1) + abort (); + if (Á() != 2) + abort (); + if (Â() != 3) + abort (); + if (whÿ() != 4) + abort (); + if (aÄbсδe() != 5) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/ucnid-5-utf8.c b/gcc/testsuite/gcc.dg/ucnid-5-utf8.c new file mode 100644 index 00000000000..f4473e1df17 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-5-utf8.c @@ -0,0 +1,19 @@ +/* { dg-do run } */ +/* { dg-skip-if "No dollar in identfiers" { avr-*-* powerpc-ibm-aix* } } */ +/* { dg-options "-std=c99 -fdollars-in-identifiers -g" } */ +void abort (void); + +int a$b(void) { return 1; } +int a$b😀(void) { return 2; } + +int main (void) +{ + + if (a$b() != 1) + abort (); + + if (a$b😀() != 2) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/ucnid-6-utf8.c b/gcc/testsuite/gcc.dg/ucnid-6-utf8.c new file mode 100644 index 00000000000..36ce52bc571 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-6-utf8.c @@ -0,0 +1,28 @@ +/* { dg-do run } */ +/* { dg-xfail-if "" { "powerpc-ibm-aix*" } } */ +/* { dg-skip-if "" { ! ucn } } */ +/* { dg-options "-std=c99 -save-temps -g" } */ +void abort (void); + +int À(void) { return 1; } +int Á(void) { return 2; } +int Â(void) { return 3; } +int whÿ(void) { return 4; } +int aÄbсδe(void) { return 5; } + +int main (void) +{ + + if (À() != 1) + abort (); + if (Á() != 2) + abort (); + if (Â() != 3) + abort (); + if (whÿ() != 4) + abort (); + if (aÄbсδe() != 5) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/ucnid-7-utf8.c b/gcc/testsuite/gcc.dg/ucnid-7-utf8.c new file mode 100644 index 00000000000..07f5ca0f9d3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-7-utf8.c @@ -0,0 +1,9 @@ +/* Verify diagnostics for extended identifiers refer to UCNs (in the C + locale). 
*/ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ +/* { dg-require-ascii-locale "" } */ +/* { dg-skip-if "" { "powerpc-ibm-aix*" } } */ + +void *p = &é; /* { dg-error "'\\\\U000000e9' undeclared" } */ +void *q = &Ḁ; /* { dg-error "'\\\\U00001e00' undeclared" } */ diff --git a/gcc/testsuite/gcc.dg/ucnid-8-utf8.c b/gcc/testsuite/gcc.dg/ucnid-8-utf8.c new file mode 100644 index 00000000000..e6c440d9775 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-8-utf8.c @@ -0,0 +1,16 @@ +/* Verify diagnostics for extended identifiers refer to UCNs (in the C + locale). Further tests of C front-end diagnostics. */ +/* { dg-do compile } */ +/* { dg-options "-std=gnu99 -Wvla" } */ +/* { dg-require-ascii-locale "" } */ +/* { dg-skip-if "" { powerpc-ibm-aix* } } */ + +int a __attribute__((__mode__(é))); /* { dg-error "unknown machine mode '\\\\U000000e9'" } */ +struct s1 { int é : 0; }; /* { dg-error "zero width for bit-field '\\\\U000000e9'" } */ + +void f (int b) { int é[b]; } /* { dg-warning "variable length array '\\\\U000000e9'" } */ + +void g (static int é); /* { dg-error "storage class specified for parameter '\\\\U000000e9'" } */ + +struct s2 { int á; } é = { { 0 } }; /* { dg-warning "braces around scalar initializer" } */ +/* { dg-message "near initialization for '\\\\U000000e9\\.\\\\U000000e1'" "UCN diag" { target *-*-* } .-1 } */ diff --git a/gcc/testsuite/gcc.dg/ucnid-9-utf8.c b/gcc/testsuite/gcc.dg/ucnid-9-utf8.c new file mode 100644 index 00000000000..c9371966da5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ucnid-9-utf8.c @@ -0,0 +1,25 @@ +/* Test __func__ with extended identifiers and character set + conversions. */ +/* { dg-do run } */ +/* { dg-xfail-if "" { "powerpc-ibm-aix*" } } */ +/* { dg-skip-if "" { ! 
ucn } } */ +/* { dg-options "-std=c99 -fexec-charset=ISO-8859-1 -g" } */ +/* { dg-require-iconv "ISO-8859-1" } */ + +extern int strcmp (const char *, const char *); +extern void abort (void); +extern void exit (int); + +void +é (void) +{ + if (strcmp (__func__, "é") != 0) + abort (); +} + +int +main (void) +{ + é (); + exit (0); +} diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog index 1ec8541a54c..0c851952b55 100644 --- a/libcpp/ChangeLog +++ b/libcpp/ChangeLog @@ -1,3 +1,13 @@ +2019-09-19 Lewis Hyatt + + PR c/67224 + * charset.c (_cpp_valid_utf8): New function to help lex UTF-8 tokens. + * internal.h (_cpp_valid_utf8): Declare. + * lex.c (forms_identifier_p): Use it to recognize UTF-8 identifiers. + (_cpp_lex_direct): Handle UTF-8 in identifiers and CPP_OTHER tokens. + Do all work in "default" case to avoid slowing down typical code paths. + Also handle $ and UCN in the default case for consistency. + 2019-08-30 Nathan Sidwell New # semantics for popping to "" name. diff --git a/libcpp/charset.c b/libcpp/charset.c index 8a0e5cbb29b..10286219bd6 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -1198,6 +1198,84 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit, return from; } +/* Performs a similar task as _cpp_valid_ucn, but parses UTF-8-encoded + extended characters rather than UCNs. If the return value is TRUE, then a + character was successfully decoded and stored in *CP; *PSTR has been + updated to point one past the valid UTF-8 sequence. Diagnostics may have + been emitted if the character parsed is not allowed in the current context. + If the return value is FALSE, then *PSTR has not been modified and *CP may + equal 0, to indicate that *PSTR does not form a valid UTF-8 sequence, or it + may, when processing an identifier in C mode, equal a codepoint that was + validly encoded but is not allowed to appear in an identifier. 
In either + case, no diagnostic is emitted, and the return value of FALSE should cause + a new token to be formed. + + Unlike _cpp_valid_ucn, this will never be called when lexing a string; only + a potential identifier, or a CPP_OTHER token. NST is unused in the latter + case. + + As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for + the start of an identifier, or 2 otherwise. */ + +extern bool +_cpp_valid_utf8 (cpp_reader *pfile, + const uchar **pstr, + const uchar *limit, + int identifier_pos, + struct normalize_state *nst, + cppchar_t *cp) +{ + const uchar *base = *pstr; + size_t inbytesleft = limit - base; + if (one_utf8_to_cppchar (pstr, &inbytesleft, cp)) + { + /* No diagnostic here as this byte will rather become a + new token. */ + *cp = 0; + return false; + } + + if (identifier_pos) + { + switch (ucn_valid_in_identifier (pfile, *cp, nst)) + { + + case 0: + /* In C++, this is an error for invalid character in an identifier + because logically, the UTF-8 was converted to a UCN during + translation phase 1 (even though we don't physically do it that + way). In C, this byte rather becomes grammatically a separate + token. */ + + if (CPP_OPTION (pfile, cplusplus)) + cpp_error (pfile, CPP_DL_ERROR, + "extended character %.*s is not valid in an identifier", + (int) (*pstr - base), base); + else + { + *pstr = base; + return false; + } + + break; + + case 2: + if (identifier_pos == 1) + { + /* This is treated the same way in C++ or C99 -- lexed as an + identifier which is then invalid because an identifier is + not allowed to start with this character. */ + cpp_error (pfile, CPP_DL_ERROR, + "extended character %.*s is not valid at the start of an identifier", + (int) (*pstr - base), base); + } + break; + } + } + + return true; +} + /* Subroutine of convert_hex and convert_oct. N is the representation in the execution character set of a numeric escape; write it into the string buffer TBUF and update the end-of-string pointer therein. 
WIDE @@ -1956,8 +2034,9 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token, } /* Convert an identifier denoted by ID and LEN, which might contain - UCN escapes, to the source character set, either UTF-8 or - UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */ + UCN escapes or UTF-8 multibyte chars, to the source character set, + either UTF-8 or UTF-EBCDIC. Assumes that the identifier is actually + a valid identifier. */ cpp_hashnode * _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) { diff --git a/libcpp/internal.h b/libcpp/internal.h index f9bcd37c571..90263bde47d 100644 --- a/libcpp/internal.h +++ b/libcpp/internal.h @@ -791,6 +791,14 @@ extern bool _cpp_valid_ucn (cpp_reader *, const unsigned char **, cppchar_t *, source_range *char_range, cpp_string_location_reader *loc_reader); + +extern bool _cpp_valid_utf8 (cpp_reader *pfile, + const uchar **pstr, + const uchar *limit, + int identifier_pos, + struct normalize_state *nst, + cppchar_t *cp); + extern void _cpp_destroy_iconv (cpp_reader *); extern unsigned char *_cpp_convert_input (cpp_reader *, const char *, unsigned char *, size_t, size_t, diff --git a/libcpp/lex.c b/libcpp/lex.c index 52e5bceb3ff..0e8de3807b3 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -1313,7 +1313,9 @@ warn_about_normalization (cpp_reader *pfile, } } -/* Returns TRUE if the sequence starting at buffer->cur is invalid in +static const cppchar_t utf8_signifier = 0xC0; + +/* Returns TRUE if the sequence starting at buffer->cur is valid in an identifier. FIRST is TRUE if this starts an identifier. */ static bool forms_identifier_p (cpp_reader *pfile, int first, @@ -1336,17 +1338,25 @@ forms_identifier_p (cpp_reader *pfile, int first, return true; } - /* Is this a syntactically valid UCN? 
*/ - if (CPP_OPTION (pfile, extended_identifiers) - && *buffer->cur == '\\' - && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) + /* Is this a syntactically valid UCN or a valid UTF-8 char? */ + if (CPP_OPTION (pfile, extended_identifiers)) { cppchar_t s; - buffer->cur += 2; - if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, - state, &s, NULL, NULL)) - return true; - buffer->cur -= 2; + if (*buffer->cur >= utf8_signifier) + { + if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first, + state, &s)) + return true; + } + else if (*buffer->cur == '\\' + && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) + { + buffer->cur += 2; + if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, + state, &s, NULL, NULL)) + return true; + buffer->cur -= 2; + } } return false; @@ -1464,7 +1474,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn, pfile->buffer->cur = cur; if (starts_ucn || forms_identifier_p (pfile, false, nst)) { - /* Slower version for identifiers containing UCNs (or $). */ + /* Slower version for identifiers containing UCNs + or extended chars (including $). */ do { while (ISIDNUM (*pfile->buffer->cur)) { @@ -3123,12 +3134,12 @@ _cpp_lex_direct (cpp_reader *pfile) /* @ is a punctuator in Objective-C. */ case '@': result->type = CPP_ATSIGN; break; - case '$': - case '\\': + default: { const uchar *base = --buffer->cur; - struct normalize_state nst = INITIAL_NORMALIZE_STATE; + /* Check for an extended identifier ($ or UCN or UTF-8). */ + struct normalize_state nst = INITIAL_NORMALIZE_STATE; if (forms_identifier_p (pfile, true, &nst)) { result->type = CPP_NAME; @@ -3137,13 +3148,21 @@ _cpp_lex_direct (cpp_reader *pfile) warn_about_normalization (pfile, result, &nst); break; } + + /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a + single token. 
*/ buffer->cur++; + if (c >= utf8_signifier) + { + const uchar *pstr = base; + cppchar_t s; + if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s)) + buffer->cur = pstr; + } + create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER); + break; } - /* FALLTHRU */ - default: - create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER); - break; } /* Potentially convert the location of the token to a range. */ -- 2.30.2