From: Joseph Myers Date: Thu, 14 Nov 2019 20:18:33 +0000 (+0000) Subject: Support UTF-8 character constants for C2x. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=7c5890cc0a0ecea0e88cc39e9fba6385fb579e61;p=gcc.git Support UTF-8 character constants for C2x. C2x adds u8'' character constants to C. This patch adds the corresponding GCC support. Most of the support was already present for C++ and just needed enabling for C2x. However, in C2x these constants have type unsigned char, which required corresponding adjustments in the compiler and the preprocessor to give them that type for C. For C, it seems clear to me that having type unsigned char means the constants are unsigned in the preprocessor (and thus treated as having type uintmax_t in #if conditionals), so this patch implements that. I included a conditional in the libcpp change to avoid affecting signedness for C++, but I'm not sure whether these constants should in fact also be unsigned in the preprocessor for C++; if so, that !CPP_OPTION (pfile, cplusplus) conditional would not be needed. Bootstrapped with no regressions on x86_64-pc-linux-gnu. gcc/c: * c-parser.c (c_parser_postfix_expression) (c_parser_check_literal_zero): Handle CPP_UTF8CHAR. * gimple-parser.c (c_parser_gimple_postfix_expression): Likewise. gcc/c-family: * c-lex.c (lex_charconst): Make CPP_UTF8CHAR constants unsigned char for C. gcc/testsuite: * gcc.dg/c11-utf8char-1.c, gcc.dg/c2x-utf8char-1.c, gcc.dg/c2x-utf8char-2.c, gcc.dg/c2x-utf8char-3.c, gcc.dg/gnu2x-utf8char-1.c: New tests. libcpp: * charset.c (narrow_str_to_charconst): Make CPP_UTF8CHAR constants unsigned for C. * init.c (lang_defaults): Set utf8_char_literals for GNUC2X and STDC2X. 
From-SVN: r278265 --- diff --git a/gcc/c-family/ChangeLog b/gcc/c-family/ChangeLog index 336159d79f6..f4fdccc448a 100644 --- a/gcc/c-family/ChangeLog +++ b/gcc/c-family/ChangeLog @@ -1,3 +1,8 @@ +2019-11-14 Joseph Myers + + * c-lex.c (lex_charconst): Make CPP_UTF8CHAR constants unsigned + char for C. + 2019-11-14 Jakub Jelinek * c-omp.c (c_omp_check_context_selector): Add nvidia to the list of diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c index 42010a762a6..d446633f814 100644 --- a/gcc/c-family/c-lex.c +++ b/gcc/c-family/c-lex.c @@ -1376,7 +1376,9 @@ lex_charconst (const cpp_token *token) type = char16_type_node; else if (token->type == CPP_UTF8CHAR) { - if (flag_char8_t) + if (!c_dialect_cxx ()) + type = unsigned_char_type_node; + else if (flag_char8_t) type = char8_type_node; else type = char_type_node; diff --git a/gcc/c/ChangeLog b/gcc/c/ChangeLog index 04dce4b45ce..b881cab75de 100644 --- a/gcc/c/ChangeLog +++ b/gcc/c/ChangeLog @@ -1,3 +1,9 @@ +2019-11-14 Joseph Myers + + * c-parser.c (c_parser_postfix_expression) + (c_parser_check_literal_zero): Handle CPP_UTF8CHAR. + * gimple-parser.c (c_parser_gimple_postfix_expression): Likewise. + 2019-11-14 Richard Sandiford * c-typeck.c (build_conditional_expr): Use truth_type_for instead diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 5e30a7f1916..8ce4e70a0fc 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -8783,6 +8783,7 @@ c_parser_postfix_expression (c_parser *parser) case CPP_CHAR: case CPP_CHAR16: case CPP_CHAR32: + case CPP_UTF8CHAR: case CPP_WCHAR: expr.value = c_parser_peek_token (parser)->value; /* For the purpose of warning when a pointer is compared with @@ -10459,6 +10460,7 @@ c_parser_check_literal_zero (c_parser *parser, unsigned *literal_zero_mask, case CPP_WCHAR: case CPP_CHAR16: case CPP_CHAR32: + case CPP_UTF8CHAR: /* If a parameter is literal zero alone, remember it for -Wmemset-transposed-args warning. 
*/ if (integer_zerop (tok->value) diff --git a/gcc/c/gimple-parser.c b/gcc/c/gimple-parser.c index 6fdb83c1abe..c16d0dfb88e 100644 --- a/gcc/c/gimple-parser.c +++ b/gcc/c/gimple-parser.c @@ -1395,6 +1395,7 @@ c_parser_gimple_postfix_expression (gimple_parser &parser) case CPP_CHAR: case CPP_CHAR16: case CPP_CHAR32: + case CPP_UTF8CHAR: case CPP_WCHAR: expr.value = c_parser_peek_token (parser)->value; set_c_expr_source_range (&expr, tok_range); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 668444818f2..51624b7212e 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2019-11-14 Joseph Myers + + * gcc.dg/c11-utf8char-1.c, gcc.dg/c2x-utf8char-1.c, + gcc.dg/c2x-utf8char-2.c, gcc.dg/c2x-utf8char-3.c, + gcc.dg/gnu2x-utf8char-1.c: New tests. + 2019-11-14 Richard Sandiford PR testsuite/92366 diff --git a/gcc/testsuite/gcc.dg/c11-utf8char-1.c b/gcc/testsuite/gcc.dg/c11-utf8char-1.c new file mode 100644 index 00000000000..26dbd92b3ad --- /dev/null +++ b/gcc/testsuite/gcc.dg/c11-utf8char-1.c @@ -0,0 +1,7 @@ +/* Test C2x UTF-8 characters. Test not accepted for C11. */ +/* { dg-do compile } */ +/* { dg-options "-std=c11 -pedantic-errors" } */ + +#define z(x) 0 +#define u8 z( +unsigned char a = u8'a'); diff --git a/gcc/testsuite/gcc.dg/c2x-utf8char-1.c b/gcc/testsuite/gcc.dg/c2x-utf8char-1.c new file mode 100644 index 00000000000..76543afca82 --- /dev/null +++ b/gcc/testsuite/gcc.dg/c2x-utf8char-1.c @@ -0,0 +1,29 @@ +/* Test C2x UTF-8 characters. Test valid usages. 
*/ +/* { dg-do compile } */ +/* { dg-options "-std=c2x -pedantic-errors" } */ + +unsigned char a = u8'a'; +_Static_assert (u8'a' == 97); + +unsigned char b = u8'\0'; +_Static_assert (u8'\0' == 0); + +unsigned char c = u8'\xff'; +_Static_assert (u8'\xff' == 255); + +unsigned char d = u8'\377'; +_Static_assert (u8'\377' == 255); + +_Static_assert (sizeof (u8'a') == 1); +_Static_assert (sizeof (u8'\0') == 1); +_Static_assert (sizeof (u8'\xff') == 1); +_Static_assert (sizeof (u8'\377') == 1); + +_Static_assert (_Generic (u8'a', unsigned char: 1, default: 2) == 1); +_Static_assert (_Generic (u8'\0', unsigned char: 1, default: 2) == 1); +_Static_assert (_Generic (u8'\xff', unsigned char: 1, default: 2) == 1); +_Static_assert (_Generic (u8'\377', unsigned char: 1, default: 2) == 1); + +#if u8'\0' - 1 < 0 +#error "UTF-8 constants not unsigned in preprocessor" +#endif diff --git a/gcc/testsuite/gcc.dg/c2x-utf8char-2.c b/gcc/testsuite/gcc.dg/c2x-utf8char-2.c new file mode 100644 index 00000000000..4e6a2f6955f --- /dev/null +++ b/gcc/testsuite/gcc.dg/c2x-utf8char-2.c @@ -0,0 +1,8 @@ +/* Test C2x UTF-8 characters. Character values not affected by + different execution character set. */ +/* { dg-do compile } */ +/* { dg-require-iconv "IBM1047" } */ +/* { dg-options "-std=c2x -pedantic-errors -fexec-charset=IBM1047" } */ + +_Static_assert (u8'a' == 97); +_Static_assert (u8'a' != (unsigned char) 'a'); diff --git a/gcc/testsuite/gcc.dg/c2x-utf8char-3.c b/gcc/testsuite/gcc.dg/c2x-utf8char-3.c new file mode 100644 index 00000000000..7c489831998 --- /dev/null +++ b/gcc/testsuite/gcc.dg/c2x-utf8char-3.c @@ -0,0 +1,8 @@ +/* Test C2x UTF-8 characters. Test errors for invalid code. 
*/ +/* { dg-do compile } */ +/* { dg-options "-std=c2x -pedantic-errors" } */ + +unsigned char a = u8''; /* { dg-error "empty character constant" } */ +unsigned char b = u8'ab'; /* { dg-error "character constant too long for its type" } */ +unsigned char c = u8'\u00ff'; /* { dg-error "character constant too long for its type" } */ +unsigned char d = u8'\x100'; /* { dg-error "hex escape sequence out of range" } */ diff --git a/gcc/testsuite/gcc.dg/gnu2x-utf8char-1.c b/gcc/testsuite/gcc.dg/gnu2x-utf8char-1.c new file mode 100644 index 00000000000..9c3add2ae83 --- /dev/null +++ b/gcc/testsuite/gcc.dg/gnu2x-utf8char-1.c @@ -0,0 +1,5 @@ +/* Test C2x UTF-8 characters. Test accepted with -std=gnu2x. */ +/* { dg-do compile } */ +/* { dg-options "-std=gnu2x" } */ + +#include "c2x-utf8char-1.c" diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog index b57f26584a1..448f954d2ee 100644 --- a/libcpp/ChangeLog +++ b/libcpp/ChangeLog @@ -1,3 +1,10 @@ +2019-11-14 Joseph Myers + + * charset.c (narrow_str_to_charconst): Make CPP_UTF8CHAR constants + unsigned for C. + * init.c (lang_defaults): Set utf8_char_literals for GNUC2X and + STDC2X. + 2019-11-07 Jakub Jelinek PR c++/91370 - Implement P1041R4 and P1139R2 - Stronger Unicode reqs diff --git a/libcpp/charset.c b/libcpp/charset.c index 0b8815af46b..d4574415ac1 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -1928,6 +1928,8 @@ narrow_str_to_charconst (cpp_reader *pfile, cpp_string str, /* Multichar constants are of type int and therefore signed. 
*/ if (i > 1) unsigned_p = 0; + else if (type == CPP_UTF8CHAR && !CPP_OPTION (pfile, cplusplus)) + unsigned_p = 1; else unsigned_p = CPP_OPTION (pfile, unsigned_char); diff --git a/libcpp/init.c b/libcpp/init.c index 32b0e70a210..f5f41b012f8 100644 --- a/libcpp/init.c +++ b/libcpp/init.c @@ -102,13 +102,13 @@ static const struct lang_flags lang_defaults[] = /* GNUC99 */ { 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0 }, /* GNUC11 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0 }, /* GNUC17 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0 }, - /* GNUC2X */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1 }, + /* GNUC2X */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 }, /* STDC89 */ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 }, /* STDC94 */ { 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 }, /* STDC99 */ { 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 }, /* STDC11 */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0 }, /* STDC17 */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0 }, - /* STDC2X */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1 }, + /* STDC2X */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1 }, /* GNUCXX */ { 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0 }, /* CXX98 */ { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0 }, /* GNUCXX11 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0 },