From b1b60145aedb8adcb0b9dcf43a5ae735c2f03b51 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Tue, 22 May 2018 17:35:38 +0100 Subject: [PATCH] Support UTF-8 identifiers in C/C++ expressions (PR gdb/22973) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Factor out cp_ident_is_alpha/cp_ident_is_alnum out of gdb/cp-name-parser.y and use it in the C/C++ expression parser too. New test included. gdb/ChangeLog: 2018-05-22 Pedro Alves 張俊芝 PR gdb/22973 * c-exp.y: Include "c-support.h". (parse_number, c_parse_escape, lex_one_token): Use TOLOWER instead of tolower. Use c_ident_is_alpha to scan names. * c-lang.c: Include "c-support.h". (convert_ucn, convert_octal, convert_hex, convert_escape): Use ISXDIGIT instead of isxdigit and ISDIGIT instead of isdigit. * c-support.h: New file, with bits factored out from ... * cp-name-parser.y: ... this file. Include "c-support.h". (cp_ident_is_alpha, cp_ident_is_alnum): Deleted, moved to c-support.h and renamed. (symbol_end, yylex): Adjust. gdb/testsuite/ChangeLog: 2018-05-22 Pedro Alves PR gdb/22973 * gdb.base/utf8-identifiers.c: New file. * gdb.base/utf8-identifiers.exp: New file. --- gdb/ChangeLog | 17 +++++ gdb/c-exp.y | 27 ++++---- gdb/c-lang.c | 11 +-- gdb/c-support.h | 46 ++++++++++++ gdb/cp-name-parser.y | 29 ++------ gdb/testsuite/ChangeLog | 6 ++ gdb/testsuite/gdb.base/utf8-identifiers.c | 71 +++++++++++++++++++ gdb/testsuite/gdb.base/utf8-identifiers.exp | 77 +++++++++++++++++++++ 8 files changed, 240 insertions(+), 44 deletions(-) create mode 100644 gdb/c-support.h create mode 100644 gdb/testsuite/gdb.base/utf8-identifiers.c create mode 100644 gdb/testsuite/gdb.base/utf8-identifiers.exp diff --git a/gdb/ChangeLog b/gdb/ChangeLog index ff5e0a2cbe1..b34fa7b74e7 100644 --- a/gdb/ChangeLog +++ b/gdb/ChangeLog @@ -1,3 +1,20 @@ +2018-05-22 Pedro Alves + 張俊芝 + + PR gdb/22973 + * c-exp.y: Include "c-support.h". + (parse_number, c_parse_escape, lex_one_token): Use TOLOWER instead + of tolower. Use c_ident_is_alpha to scan names. + * c-lang.c: Include "c-support.h". + (convert_ucn, convert_octal, convert_hex, convert_escape): Use + ISXDIGIT instead of isxdigit and ISDIGIT instead of isdigit. + * c-support.h: New file, with bits factored out from ... + * cp-name-parser.y: ... this file. + Include "c-support.h". + (cp_ident_is_alpha, cp_ident_is_alnum): Deleted, moved to + c-support.h and renamed. + (symbol_end, yylex): Adjust. + 2018-05-22 Pedro Franco de Carvalho * arch/ppc-linux-common.c (ppc_linux_has_isa205): Change the diff --git a/gdb/c-exp.y b/gdb/c-exp.y index 5e10d2a3b43..ae31af52dfb 100644 --- a/gdb/c-exp.y +++ b/gdb/c-exp.y @@ -42,6 +42,7 @@ #include "parser-defs.h" #include "language.h" #include "c-lang.h" +#include "c-support.h" #include "bfd.h" /* Required by objfiles.h. */ #include "symfile.h" /* Required by objfiles.h. */ #include "objfiles.h" /* For have_full_symbols and have_partial_symbols */ @@ -1806,13 +1807,13 @@ parse_number (struct parser_state *par_state, len -= 2; } /* Handle suffixes: 'f' for float, 'l' for long double. */ - else if (len >= 1 && tolower (p[len - 1]) == 'f') + else if (len >= 1 && TOLOWER (p[len - 1]) == 'f') { putithere->typed_val_float.type = parse_type (par_state)->builtin_float; len -= 1; } - else if (len >= 1 && tolower (p[len - 1]) == 'l') + else if (len >= 1 && TOLOWER (p[len - 1]) == 'l') { putithere->typed_val_float.type = parse_type (par_state)->builtin_long_double; @@ -2023,9 +2024,9 @@ c_parse_escape (const char **ptr, struct obstack *output) if (output) obstack_grow_str (output, "\\x"); ++tokptr; - if (!isxdigit (*tokptr)) + if (!ISXDIGIT (*tokptr)) error (_("\\x escape without a following hex digit")); - while (isxdigit (*tokptr)) + while (ISXDIGIT (*tokptr)) { if (output) obstack_1grow (output, *tokptr); @@ -2048,7 +2049,7 @@ c_parse_escape (const char **ptr, struct obstack *output) if (output) obstack_grow_str (output, "\\"); for (i = 0; - i < 3 && isdigit (*tokptr) && *tokptr != '8' && *tokptr != '9'; + i < 3 && ISDIGIT (*tokptr) && *tokptr != '8' && *tokptr != '9'; ++i) { if (output) @@ -2073,9 +2074,9 @@ c_parse_escape (const char **ptr, struct obstack *output) obstack_1grow (output, *tokptr); } ++tokptr; - if (!isxdigit (*tokptr)) + if (!ISXDIGIT (*tokptr)) error (_("\\%c escape without a following hex digit"), c); - for (i = 0; i < len && isxdigit (*tokptr); ++i) + for (i = 0; i < len && ISXDIGIT (*tokptr); ++i) { if (output) obstack_1grow (output, *tokptr); @@ -2668,7 +2669,7 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name) size_t len = strlen ("selector"); if (strncmp (p, "selector", len) == 0 - && (p[len] == '\0' || isspace (p[len]))) + && (p[len] == '\0' || ISSPACE (p[len]))) { lexptr = p + len; return SELECTOR; @@ -2677,9 +2678,9 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name) goto parse_string; } - while (isspace (*p)) + while (ISSPACE (*p)) p++; - if (strncmp (p, "entry", len) == 0 && !isalnum (p[len]) + if (strncmp (p, "entry", len) == 0 && !c_ident_is_alnum (p[len]) && p[len] != '_') { lexptr = &p[len]; @@ -2741,16 +2742,14 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name) } } - if (!(c == '_' || c == '$' - || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) + if (!(c == '_' || c == '$' || c_ident_is_alpha (c))) /* We must have come across a bad character (e.g. ';'). */ error (_("Invalid character '%c' in expression."), c); /* It's a name. See how long it is. */ namelen = 0; for (c = tokstart[namelen]; - (c == '_' || c == '$' || (c >= '0' && c <= '9') - || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '<');) + (c == '_' || c == '$' || c_ident_is_alnum (c) || c == '<');) { /* Template parameter lists are part of the name. FIXME: This mishandles `print $a<4&&$a>3'. */ diff --git a/gdb/c-lang.c b/gdb/c-lang.c index 15e633f8c8f..6bbb4709571 100644 --- a/gdb/c-lang.c +++ b/gdb/c-lang.c @@ -25,6 +25,7 @@ #include "language.h" #include "varobj.h" #include "c-lang.h" +#include "c-support.h" #include "valprint.h" #include "macroscope.h" #include "charset.h" @@ -382,7 +383,7 @@ convert_ucn (char *p, char *limit, const char *dest_charset, gdb_byte data[4]; int i; - for (i = 0; i < length && p < limit && isxdigit (*p); ++i, ++p) + for (i = 0; i < length && p < limit && ISXDIGIT (*p); ++i, ++p) result = (result << 4) + host_hex_value (*p); for (i = 3; i >= 0; --i) @@ -424,7 +425,7 @@ convert_octal (struct type *type, char *p, unsigned long value = 0; for (i = 0; - i < 3 && p < limit && isdigit (*p) && *p != '8' && *p != '9'; + i < 3 && p < limit && ISDIGIT (*p) && *p != '8' && *p != '9'; ++i) { value = 8 * value + host_hex_value (*p); @@ -447,7 +448,7 @@ convert_hex (struct type *type, char *p, { unsigned long value = 0; - while (p < limit && isxdigit (*p)) + while (p < limit && ISXDIGIT (*p)) { value = 16 * value + host_hex_value (*p); ++p; @@ -488,7 +489,7 @@ convert_escape (struct type *type, const char *dest_charset, case 'x': ADVANCE; - if (!isxdigit (*p)) + if (!ISXDIGIT (*p)) error (_("\\x used with no following hex digits.")); p = convert_hex (type, p, limit, output); break; @@ -510,7 +511,7 @@ convert_escape (struct type *type, const char *dest_charset, int length = *p == 'u' ? 4 : 8; ADVANCE; - if (!isxdigit (*p)) + if (!ISXDIGIT (*p)) error (_("\\u used with no following hex digits")); p = convert_ucn (p, limit, dest_charset, output, length); } diff --git a/gdb/c-support.h b/gdb/c-support.h new file mode 100644 index 00000000000..3641d6f534b --- /dev/null +++ b/gdb/c-support.h @@ -0,0 +1,46 @@ +/* Helper routines for C support in GDB. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of GDB. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef C_SUPPORT_H +#define C_SUPPORT_H + +#include "safe-ctype.h" + +/* Like ISALPHA, but also returns true for the union of all UTF-8 + multi-byte sequence bytes and non-ASCII characters in + extended-ASCII charsets (e.g., Latin1). I.e., returns true if the + high bit is set. Note that not all UTF-8 ranges are allowed in C++ + identifiers, but we don't need to be pedantic so for simplicity we + ignore that here. Plus this avoids the complication of actually + knowing what was the right encoding. */ + +static inline bool +c_ident_is_alpha (unsigned char ch) +{ + return ISALPHA (ch) || ch >= 0x80; +} + +/* Similarly, but Like ISALNUM. */ + +static inline bool +c_ident_is_alnum (unsigned char ch) +{ + return ISALNUM (ch) || ch >= 0x80; +} + +#endif /* C_SUPPORT_H */ diff --git a/gdb/cp-name-parser.y b/gdb/cp-name-parser.y index f522e464191..ebae56261b3 100644 --- a/gdb/cp-name-parser.y +++ b/gdb/cp-name-parser.y @@ -35,6 +35,7 @@ #include "safe-ctype.h" #include "demangle.h" #include "cp-support.h" +#include "c-support.h" /* Bison does not make it easy to create a parser without global state, unfortunately. Here are all the global variables used @@ -1304,28 +1305,6 @@ d_binary (const char *name, struct demangle_component *lhs, struct demangle_comp fill_comp (DEMANGLE_COMPONENT_BINARY_ARGS, lhs, rhs)); } -/* Like ISALPHA, but also returns true for the union of all UTF-8 - multi-byte sequence bytes and non-ASCII characters in - extended-ASCII charsets (e.g., Latin1). I.e., returns true if the - high bit is set. Note that not all UTF-8 ranges are allowed in C++ - identifiers, but we don't need to be pedantic so for simplicity we - ignore that here. Plus this avoids the complication of actually - knowing what was the right encoding. */ - -static inline bool -cp_ident_is_alpha (unsigned char ch) -{ - return ISALPHA (ch) || ch >= 0x80; -} - -/* Similarly, but Like ISALNUM. */ - -static inline bool -cp_ident_is_alnum (unsigned char ch) -{ - return ISALNUM (ch) || ch >= 0x80; -} - /* Find the end of a symbol name starting at LEXPTR. */ static const char * @@ -1333,7 +1312,7 @@ symbol_end (const char *lexptr) { const char *p = lexptr; - while (*p && (cp_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.')) + while (*p && (c_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.')) p++; return p; @@ -1813,7 +1792,7 @@ yylex (void) return ERROR; } - if (!(c == '_' || c == '$' || cp_ident_is_alpha (c))) + if (!(c == '_' || c == '$' || c_ident_is_alpha (c))) { /* We must have come across a bad character (e.g. ';'). */ yyerror (_("invalid character")); @@ -1824,7 +1803,7 @@ yylex (void) namelen = 0; do c = tokstart[++namelen]; - while (cp_ident_is_alnum (c) || c == '_' || c == '$'); + while (c_ident_is_alnum (c) || c == '_' || c == '$'); lexptr += namelen; diff --git a/gdb/testsuite/ChangeLog b/gdb/testsuite/ChangeLog index 208939bf829..393ab8884ab 100644 --- a/gdb/testsuite/ChangeLog +++ b/gdb/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2018-05-22 Pedro Alves + + PR gdb/22973 + * gdb.base/utf8-identifiers.c: New file. + * gdb.base/utf8-identifiers.exp: New file. + 2018-05-22 Pedro Franco de Carvalho * gdb.arch/powerpc-fpscr-gcore.exp: New file. diff --git a/gdb/testsuite/gdb.base/utf8-identifiers.c b/gdb/testsuite/gdb.base/utf8-identifiers.c new file mode 100644 index 00000000000..c80b42a03d2 --- /dev/null +++ b/gdb/testsuite/gdb.base/utf8-identifiers.c @@ -0,0 +1,71 @@ +/* -*- coding: utf-8 -*- */ + +/* This testcase is part of GDB, the GNU debugger. + + Copyright 2017-2018 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/* UTF-8 "função1". */ +#define FUNCAO1 fun\u00e7\u00e3o1 + +/* UTF-8 "função2". */ +#define FUNCAO2 fun\u00e7\u00e3o2 + +/* UTF-8 "my_função". */ +#define MY_FUNCAO my_fun\u00e7\u00e3o + +/* UTF-8 "num_€". */ +#define NUM_EUROS num_\u20ac + +struct S +{ + int NUM_EUROS; +} g_s; + +void +FUNCAO1 (void) +{ + g_s.NUM_EUROS = 1000; +} + +void +FUNCAO2 (void) +{ + g_s.NUM_EUROS = 1000; +} + +void +MY_FUNCAO (void) +{ +} + +int NUM_EUROS = 2000; + +static void +done () +{ +} + +int +main () +{ + FUNCAO1 (); + done (); + FUNCAO2 (); + MY_FUNCAO (); + + return 0; +} diff --git a/gdb/testsuite/gdb.base/utf8-identifiers.exp b/gdb/testsuite/gdb.base/utf8-identifiers.exp new file mode 100644 index 00000000000..12fe3768e2a --- /dev/null +++ b/gdb/testsuite/gdb.base/utf8-identifiers.exp @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- */ + +# This testcase is part of GDB, the GNU debugger. + +# Copyright 2017-2018 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Test GDB's support for UTF-8 C/C++ identifiers. + +load_lib completion-support.exp + +standard_testfile + +# Enable basic use of UTF-8. LC_ALL gets reset for each testfile. +setenv LC_ALL C.UTF-8 + +if { [prepare_for_testing "failed to prepare" ${testfile} [list $srcfile]] } { + return -1 +} + +if ![runto done] { + fail "couldn't run to done" + return +} + +# Test expressions. +gdb_test "print g_s.num_€" " = 1000" +gdb_test "print num_€" " = 2000" + +# Test linespecs/breakpoints. +gdb_test "break função2" "Breakpoint $decimal at .*$srcfile.*" + +set test "info breakpoints" +gdb_test_multiple $test $test { + -re "in função2 at .*$srcfile.*$gdb_prompt $" { + pass $test + } +} + +gdb_test "continue" \ + "Breakpoint $decimal, função2 \\(\\) at .*$srcfile.*" + +# Unload symbols from shared libraries to avoid random symbol and file +# names getting in the way of completion. +gdb_test_no_output "nosharedlibrary" + +# Test linespec completion. + +# A unique completion. +test_gdb_complete_unique "break my_fun" "break my_função" + +# A multiple-matches completion: + +# kfailed because gdb/readline display the completion match list like +# this, with no separating space: +# +# (gdb) break função[TAB] +# função1função2 +# +# ... which is bogus. +setup_kfail "gdb/23211" "*-*-*" +test_gdb_complete_multiple "break " "fun" "ção" {"função1" "função2"} + +# Test expression completion. +test_gdb_complete_unique "print g_s.num" "print g_s.num_€" -- 2.30.2