From: Tom Tromey Date: Wed, 26 Jan 2022 22:39:03 +0000 (-0700) Subject: Allow non-ASCII characters in Rust identifiers X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=a723766c0e2cc4e8d53813f90f1167e620da0784;p=binutils-gdb.git Allow non-ASCII characters in Rust identifiers Rust 1.53 (quite a while ago now) ungated the support for non-ASCII identifiers. This didn't work in gdb. This is PR rust/20166. This patch fixes the problem by allowing non-ASCII characters to be considered as identifier components. It seemed simplest to just pass them through -- doing any extra checking didn't seem worthwhile. The new test also verifies that such characters are allowed in strings and character literals as well. The latter also required a bit of work in the lexer. Bug: https://sourceware.org/bugzilla/show_bug.cgi?id=20166 --- diff --git a/gdb/rust-parse.c b/gdb/rust-parse.c index 894f2e79d49..8be7d33cfe9 100644 --- a/gdb/rust-parse.c +++ b/gdb/rust-parse.c @@ -33,6 +33,12 @@ using namespace expr; +#if WORDS_BIGENDIAN +#define UTF32 "UTF-32BE" +#else +#define UTF32 "UTF-32LE" +#endif + /* A regular expression for matching Rust numbers. This is split up since it is very long and this gives us a way to comment the sections. */ @@ -577,6 +583,35 @@ rust_parser::lex_escape (int is_byte) return result; } +/* A helper for lex_character. Search forward for the closing single + quote, then convert the bytes from the host charset to UTF-32. */ + +static uint32_t +lex_multibyte_char (const char *text, int *len) +{ + /* Only look a maximum of 5 bytes for the closing quote. This is + the maximum for UTF-8. */ + int quote; + gdb_assert (text[0] != '\''); + for (quote = 1; text[quote] != '\0' && text[quote] != '\''; ++quote) + ; + *len = quote; + /* The caller will issue an error. 
*/ + if (text[quote] == '\0') + return 0; + + auto_obstack result; + convert_between_encodings (host_charset (), UTF32, (const gdb_byte *) text, + quote, 1, &result, translit_none); + + int size = obstack_object_size (&result); + if (size > 4) + error (_("overlong character literal")); + uint32_t value; + memcpy (&value, obstack_finish (&result), size); + return value; +} + /* Lex a character constant. */ int @@ -592,13 +627,15 @@ rust_parser::lex_character () } gdb_assert (pstate->lexptr[0] == '\''); ++pstate->lexptr; - /* This should handle UTF-8 here. */ - if (pstate->lexptr[0] == '\\') + if (pstate->lexptr[0] == '\'') + error (_("empty character literal")); + else if (pstate->lexptr[0] == '\\') value = lex_escape (is_byte); else { - value = pstate->lexptr[0] & 0xff; - ++pstate->lexptr; + int len; + value = lex_multibyte_char (&pstate->lexptr[0], &len); + pstate->lexptr += len; } if (pstate->lexptr[0] != '\'') @@ -695,16 +732,9 @@ rust_parser::lex_string () if (is_byte) obstack_1grow (&obstack, value); else - { -#if WORDS_BIGENDIAN -#define UTF32 "UTF-32BE" -#else -#define UTF32 "UTF-32LE" -#endif - convert_between_encodings (UTF32, "UTF-8", (gdb_byte *) &value, - sizeof (value), sizeof (value), - &obstack, translit_none); - } + convert_between_encodings (UTF32, "UTF-8", (gdb_byte *) &value, + sizeof (value), sizeof (value), + &obstack, translit_none); } else if (pstate->lexptr[0] == '\0') error (_("Unexpected EOF in string")); @@ -746,7 +776,10 @@ rust_identifier_start_p (char c) return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' - || c == '$'); + || c == '$' + /* Allow any non-ASCII character as an identifier. There + doesn't seem to be a need to be picky about this. */ + || (c & 0x80) != 0); } /* Lex an identifier. */ @@ -772,13 +805,14 @@ rust_parser::lex_identifier () ++pstate->lexptr; - /* For the time being this doesn't handle Unicode rules. Non-ASCII - identifiers are gated anyway. */ + /* Allow any non-ASCII character here. 
This "handles" UTF-8 by + passing it through. */ while ((pstate->lexptr[0] >= 'a' && pstate->lexptr[0] <= 'z') || (pstate->lexptr[0] >= 'A' && pstate->lexptr[0] <= 'Z') || pstate->lexptr[0] == '_' || (is_gdb_var && pstate->lexptr[0] == '$') - || (pstate->lexptr[0] >= '0' && pstate->lexptr[0] <= '9')) + || (pstate->lexptr[0] >= '0' && pstate->lexptr[0] <= '9') + || (pstate->lexptr[0] & 0x80) != 0) ++pstate->lexptr; diff --git a/gdb/testsuite/gdb.rust/unicode.exp b/gdb/testsuite/gdb.rust/unicode.exp new file mode 100644 index 00000000000..9de0a0e724f --- /dev/null +++ b/gdb/testsuite/gdb.rust/unicode.exp @@ -0,0 +1,51 @@ +# Copyright (C) 2022 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# Test raw identifiers. + +load_lib rust-support.exp +if {[skip_rust_tests]} { + continue +} + +# Non-ASCII identifiers were allowed starting in 1.53. +set v [split [rust_compiler_version] .] +if {[lindex $v 0] == 1 && [lindex $v 1] < 53} { + untested "this test requires rust 1.53 or greater" + return -1 +} + +# Enable basic use of UTF-8. LC_ALL gets reset for each testfile. 
+setenv LC_ALL C.UTF-8 + +standard_testfile .rs +if {[prepare_for_testing "failed to prepare" $testfile $srcfile {debug rust}]} { + return -1 +} + +set line [gdb_get_line_number "set breakpoint here"] +if {![runto ${srcfile}:$line]} { + untested "could not run to breakpoint" + return -1 +} + +gdb_test "print 𝕯" " = 98" "print D" +gdb_test "print \"𝕯\"" " = \"𝕯\"" "print D in string" +# This output is maybe not ideal, but it also isn't incorrect. +gdb_test "print '𝕯'" " = 120175 '\\\\u\\\{01d56f\\\}'" \ + "print D as char" +gdb_test "print cç" " = 97" "print cc" + +gdb_test "print 'çc'" "overlong character literal" "print cc as char" diff --git a/gdb/testsuite/gdb.rust/unicode.rs b/gdb/testsuite/gdb.rust/unicode.rs new file mode 100644 index 00000000000..c6ca90e6450 --- /dev/null +++ b/gdb/testsuite/gdb.rust/unicode.rs @@ -0,0 +1,26 @@ +// Copyright (C) 2022 Free Software Foundation, Inc. + +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +#![allow(dead_code)] +#![allow(unused_variables)] +#![allow(unused_assignments)] +#![allow(uncommon_codepoints)] +#![allow(non_snake_case)] + +fn main() { + let 𝕯 = 98; + let cç = 97; + println!("{}, {}", 𝕯, cç); // set breakpoint here +}