Allow non-ASCII characters in Rust identifiers

author Tom Tromey <tom@tromey.com>

Wed, 26 Jan 2022 22:39:03 +0000 (15:39 -0700)

committer Tom Tromey <tom@tromey.com>

Sun, 6 Feb 2022 20:24:25 +0000 (13:24 -0700)
author Tom Tromey <tom@tromey.com>
Wed, 26 Jan 2022 22:39:03 +0000 (15:39 -0700)
committer Tom Tromey <tom@tromey.com>
Sun, 6 Feb 2022 20:24:25 +0000 (13:24 -0700)
diff --git a/gdb/rust-parse.c b/gdb/rust-parse.c

index 894f2e79d497dddd8e36897a4e9d1bbeba19bace..8be7d33cfe98a27a2049da37ac9424dc52b7e4ca 100644 (file)
--- a/gdb/rust-parse.c
+++ b/gdb/rust-parse.c
@@ -33,6 +33,12 @@
  
  using namespace expr;
  
+#if WORDS_BIGENDIAN
+#define UTF32 "UTF-32BE"
+#else
+#define UTF32 "UTF-32LE"
+#endif
+
  /* A regular expression for matching Rust numbers.  This is split up
     since it is very long and this gives us a way to comment the
     sections.  */
@@ -577,6 +583,35 @@ rust_parser::lex_escape (int is_byte)
    return result;
  }
  
+/* A helper for lex_character.  Search forward for the closing single
+   quote, then convert the bytes from the host charset to UTF-32.  */
+
+static uint32_t
+lex_multibyte_char (const char *text, int *len)
+{
+  /* Scan forward to the closing quote or the end of the input; the
+     scan is not limited to the length of one UTF-8 sequence.  */
+  int quote;
+  gdb_assert (text[0] != '\'');
+  for (quote = 1; text[quote] != '\0' && text[quote] != '\''; ++quote)
+    ;
+  *len = quote;
+  /* The caller will issue an error.  */
+  if (text[quote] == '\0')
+    return 0;
+
+  auto_obstack result;
+  convert_between_encodings (host_charset (), UTF32, (const gdb_byte *) text,
+                            quote, 1, &result, translit_none);
+
+  int size = obstack_object_size (&result);
+  if (size > 4)
+    error (_("overlong character literal"));
+  uint32_t value;
+  memcpy (&value, obstack_finish (&result), size);
+  return value;
+}
+
  /* Lex a character constant.  */
  
  int
@@ -592,13 +627,15 @@ rust_parser::lex_character ()
      }
    gdb_assert (pstate->lexptr[0] == '\'');
    ++pstate->lexptr;
-  /* This should handle UTF-8 here.  */
-  if (pstate->lexptr[0] == '\\')
+  if (pstate->lexptr[0] == '\'')
+    error (_("empty character literal"));
+  else if (pstate->lexptr[0] == '\\')
      value = lex_escape (is_byte);
    else
      {
-      value = pstate->lexptr[0] & 0xff;
-      ++pstate->lexptr;
+      int len;
+      value = lex_multibyte_char (&pstate->lexptr[0], &len);
+      pstate->lexptr += len;
      }
  
    if (pstate->lexptr[0] != '\'')
@@ -695,16 +732,9 @@ rust_parser::lex_string ()
           if (is_byte)
             obstack_1grow (&obstack, value);
           else
-           {
-#if WORDS_BIGENDIAN
-#define UTF32 "UTF-32BE"
-#else
-#define UTF32 "UTF-32LE"
-#endif
-             convert_between_encodings (UTF32, "UTF-8", (gdb_byte *) &value,
-                                        sizeof (value), sizeof (value),
-                                        &obstack, translit_none);
-           }
+           convert_between_encodings (UTF32, "UTF-8", (gdb_byte *) &value,
+                                      sizeof (value), sizeof (value),
+                                      &obstack, translit_none);
         }
        else if (pstate->lexptr[0] == '\0')
         error (_("Unexpected EOF in string"));
@@ -746,7 +776,10 @@ rust_identifier_start_p (char c)
    return ((c >= 'a' && c <= 'z')
           || (c >= 'A' && c <= 'Z')
           || c == '_'
-         || c == '$');
+         || c == '$'
+         /* Allow any non-ASCII character to start an identifier.  There
+            doesn't seem to be a need to be picky about this.  */
+         || (c & 0x80) != 0);
  }
  
  /* Lex an identifier.  */
@@ -772,13 +805,14 @@ rust_parser::lex_identifier ()
  
    ++pstate->lexptr;
  
-  /* For the time being this doesn't handle Unicode rules.  Non-ASCII
-     identifiers are gated anyway.  */
+  /* Allow any non-ASCII character here.  This "handles" UTF-8 by
+     passing it through.  */
    while ((pstate->lexptr[0] >= 'a' && pstate->lexptr[0] <= 'z')
          || (pstate->lexptr[0] >= 'A' && pstate->lexptr[0] <= 'Z')
          || pstate->lexptr[0] == '_'
          || (is_gdb_var && pstate->lexptr[0] == '$')
-        || (pstate->lexptr[0] >= '0' && pstate->lexptr[0] <= '9'))
+        || (pstate->lexptr[0] >= '0' && pstate->lexptr[0] <= '9')
+        || (pstate->lexptr[0] & 0x80) != 0)
      ++pstate->lexptr;
  
  
diff --git a/gdb/testsuite/gdb.rust/unicode.exp b/gdb/testsuite/gdb.rust/unicode.exp

new file mode 100644 (file)

index 0000000..9de0a0e
--- /dev/null
+++ b/gdb/testsuite/gdb.rust/unicode.exp
@@ -0,0 +1,51 @@
+# Copyright (C) 2022 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Test non-ASCII (Unicode) identifiers.
+
+load_lib rust-support.exp
+if {[skip_rust_tests]} {
+    continue
+}
+
+# Non-ASCII identifiers were allowed starting in 1.53.
+set v [split [rust_compiler_version] .]
+if {[lindex $v 0] == 1 && [lindex $v 1] < 53} {
+    untested "this test requires rust 1.53 or greater"
+    return -1
+}
+
+# Enable basic use of UTF-8.  LC_ALL gets reset for each testfile.
+setenv LC_ALL C.UTF-8
+
+standard_testfile .rs
+if {[prepare_for_testing "failed to prepare" $testfile $srcfile {debug rust}]} {
+    return -1
+}
+
+set line [gdb_get_line_number "set breakpoint here"]
+if {![runto ${srcfile}:$line]} {
+    untested "could not run to breakpoint"
+    return -1
+}
+
+gdb_test "print 𝕯" " = 98" "print D"
+gdb_test "print \"𝕯\"" " = \"𝕯\"" "print D in string"
+# This output is maybe not ideal, but it also isn't incorrect.
+gdb_test "print '𝕯'" " = 120175 '\\\\u\\\{01d56f\\\}'" \
+    "print D as char"
+gdb_test "print cç" " = 97" "print cc"
+
+gdb_test "print 'çc'" "overlong character literal" "print cc as char"
diff --git a/gdb/testsuite/gdb.rust/unicode.rs b/gdb/testsuite/gdb.rust/unicode.rs

new file mode 100644 (file)

index 0000000..c6ca90e
--- /dev/null
+++ b/gdb/testsuite/gdb.rust/unicode.rs
@@ -0,0 +1,26 @@
+// Copyright (C) 2022 Free Software Foundation, Inc.
+
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#![allow(dead_code)]
+#![allow(unused_variables)]
+#![allow(unused_assignments)]
+#![allow(uncommon_codepoints)]
+#![allow(non_snake_case)]
+
+fn main() {
+    let 𝕯 = 98;
+    let cç = 97;
+    println!("{}, {}", 𝕯, cç);        // set breakpoint here
+}
author	Tom Tromey <tom@tromey.com>
	Wed, 26 Jan 2022 22:39:03 +0000 (15:39 -0700)
committer	Tom Tromey <tom@tromey.com>
	Sun, 6 Feb 2022 20:24:25 +0000 (13:24 -0700)
gdb/rust-parse.c		patch \| blob \| history
gdb/testsuite/gdb.rust/unicode.exp	[new file with mode: 0644]	patch \| blob
gdb/testsuite/gdb.rust/unicode.rs	[new file with mode: 0644]	patch \| blob