Support UTF-8 identifiers in C/C++ expressions (PR gdb/22973)

author Pedro Alves <palves@redhat.com>

Tue, 22 May 2018 16:35:38 +0000 (17:35 +0100)

committer Pedro Alves <palves@redhat.com>

Tue, 22 May 2018 16:35:38 +0000 (17:35 +0100)
author Pedro Alves <palves@redhat.com>
Tue, 22 May 2018 16:35:38 +0000 (17:35 +0100)
committer Pedro Alves <palves@redhat.com>
Tue, 22 May 2018 16:35:38 +0000 (17:35 +0100)
diff --git a/gdb/ChangeLog b/gdb/ChangeLog

index ff5e0a2cbe1e7cd0de780778469d42ba3d53433f..b34fa7b74e7c16095bbe8278d06ab33523eecb29 100644 (file)
--- a/gdb/ChangeLog
+++ b/gdb/ChangeLog
@@ -1,3 +1,20 @@
+2018-05-22  Pedro Alves  <palves@redhat.com>
+           張俊芝  <zjz@zjz.name>
+
+       PR gdb/22973
+       * c-exp.y: Include "c-support.h".
+       (parse_number, c_parse_escape, lex_one_token): Use TOLOWER instead
+       of tolower.  Use c_ident_is_alpha to scan names.
+       * c-lang.c: Include "c-support.h".
+       (convert_ucn, convert_octal, convert_hex, convert_escape): Use
+       ISXDIGIT instead of isxdigit and ISDIGIT instead of isdigit.
+       * c-support.h: New file, with bits factored out from ...
+       * cp-name-parser.y: ... this file.
+       Include "c-support.h".
+       (cp_ident_is_alpha, cp_ident_is_alnum): Deleted, moved to
+       c-support.h and renamed.
+       (symbol_end, yylex): Adjust.
+
  2018-05-22  Pedro Franco de Carvalho  <pedromfc@linux.vnet.ibm.com>
  
         * arch/ppc-linux-common.c (ppc_linux_has_isa205): Change the
diff --git a/gdb/c-exp.y b/gdb/c-exp.y

index 5e10d2a3b4370bf18aa0d46d924795130b602487..ae31af52dfb89e71c6de9b84dc79a0244ac54a3b 100644 (file)
--- a/gdb/c-exp.y
+++ b/gdb/c-exp.y
@@ -42,6 +42,7 @@
  #include "parser-defs.h"
  #include "language.h"
  #include "c-lang.h"
+#include "c-support.h"
  #include "bfd.h" /* Required by objfiles.h.  */
  #include "symfile.h" /* Required by objfiles.h.  */
  #include "objfiles.h" /* For have_full_symbols and have_partial_symbols */
@@ -1806,13 +1807,13 @@ parse_number (struct parser_state *par_state,
           len -= 2;
         }
        /* Handle suffixes: 'f' for float, 'l' for long double.  */
-      else if (len >= 1 && tolower (p[len - 1]) == 'f')
+      else if (len >= 1 && TOLOWER (p[len - 1]) == 'f')
         {
           putithere->typed_val_float.type
             = parse_type (par_state)->builtin_float;
           len -= 1;
         }
-      else if (len >= 1 && tolower (p[len - 1]) == 'l')
+      else if (len >= 1 && TOLOWER (p[len - 1]) == 'l')
         {
           putithere->typed_val_float.type
             = parse_type (par_state)->builtin_long_double;
@@ -2023,9 +2024,9 @@ c_parse_escape (const char **ptr, struct obstack *output)
        if (output)
         obstack_grow_str (output, "\\x");
        ++tokptr;
-      if (!isxdigit (*tokptr))
+      if (!ISXDIGIT (*tokptr))
         error (_("\\x escape without a following hex digit"));
-      while (isxdigit (*tokptr))
+      while (ISXDIGIT (*tokptr))
         {
           if (output)
             obstack_1grow (output, *tokptr);
@@ -2048,7 +2049,7 @@ c_parse_escape (const char **ptr, struct obstack *output)
         if (output)
           obstack_grow_str (output, "\\");
         for (i = 0;
-            i < 3 && isdigit (*tokptr) && *tokptr != '8' && *tokptr != '9';
+            i < 3 && ISDIGIT (*tokptr) && *tokptr != '8' && *tokptr != '9';
              ++i)
           {
             if (output)
@@ -2073,9 +2074,9 @@ c_parse_escape (const char **ptr, struct obstack *output)
             obstack_1grow (output, *tokptr);
           }
         ++tokptr;
-       if (!isxdigit (*tokptr))
+       if (!ISXDIGIT (*tokptr))
           error (_("\\%c escape without a following hex digit"), c);
-       for (i = 0; i < len && isxdigit (*tokptr); ++i)
+       for (i = 0; i < len && ISXDIGIT (*tokptr); ++i)
           {
             if (output)
               obstack_1grow (output, *tokptr);
@@ -2668,7 +2669,7 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
             size_t len = strlen ("selector");
  
             if (strncmp (p, "selector", len) == 0
-               && (p[len] == '\0' || isspace (p[len])))
+               && (p[len] == '\0' || ISSPACE (p[len])))
               {
                 lexptr = p + len;
                 return SELECTOR;
@@ -2677,9 +2678,9 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
               goto parse_string;
           }
  
-       while (isspace (*p))
+       while (ISSPACE (*p))
           p++;
-       if (strncmp (p, "entry", len) == 0 && !isalnum (p[len])
+       if (strncmp (p, "entry", len) == 0 && !c_ident_is_alnum (p[len])
             && p[len] != '_')
           {
             lexptr = &p[len];
@@ -2741,16 +2742,14 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
        }
      }
  
-  if (!(c == '_' || c == '$'
-       || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')))
+  if (!(c == '_' || c == '$' || c_ident_is_alpha (c)))
      /* We must have come across a bad character (e.g. ';').  */
      error (_("Invalid character '%c' in expression."), c);
  
    /* It's a name.  See how long it is.  */
    namelen = 0;
    for (c = tokstart[namelen];
-       (c == '_' || c == '$' || (c >= '0' && c <= '9')
-       || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '<');)
+       (c == '_' || c == '$' || c_ident_is_alnum (c) || c == '<');)
      {
        /* Template parameter lists are part of the name.
          FIXME: This mishandles `print $a<4&&$a>3'.  */
diff --git a/gdb/c-lang.c b/gdb/c-lang.c

index 15e633f8c8fc187d695b26ce5c4e4473661fcc4e..6bbb47095713ea4513c7b01ad386de2edeec9d13 100644 (file)
--- a/gdb/c-lang.c
+++ b/gdb/c-lang.c
@@ -25,6 +25,7 @@
  #include "language.h"
  #include "varobj.h"
  #include "c-lang.h"
+#include "c-support.h"
  #include "valprint.h"
  #include "macroscope.h"
  #include "charset.h"
@@ -382,7 +383,7 @@ convert_ucn (char *p, char *limit, const char *dest_charset,
    gdb_byte data[4];
    int i;
  
-  for (i = 0; i < length && p < limit && isxdigit (*p); ++i, ++p)
+  for (i = 0; i < length && p < limit && ISXDIGIT (*p); ++i, ++p)
      result = (result << 4) + host_hex_value (*p);
  
    for (i = 3; i >= 0; --i)
@@ -424,7 +425,7 @@ convert_octal (struct type *type, char *p,
    unsigned long value = 0;
  
    for (i = 0;
-       i < 3 && p < limit && isdigit (*p) && *p != '8' && *p != '9';
+       i < 3 && p < limit && ISDIGIT (*p) && *p != '8' && *p != '9';
         ++i)
      {
        value = 8 * value + host_hex_value (*p);
@@ -447,7 +448,7 @@ convert_hex (struct type *type, char *p,
  {
    unsigned long value = 0;
  
-  while (p < limit && isxdigit (*p))
+  while (p < limit && ISXDIGIT (*p))
      {
        value = 16 * value + host_hex_value (*p);
        ++p;
@@ -488,7 +489,7 @@ convert_escape (struct type *type, const char *dest_charset,
  
      case 'x':
        ADVANCE;
-      if (!isxdigit (*p))
+      if (!ISXDIGIT (*p))
         error (_("\\x used with no following hex digits."));
        p = convert_hex (type, p, limit, output);
        break;
@@ -510,7 +511,7 @@ convert_escape (struct type *type, const char *dest_charset,
         int length = *p == 'u' ? 4 : 8;
  
         ADVANCE;
-       if (!isxdigit (*p))
+       if (!ISXDIGIT (*p))
           error (_("\\u used with no following hex digits"));
         p = convert_ucn (p, limit, dest_charset, output, length);
        }
diff --git a/gdb/c-support.h b/gdb/c-support.h

new file mode 100644 (file)

index 0000000..3641d6f
--- /dev/null
+++ b/gdb/c-support.h
@@ -0,0 +1,46 @@
+/* Helper routines for C support in GDB.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+   This file is part of GDB.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifndef C_SUPPORT_H
+#define C_SUPPORT_H
+
+#include "safe-ctype.h"
+
+/* Like ISALPHA, but also returns true for the union of all UTF-8
+   multi-byte sequence bytes and non-ASCII characters in
+   extended-ASCII charsets (e.g., Latin1).  I.e., returns true if the
+   high bit is set.  Note that not all UTF-8 ranges are allowed in C++
+   identifiers, but we don't need to be pedantic so for simplicity we
+   ignore that here.  Plus this avoids the complication of actually
+   knowing what was the right encoding.  */
+
+static inline bool
+c_ident_is_alpha (unsigned char ch)
+{
+  return ISALPHA (ch) || ch >= 0x80;
+}
+
+/* Similarly, but Like ISALNUM.  */
+
+static inline bool
+c_ident_is_alnum (unsigned char ch)
+{
+  return ISALNUM (ch) || ch >= 0x80;
+}
+
+#endif /* C_SUPPORT_H */
diff --git a/gdb/cp-name-parser.y b/gdb/cp-name-parser.y

index f522e464191574c878ff65dbfb7ed022a318a951..ebae56261b31b9f7f9426a8f75d4065edbd0a57d 100644 (file)
--- a/gdb/cp-name-parser.y
+++ b/gdb/cp-name-parser.y
@@ -35,6 +35,7 @@
  #include "safe-ctype.h"
  #include "demangle.h"
  #include "cp-support.h"
+#include "c-support.h"
  
  /* Bison does not make it easy to create a parser without global
     state, unfortunately.  Here are all the global variables used
@@ -1304,28 +1305,6 @@ d_binary (const char *name, struct demangle_component *lhs, struct demangle_comp
                       fill_comp (DEMANGLE_COMPONENT_BINARY_ARGS, lhs, rhs));
  }
  
-/* Like ISALPHA, but also returns true for the union of all UTF-8
-   multi-byte sequence bytes and non-ASCII characters in
-   extended-ASCII charsets (e.g., Latin1).  I.e., returns true if the
-   high bit is set.  Note that not all UTF-8 ranges are allowed in C++
-   identifiers, but we don't need to be pedantic so for simplicity we
-   ignore that here.  Plus this avoids the complication of actually
-   knowing what was the right encoding.  */
-
-static inline bool
-cp_ident_is_alpha (unsigned char ch)
-{
-  return ISALPHA (ch) || ch >= 0x80;
-}
-
-/* Similarly, but Like ISALNUM.  */
-
-static inline bool
-cp_ident_is_alnum (unsigned char ch)
-{
-  return ISALNUM (ch) || ch >= 0x80;
-}
-
  /* Find the end of a symbol name starting at LEXPTR.  */
  
  static const char *
@@ -1333,7 +1312,7 @@ symbol_end (const char *lexptr)
  {
    const char *p = lexptr;
  
-  while (*p && (cp_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.'))
+  while (*p && (c_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.'))
      p++;
  
    return p;
@@ -1813,7 +1792,7 @@ yylex (void)
        return ERROR;
      }
  
-  if (!(c == '_' || c == '$' || cp_ident_is_alpha (c)))
+  if (!(c == '_' || c == '$' || c_ident_is_alpha (c)))
      {
        /* We must have come across a bad character (e.g. ';').  */
        yyerror (_("invalid character"));
@@ -1824,7 +1803,7 @@ yylex (void)
    namelen = 0;
    do
      c = tokstart[++namelen];
-  while (cp_ident_is_alnum (c) || c == '_' || c == '$');
+  while (c_ident_is_alnum (c) || c == '_' || c == '$');
  
    lexptr += namelen;
  
diff --git a/gdb/testsuite/ChangeLog b/gdb/testsuite/ChangeLog

index 208939bf82922bc4c56e3f4801fc5f7c73229eeb..393ab8884ab51611756f0395bd46aa81fe1a6302 100644 (file)
--- a/gdb/testsuite/ChangeLog
+++ b/gdb/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2018-05-22  Pedro Alves  <palves@redhat.com>
+
+       PR gdb/22973
+       * gdb.base/utf8-identifiers.c: New file.
+       * gdb.base/utf8-identifiers.exp: New file.
+
  2018-05-22  Pedro Franco de Carvalho  <pedromfc@linux.vnet.ibm.com>
  
         * gdb.arch/powerpc-fpscr-gcore.exp: New file.
diff --git a/gdb/testsuite/gdb.base/utf8-identifiers.c b/gdb/testsuite/gdb.base/utf8-identifiers.c

new file mode 100644 (file)

index 0000000..c80b42a
--- /dev/null
+++ b/gdb/testsuite/gdb.base/utf8-identifiers.c
@@ -0,0 +1,71 @@
+/* -*- coding: utf-8 -*- */
+
+/* This testcase is part of GDB, the GNU debugger.
+
+   Copyright 2017-2018 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* UTF-8 "função1".  */
+#define FUNCAO1 fun\u00e7\u00e3o1
+
+/* UTF-8 "função2".  */
+#define FUNCAO2 fun\u00e7\u00e3o2
+
+/* UTF-8 "my_função".  */
+#define MY_FUNCAO my_fun\u00e7\u00e3o
+
+/* UTF-8 "num_€".  */
+#define NUM_EUROS num_\u20ac
+
+struct S
+{
+  int NUM_EUROS;
+} g_s;
+
+void
+FUNCAO1 (void)
+{
+  g_s.NUM_EUROS = 1000;
+}
+
+void
+FUNCAO2 (void)
+{
+  g_s.NUM_EUROS = 1000;
+}
+
+void
+MY_FUNCAO (void)
+{
+}
+
+int NUM_EUROS = 2000;
+
+static void
+done ()
+{
+}
+
+int
+main ()
+{
+  FUNCAO1 ();
+  done ();
+  FUNCAO2 ();
+  MY_FUNCAO ();
+
+  return 0;
+}
diff --git a/gdb/testsuite/gdb.base/utf8-identifiers.exp b/gdb/testsuite/gdb.base/utf8-identifiers.exp

new file mode 100644 (file)

index 0000000..12fe376
--- /dev/null
+++ b/gdb/testsuite/gdb.base/utf8-identifiers.exp
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*- */
+
+# This testcase is part of GDB, the GNU debugger.
+
+# Copyright 2017-2018 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Test GDB's support for UTF-8 C/C++ identifiers.
+
+load_lib completion-support.exp
+
+standard_testfile
+
+# Enable basic use of UTF-8.  LC_ALL gets reset for each testfile.
+setenv LC_ALL C.UTF-8
+
+if { [prepare_for_testing "failed to prepare" ${testfile} [list $srcfile]] } {
+    return -1
+}
+
+if ![runto done] {
+    fail "couldn't run to done"
+    return
+}
+
+# Test expressions.
+gdb_test "print g_s.num_€" " = 1000"
+gdb_test "print num_€" " = 2000"
+
+# Test linespecs/breakpoints.
+gdb_test "break função2" "Breakpoint $decimal at .*$srcfile.*"
+
+set test "info breakpoints"
+gdb_test_multiple $test $test {
+    -re "in função2 at .*$srcfile.*$gdb_prompt $" {
+       pass $test
+    }
+}
+
+gdb_test "continue" \
+    "Breakpoint $decimal, função2 \\(\\) at .*$srcfile.*"
+
+# Unload symbols from shared libraries to avoid random symbol and file
+# names getting in the way of completion.
+gdb_test_no_output "nosharedlibrary"
+
+# Test linespec completion.
+
+# A unique completion.
+test_gdb_complete_unique "break my_fun" "break my_função"
+
+# A multiple-matches completion:
+
+# kfailed because gdb/readline display the completion match list like
+# this, with no separating space:
+#
+#  (gdb) break função[TAB]
+#  função1função2
+#
+# ... which is bogus.
+setup_kfail "gdb/23211" "*-*-*"
+test_gdb_complete_multiple "break " "fun" "ção" {"função1" "função2"}
+
+# Test expression completion.
+test_gdb_complete_unique "print g_s.num" "print g_s.num_€"
author	Pedro Alves <palves@redhat.com>
	Tue, 22 May 2018 16:35:38 +0000 (17:35 +0100)
committer	Pedro Alves <palves@redhat.com>
	Tue, 22 May 2018 16:35:38 +0000 (17:35 +0100)
gdb/ChangeLog		patch \| blob \| history
gdb/c-exp.y		patch \| blob \| history
gdb/c-lang.c		patch \| blob \| history
gdb/c-support.h	[new file with mode: 0644]	patch \| blob
gdb/cp-name-parser.y		patch \| blob \| history
gdb/testsuite/ChangeLog		patch \| blob \| history
gdb/testsuite/gdb.base/utf8-identifiers.c	[new file with mode: 0644]	patch \| blob
gdb/testsuite/gdb.base/utf8-identifiers.exp	[new file with mode: 0644]	patch \| blob