From 299953ca95cc5ac47e52236e2eeb44afc5369134 Mon Sep 17 00:00:00 2001 From: Andrew Burgess Date: Fri, 26 Nov 2021 13:15:28 +0000 Subject: [PATCH] gdb/python: handle non utf-8 characters when source highlighting This commit adds support for source files that contain non utf-8 characters when performing source styling using the Python pygments package. This does not change the behaviour of GDB when the GNU Source Highlight library is used. For the following problem description, assume that either GDB is built without GNU Source Highlight support, of that this has been disabled using 'maintenance set gnu-source-highlight enabled off'. The initial problem reported was that a source file containing non utf-8 characters would cause GDB to print a Python exception, and then display the source without styling, e.g.: Python Exception : 'utf-8' codec can't decode byte 0xc0 in position 142: invalid start byte /* Source code here, without styling... */ Further, as the user steps through different source files, each time the problematic source file was evicted from the source cache, and then later reloaded, the exception would be printed again. Finally, this problem is only present when using Python 3, this issue is not present for Python 2. What makes this especially frustrating is that GDB can clearly print the source file contents, they're right there... If we disable styling completely, or make use of the GNU Source Highlight library, then everything is fine. So why is there an error when we try to apply styling using Python? The problem is the use of PyString_FromString (which is an alias for PyUnicode_FromString in Python 3), this function converts a C string into a either a Unicode object (Py3) or a str object (Py2). For Python 2 there is no unicode encoding performed during this function call, but for Python 3 the input is assumed to be a uft-8 encoding string for the purpose of the conversion. And here of course, is the problem, if the source file contains non utf-8 characters, then it should not be treated as utf-8, but that's what we do, and that's why we get an error. My first thought when looking at this was to spot when the PyString_FromString call failed with a UnicodeDecodeError and silently ignore the error. This would mean that GDB would print the source without styling, but would also avoid the annoying exception message. However, I also make use of `pygmentize`, a command line wrapper around the Python pygments module, which I use to apply syntax highlighting in the output of `less`. And this command line wrapper is quite happy to syntax highlight my source file that contains non utf-8 characters, so it feels like the problem should be solvable. It turns out that inside the pygments module there is already support for guessing the encoding of the incoming file content, if the incoming content is not already a Unicode string. This is what happens for Python 2 where the incoming content is of `str` type. We could try and make GDB smarter when it comes to converting C strings into Python Unicode objects; this would probably require us to just try a couple of different encoding schemes rather than just giving up after utf-8. However, I figure, why bother? The pygments module already does this for us, and the colorize API is not part of the documented external API of GDB. So, why not just change the colorize API, instead of the content being a Unicode string (for Python 3), lets just make the content be a bytes object. The pygments module can then take responsibility for guessing the encoding. So, currently, the colorize API receives a unicode object, and returns a unicode object. I propose that the colorize API receive a bytes object, and return a bytes object. --- gdb/python/lib/gdb/__init__.py | 6 +- gdb/python/python.c | 31 +++++---- gdb/testsuite/gdb.python/py-source-styling.c | 29 +++++++++ .../gdb.python/py-source-styling.exp | 64 +++++++++++++++++++ 4 files changed, 114 insertions(+), 16 deletions(-) create mode 100644 gdb/testsuite/gdb.python/py-source-styling.c create mode 100644 gdb/testsuite/gdb.python/py-source-styling.exp diff --git a/gdb/python/lib/gdb/__init__.py b/gdb/python/lib/gdb/__init__.py index d5e7eac45cc..9734a0d9437 100644 --- a/gdb/python/lib/gdb/__init__.py +++ b/gdb/python/lib/gdb/__init__.py @@ -234,7 +234,7 @@ def find_pc_line(pc): def set_parameter(name, value): """Set the GDB parameter NAME to VALUE.""" - execute('set ' + name + ' ' + str(value), to_string=True) + execute("set " + name + " " + str(value), to_string=True) @contextmanager @@ -258,7 +258,9 @@ try: try: lexer = lexers.get_lexer_for_filename(filename, stripnl=False) formatter = formatters.TerminalFormatter() - return highlight(contents, lexer, formatter) + return highlight(contents, lexer, formatter).encode( + host_charset(), "backslashreplace" + ) except: return None diff --git a/gdb/python/python.c b/gdb/python/python.c index 2c8081e1b07..2e659ee6e14 100644 --- a/gdb/python/python.c +++ b/gdb/python/python.c @@ -1172,13 +1172,24 @@ gdbpy_colorize (const std::string &filename, const std::string &contents) gdbpy_print_stack (); return {}; } - gdbpy_ref<> contents_arg (PyString_FromString (contents.c_str ())); + + /* The pygments library, which is what we currently use for applying + styling, is happy to take input as a bytes object, and to figure out + the encoding for itself. This removes the need for us to figure out + (guess?) at how the content is encoded, which is probably a good + thing. */ + gdbpy_ref<> contents_arg (PyBytes_FromStringAndSize (contents.c_str (), + contents.size ())); if (contents_arg == nullptr) { gdbpy_print_stack (); return {}; } + /* Calling gdb.colorize passing in the filename (a string), and the file + contents (a bytes object). This function should return either a bytes + object, the same contents with styling applied, or None to indicate + that no styling should be performed. */ gdbpy_ref<> result (PyObject_CallFunctionObjArgs (hook.get (), fname_arg.get (), contents_arg.get (), @@ -1189,25 +1200,17 @@ gdbpy_colorize (const std::string &filename, const std::string &contents) return {}; } - if (!gdbpy_is_string (result.get ())) + if (result == Py_None) return {}; - - gdbpy_ref<> unic = python_string_to_unicode (result.get ()); - if (unic == nullptr) - { - gdbpy_print_stack (); - return {}; - } - gdbpy_ref<> host_str (PyUnicode_AsEncodedString (unic.get (), - host_charset (), - nullptr)); - if (host_str == nullptr) + else if (!PyBytes_Check (result.get ())) { + PyErr_SetString (PyExc_TypeError, + _("Return value from gdb.colorize should be a bytes object or None.")); gdbpy_print_stack (); return {}; } - return std::string (PyBytes_AsString (host_str.get ())); + return std::string (PyBytes_AsString (result.get ())); } diff --git a/gdb/testsuite/gdb.python/py-source-styling.c b/gdb/testsuite/gdb.python/py-source-styling.c new file mode 100644 index 00000000000..e27f46053f5 --- /dev/null +++ b/gdb/testsuite/gdb.python/py-source-styling.c @@ -0,0 +1,29 @@ +/* This testcase is part of GDB, the GNU debugger. + + Copyright 2022 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +int +main () +{ + int some_variable = 1234; + + /* The following line contains a character that is non-utf-8. This is a + critical part of the test as Python 3 can't convert this into a string + using its default mechanism. */ + char c[] = "À"; /* List this line. */ + + return 0; +} diff --git a/gdb/testsuite/gdb.python/py-source-styling.exp b/gdb/testsuite/gdb.python/py-source-styling.exp new file mode 100644 index 00000000000..68bbc9f9758 --- /dev/null +++ b/gdb/testsuite/gdb.python/py-source-styling.exp @@ -0,0 +1,64 @@ +# Copyright (C) 2022 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# This file is part of the GDB testsuite. It checks for memory leaks +# associated with allocating and deallocation gdb.Inferior objects. + +load_lib gdb-python.exp + +standard_testfile + +save_vars { env(TERM) } { + # We need an ANSI-capable terminal to get the output, additionally + # we need to set LC_ALL so GDB knows the terminal is UTF-8 + # capable, otherwise we'll get a UnicodeEncodeError trying to + # encode the output. + setenv TERM ansi + + if { [prepare_for_testing "failed to prepare" ${testfile} ${srcfile}] } { + return -1 + } + + if { [skip_python_tests] } { continue } + + if { ![gdb_py_module_available "pygments"] } { + unsupported "pygments module not available" + return -1 + } + + if ![runto_main] { + return + } + + gdb_test_no_output "maint set gnu-source-highlight enabled off" + + gdb_test "maint flush source-cache" "Source cache flushed\\." + + set seen_style_escape false + set line_number [gdb_get_line_number "List this line."] + gdb_test_multiple "list ${line_number}" "" { + -re "Python Exception.*" { + fail $gdb_test_name + } + -re "\033" { + set seen_style_escape true + exp_continue + } + -re "$gdb_prompt $" { + gdb_assert { $seen_style_escape } + pass $gdb_test_name + } + } +} -- 2.30.2