From 6deb7a8185b9f359289bad3e86d7be248ac75550 Mon Sep 17 00:00:00 2001 From: Andrew Burgess Date: Sat, 27 Aug 2022 16:15:31 +0100 Subject: [PATCH] gdb/disasm: better intel flavour disassembly styling with Pygments This commit was inspired by this stackoverflow post: https://stackoverflow.com/questions/73491793/why-is-there-a-%C2%B1-in-lea-rax-rip-%C2%B1-0xeb3 One of the comments helpfully links to this Python test case: from pygments import formatters, lexers, highlight def colorize_disasm(content, gdbarch): try: lexer = lexers.get_lexer_by_name("asm") formatter = formatters.TerminalFormatter() return highlight(content, lexer, formatter).rstrip().encode() except: return None print(colorize_disasm("lea [rip+0x211] # COMMENT", None).decode()) Run the test case and you should see that the '+' character is underlined, and could be confused with a combined +/- symbol. What's happening is that Pygments is failing to parse the input text, and the '+' is actually being marked in the error style. The error style is red and underlined. It is worth noting that the assembly instruction being disassembled here is an x86-64 instruction in the 'intel' disassembly style, rather than the default att style. Clearly the Pygments module expects the att syntax by default. If we change the test case to this: from pygments import formatters, lexers, highlight def colorize_disasm(content, gdbarch): try: lexer = lexers.get_lexer_by_name("asm") lexer.add_filter('raiseonerror') formatter = formatters.TerminalFormatter() return highlight(content, lexer, formatter).rstrip().encode() except: return None res = colorize_disasm("lea rax,[rip+0xeb3] # COMMENT", None) if res: print(res.decode()) else: print("No result!") Here I've added the call: lexer.add_filter('raiseonerror'), and I am now checking to see if the result is None or not. Running this, the test now prints 'No result!' - instead of styling the '+' in the error style, we give up on the styling attempt. 
There are two things we need to fix relating to this disassembly text. First, Pygments is expecting att style disassembly, not the intel style that this example uses. Fortunately, Pygments also supports the intel style; all we need to do is use the 'nasm' lexer instead of the 'asm' lexer. However, this leads to the second problem; in our disassembler line we have '# COMMENT'. The "official" Intel disassembler style uses ';' for its comment character, however, gas and libopcodes use '#' as the comment character, as gas uses ';' for an instruction separator. Unfortunately, Pygments expects ';' as the comment character, and treats '#' as an error, which means, with the addition of the 'raiseonerror' filter, that any line containing a '#' comment will not get styled correctly. However, as the i386 disassembler never produces a '#' character other than for comments, we can easily "fix" Pygments parsing of the disassembly line. This is done by creating a filter. This filter looks for an Error token with the value '#', we then change this into a comment token. Every token after this (until the end of the line) is also converted into a comment. In this commit I do the following: 1. Check the 'disassembly-flavor' setting and select between the 'asm' and 'nasm' lexers based on the setting. If the setting is not available then the 'asm' lexer is used by default, 2. Use "add_filter('raiseonerror')" to ensure that the formatted output will not include any error text, which would be underlined, and might be confusing, 3. If the 'nasm' lexer is selected, then add an additional filter that will format '#' and all other text on the line, as a comment, and 4. If Pygments throws an exception, instead of returning None, return the original, unmodified content. This will mean that this one instruction is printed without styling, but GDB will continue to call into the Python code to style later instructions. 
I haven't included a test specifically for the above error case, though I have manually checked that the above case now styles correctly (with no underline). The existing style tests check that the disassembler styling still works though, so I know I've not generally broken things. One final thought I have after looking at this issue is that I wonder now if using Pygments for styling disassembly from every architecture is actually a good idea? Clearly, the 'asm' lexer is OK with att style x86-64, but not OK with intel style x86-64, so who knows how well it will handle other random architectures? When I first added this feature I tested it against some random RISC-V, ARM, and X86-64 (att style) code, and it seemed fine, but I never tried to make an exhaustive check of all instructions, so it's quite possible that there are corner cases where things are styled incorrectly. With the above changes I think that things should be a bit better now. If a particular instruction doesn't parse correctly then our Pygments based styling code will just not style that one instruction. This is combined with the fact that many architectures are now moving to libopcodes based styling, which is much more reliable. So, I think it is fine to keep using Pygments as a fallback mechanism for styling all architectures, even if we know it might not be perfect in all cases. 
--- gdb/python/lib/gdb/styling.py | 59 ++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/gdb/python/lib/gdb/styling.py b/gdb/python/lib/gdb/styling.py index aef39c6857c..b97f1dd7fb8 100644 --- a/gdb/python/lib/gdb/styling.py +++ b/gdb/python/lib/gdb/styling.py @@ -20,26 +20,77 @@ import gdb try: from pygments import formatters, lexers, highlight + from pygments.token import Error, Comment, Text + from pygments.filters import TokenMergeFilter + + _formatter = None + + def get_formatter(): + global _formatter + if _formatter is None: + _formatter = formatters.TerminalFormatter() + return _formatter def colorize(filename, contents): # Don't want any errors. try: lexer = lexers.get_lexer_for_filename(filename, stripnl=False) - formatter = formatters.TerminalFormatter() + formatter = get_formatter() return highlight(contents, lexer, formatter).encode( gdb.host_charset(), "backslashreplace" ) except: return None + class HandleNasmComments(TokenMergeFilter): + @staticmethod + def fix_comments(lexer, stream): + in_comment = False + for ttype, value in stream: + if ttype is Error and value == "#": + in_comment = True + if in_comment: + if ttype is Text and value == "\n": + in_comment = False + else: + ttype = Comment.Single + yield ttype, value + + def filter(self, lexer, stream): + f = HandleNasmComments.fix_comments + return super().filter(lexer, f(lexer, stream)) + + _asm_lexers = {} + + def __get_asm_lexer(gdbarch): + lexer_type = "asm" + try: + # For an i386 based architecture, in 'intel' mode, use the nasm + # lexer. + flavor = gdb.parameter("disassembly-flavor") + if flavor == "intel" and gdbarch.name()[:4] == "i386": + lexer_type = "nasm" + except: + # If GDB is built without i386 support then attempting to fetch + # the 'disassembly-flavor' parameter will throw an error, which we + # ignore. 
+ pass + + global _asm_lexers + if lexer_type not in _asm_lexers: + _asm_lexers[lexer_type] = lexers.get_lexer_by_name(lexer_type) + _asm_lexers[lexer_type].add_filter(HandleNasmComments()) + _asm_lexers[lexer_type].add_filter("raiseonerror") + return _asm_lexers[lexer_type] + def colorize_disasm(content, gdbarch): # Don't want any errors. try: - lexer = lexers.get_lexer_by_name("asm") - formatter = formatters.TerminalFormatter() + lexer = __get_asm_lexer(gdbarch) + formatter = get_formatter() return highlight(content, lexer, formatter).rstrip().encode() except: - return None + return content except: -- 2.30.2