examples: Add dwarf_lineprogram_filenames.py (#285)

author William Woodruff <william@trailofbits.com>

Sat, 7 Mar 2020 13:24:43 +0000 (08:24 -0500)

committer GitHub <noreply@github.com>

Sat, 7 Mar 2020 13:24:43 +0000 (05:24 -0800)
author William Woodruff <william@trailofbits.com>
Sat, 7 Mar 2020 13:24:43 +0000 (08:24 -0500)
committer GitHub <noreply@github.com>
Sat, 7 Mar 2020 13:24:43 +0000 (05:24 -0800)
diff --git a/examples/dwarf_lineprogram_filenames.py b/examples/dwarf_lineprogram_filenames.py

new file mode 100644 (file)

index 0000000..57a8cdd
--- /dev/null
+++ b/examples/dwarf_lineprogram_filenames.py
@@ -0,0 +1,95 @@
+#-------------------------------------------------------------------------------
+# elftools example: dwarf_lpe_filenames.py
+#
+# In the .debug_line section, the Dwarf line program generates a matrix
+# of address-source references. This example demonstrates accessing the state
+# of each line program entry to retrieve the underlying filenames.
+#
+# William Woodruff (william@yossarian.net)
+# This code is in the public domain
+#-------------------------------------------------------------------------------
+from __future__ import print_function
+from collections import defaultdict
+import os
+import sys
+
+# If pyelftools is not installed, the example can also run from the root or
+# examples/ dir of the source distribution.
+sys.path[0:0] = ['.', '..']
+
+from elftools.elf.elffile import ELFFile
+
+
+def process_file(filename):
+    print('Processing file:', filename)
+    with open(filename, 'rb') as f:
+        elffile = ELFFile(f)
+
+        if not elffile.has_dwarf_info():
+            print('  file has no DWARF info')
+            return
+
+        dwarfinfo = elffile.get_dwarf_info()
+        for CU in dwarfinfo.iter_CUs():
+            print('  Found a compile unit at offset %s, length %s' % (
+                CU.cu_offset, CU['unit_length']))
+
+            # Every compilation unit in the DWARF information may or may not
+            # have a corresponding line program in .debug_line.
+            line_program = dwarfinfo.line_program_for_CU(CU)
+            if line_program is None:
+                print('  DWARF info is missing a line program for this CU')
+                continue
+
+            # Print a reverse mapping of filename -> #entries
+            line_entry_mapping(line_program)
+
+
+def line_entry_mapping(line_program):
+    filename_map = defaultdict(int)
+
+    # The line program, when decoded, returns a list of line program
+    # entries. Each entry contains a state, which we'll use to build
+    # a reverse mapping of filename -> #entries.
+    lp_entries = line_program.get_entries()
+    for lpe in lp_entries:
+        # We skip LPEs that don't have an associated file.
+        # This can happen if instructions in the compiled binary
+        # don't correspond directly to any original source file.
+        if not lpe.state or lpe.state.file == 0:
+            continue
+        filename = lpe_filename(line_program, lpe.state.file)
+        filename_map[filename] += 1
+
+    for filename, lpe_count in filename_map.items():
+        print("    filename=%s -> %d entries" % (filename, lpe_count))
+
+
+def lpe_filename(line_program, file_index):
+    # Retrieving the filename associated with a line program entry
+    # involves two levels of indirection: we take the file index from
+    # the LPE to grab the file_entry from the line program header,
+    # then take the directory index from the file_entry to grab the
+    # directory name from the line program header. Finally, we
+    # join the (base) filename from the file_entry to the directory
+    # name to get the absolute filename.
+    lp_header = line_program.header
+    file_entries = lp_header["file_entry"]
+
+    # File and directory indices are 1-indexed.
+    file_entry = file_entries[file_index - 1]
+    dir_index = file_entry["dir_index"]
+
+    # A dir_index of 0 indicates that no absolute directory was recorded during
+    # compilation; return just the basename.
+    if dir_index == 0:
+        return file_entry.name.decode()
+
+    directory = lp_header["include_directory"][dir_index - 1]
+    return os.path.join(directory, file_entry.name).decode()
+
+
+if __name__ == '__main__':
+    if sys.argv[1] == '--test':
+        for filename in sys.argv[2:]:
+            process_file(filename)
diff --git a/examples/reference_output/dwarf_lineprogram_filenames.out b/examples/reference_output/dwarf_lineprogram_filenames.out

new file mode 100644 (file)

index 0000000..b20bbdd
--- /dev/null
+++ b/examples/reference_output/dwarf_lineprogram_filenames.out
@@ -0,0 +1,8 @@
+Processing file: ./examples/sample_exe64.elf
+  Found a compile unit at offset 0, length 115
+    filename=../sysdeps/x86_64/elf/start.S -> 13 entries
+  Found a compile unit at offset 119, length 135
+  Found a compile unit at offset 258, length 156
+    filename=z.c -> 5 entries
+  Found a compile unit at offset 418, length 300
+    filename=elf-init.c -> 15 entries
author	William Woodruff <william@trailofbits.com>
	Sat, 7 Mar 2020 13:24:43 +0000 (08:24 -0500)
committer	GitHub <noreply@github.com>
	Sat, 7 Mar 2020 13:24:43 +0000 (05:24 -0800)
examples/dwarf_lineprogram_filenames.py	[new file with mode: 0644]	patch \| blob
examples/reference_output/dwarf_lineprogram_filenames.out	[new file with mode: 0644]	patch \| blob