Added support for decoding .debug_pubtypes and .debug_pubnames sections (#208)
authorrvijayc <44033253+rvijayc@users.noreply.github.com>
Mon, 24 Dec 2018 14:02:08 +0000 (06:02 -0800)
committerEli Bendersky <eliben@users.noreply.github.com>
Mon, 24 Dec 2018 14:02:08 +0000 (06:02 -0800)
* Added support for decoding .debug_pubtypes and .debug_pubnames sections

* Added reference output to dwarf_pubnames_types.py example.

* Added readelf support, fixed review comments and documentation updates

* Avoid printing the entire die in pubnames example to work around Python 2 vs 3 incompatibilities

elftools/dwarf/dwarfinfo.py
elftools/dwarf/namelut.py [new file with mode: 0755]
elftools/dwarf/structs.py
elftools/elf/elffile.py
examples/dwarf_pubnames_types.py [new file with mode: 0644]
examples/reference_output/dwarf_pubnames_types.out [new file with mode: 0644]
scripts/readelf.py
test/run_readelf_tests.py

index b8faf9d7ac57b0df106eece33870e29343c1c156..45903ec868f90d2dd23c1f0cb633a21181794520 100644 (file)
@@ -19,6 +19,7 @@ from .callframe import CallFrameInfo
 from .locationlists import LocationLists
 from .ranges import RangeLists
 from .aranges import ARanges
+from .namelut import NameLUT
 
 
 # Describes a debug section
@@ -67,7 +68,9 @@ class DWARFInfo(object):
             debug_str_sec,
             debug_loc_sec,
             debug_ranges_sec,
-            debug_line_sec):
+            debug_line_sec,
+            debug_pubtypes_sec,
+            debug_pubnames_sec):
         """ config:
                 A DwarfConfig object
 
@@ -86,6 +89,8 @@ class DWARFInfo(object):
         self.debug_loc_sec = debug_loc_sec
         self.debug_ranges_sec = debug_ranges_sec
         self.debug_line_sec = debug_line_sec
+        self.debug_pubtypes_sec = debug_pubtypes_sec
+        self.debug_pubnames_sec = debug_pubnames_sec
 
         # This is the DWARFStructs the context uses, so it doesn't depend on
         # DWARF format and address_size (these are determined per CU) - set them
@@ -185,6 +190,38 @@ class DWARFInfo(object):
             for_eh_frame=True)
         return cfi.get_entries()
 
+    def get_pubtypes(self):
+        """
+        Returns a NameLUT object that contains information read from the
+        .debug_pubtypes section in the ELF file.
+
+        NameLUT is essentially a dictionary containing the CU/DIE offsets of
+        each symbol. See the NameLUT doc string for more details.
+        """
+
+        if self.debug_pubtypes_sec:
+            return NameLUT(self.debug_pubtypes_sec.stream,
+                    self.debug_pubtypes_sec.size,
+                    self.structs)
+        else:
+            return None
+
+    def get_pubnames(self):
+        """
+        Returns a NameLUT object that contains information read from the
+        .debug_pubnames section in the ELF file.
+
+        NameLUT is essentially a dictionary containing the CU/DIE offsets of
+        each symbol. See the NameLUT doc string for more details.
+        """
+
+        if self.debug_pubnames_sec:
+            return NameLUT(self.debug_pubnames_sec.stream,
+                    self.debug_pubnames_sec.size,
+                    self.structs)
+        else:
+            return None
+
     def get_aranges(self):
         """ Get an ARanges object representing the .debug_aranges section of
             the DWARF data, or None if the section doesn't exist
diff --git a/elftools/dwarf/namelut.py b/elftools/dwarf/namelut.py
new file mode 100755 (executable)
index 0000000..b7de798
--- /dev/null
@@ -0,0 +1,202 @@
+#-------------------------------------------------------------------------------
+# elftools: dwarf/namelut.py
+#
+# DWARF pubtypes/pubnames section decoding (.debug_pubtypes, .debug_pubnames)
+#
+# Vijay Ramasami (rvijayc@gmail.com)
+# This code is in the public domain
+#-------------------------------------------------------------------------------
+import os
+import collections
+from collections import OrderedDict
+from ..common.utils import struct_parse
+from bisect import bisect_right
+import math
+from ..construct import CString, Struct
+
+NameLUTEntry = collections.namedtuple('NameLUTEntry', 'cu_ofs die_ofs')
+
+class NameLUT(collections.Mapping):
+    """ 
+    A "Name LUT" holds any of the tables specified by .debug_pubtypes or
+    .debug_pubnames sections. This is basically a dictionary where the key is
+    the symbol name (either a public variable, function or a type), and the
+    value is the tuple (cu_offset, die_offset) corresponding to the variable.
+    The die_offset is an absolute offset (meaning, it can be used to search the
+    CU by iterating until a match is obtained).
+
+    An ordered dictionary is used to preserve the CU order (i.e., items are
+    stored on a per-CU basis, as they were originally in the .debug_* section).
+
+    Usage:
+
+    The NameLUT walks and talks like a dictionary and hence it can be used as
+    such. Some examples below:
+
+    # get the pubnames (a NameLUT from DWARF info).
+    pubnames = dwarf_info.get_pubnames()
+
+    # lookup a variable.
+    entry1 = pubnames["var_name1"]
+    entry2 = pubnames.get("var_name2", default=<default_var>)
+    print(entry2.cu_ofs)
+    ...
+
+    # iterate over items.
+    for (name, entry) in pubnames.items():
+      # do stuff with name, entry.cu_ofs, entry.die_ofs
+
+    # iterate over items on a per-CU basis.
+    import itertools
+    for cu_ofs, item_list in itertools.groupby(pubnames.items(), 
+        key = lambda x: x[1].cu_ofs):
+      # items are now grouped by cu_ofs.
+      # item_list is an iterator yielding (name, NameLUTEntry) pairs
+      # belonging to cu_ofs.
+      # We can parse the CU at cu_offset and use the parsed CU results
+      # to parse the pubname DIEs in the CU listed by item_list.
+      for item in item_list:
+        # work with item which is part of the CU with cu_ofs.
+
+    """
+
+    def __init__(self, stream, size, structs):
+
+        self._stream = stream
+        self._size = size
+        self._structs = structs
+        # entries are lazily loaded on demand.
+        self._entries = None
+        # CU headers (for readelf).
+        self._cu_headers = None
+
+    def get_entries(self):
+        """
+        Returns the parsed NameLUT entries. The returned object is a dictionary
+        with the symbol name as the key and NameLUTEntry(cu_ofs, die_ofs) as
+        the value.
+
+        This is useful when dealing with very large ELF files with millions of
+        entries. The returned entries can be pickled to a file and restored by
+        calling set_entries on subsequent loads.
+        """
+        if self._entries is None: 
+            self._entries, self._cu_headers = self._get_entries()
+        return self._entries
+
+    def set_entries(self, entries, cu_headers):
+        """
+        Set the NameLUT entries from an external source. The input is a
+        dictionary with the symbol name as the key and NameLUTEntry(cu_ofs,
+        die_ofs) as the value.
+        
+        This option is useful when dealing with very large ELF files with
+        millions of entries. The entries can be parsed once and pickled to a
+        file and can be restored via this function on subsequent loads.
+        """
+        self._entries = entries
+        self._cu_headers = cu_headers
+
+    def __len__(self):
+        """
+        Returns the number of entries in the NameLUT.
+        """
+        if self._entries is None: 
+            self._entries, self._cu_headers = self._get_entries()
+        return len(self._entries)
+
+    def __getitem__(self, name):
+        """
+        Returns a namedtuple - NameLUTEntry(cu_ofs, die_ofs) - that corresponds
+        to the given symbol name.
+        """
+        if self._entries is None: 
+            self._entries, self._cu_headers = self._get_entries()
+        return self._entries.get(name)
+
+    def __iter__(self):
+        """
+        Returns an iterator to the NameLUT dictionary.
+        """
+        if self._entries is None: 
+            self._entries, self._cu_headers = self._get_entries()
+        return iter(self._entries)
+
+    def items(self):
+        """
+        Returns the NameLUT dictionary items.
+        """
+        if self._entries is None: 
+            self._entries, self._cu_headers = self._get_entries()
+        return self._entries.items()
+
+    def get(self, name, default=None):
+        """
+        Returns NameLUTEntry(cu_ofs, die_ofs) for the provided symbol name, or
+        default (None unless given) if the symbol is not in the section.
+        """
+        if self._entries is None: 
+            self._entries, self._cu_headers = self._get_entries()
+        return self._entries.get(name, default)
+
+    def get_cu_headers(self):
+        """
+        Returns all CU headers. Mainly required for readelf.
+        """
+        if self._cu_headers is None: 
+            self._entries, self._cu_headers = self._get_entries()
+        
+        return self._cu_headers
+
+    def _get_entries(self):
+        """
+        Parse the (name, cu_ofs, die_ofs) information from this section and
+        store as a dictionary.
+        """
+
+        self._stream.seek(0)
+        entries = OrderedDict()
+        cu_headers = []
+        offset = 0
+        entry_struct = Struct("Dwarf_offset_name_pair",
+                self._structs.Dwarf_offset('die_ofs'),
+                CString('name'))
+        die_ofs_struct = self._structs.Dwarf_offset('die_ofs')
+                
+        # each run of this loop will fetch one CU worth of entries.
+        while offset < self._size:
+
+            # read the header for this CU.
+            namelut_hdr = struct_parse(self._structs.Dwarf_nameLUT_header,
+                    self._stream, offset)
+            cu_headers.append(namelut_hdr)
+            # compute the next offset.
+            offset = (offset + namelut_hdr.unit_length +
+                     self._structs.initial_length_field_size())
+
+            bytes_read = 0
+            # before inner loop, latch data that will be used in the inner
+            # loop to avoid attribute access and other computation.
+            hdr_cu_ofs = namelut_hdr.debug_info_offset
+            # read the first tuple for this CU.
+            entry = struct_parse(entry_struct,
+                    self._stream)
+            # loop until a die_ofs of zero (which indicates the end) is read ...
+            while True:
+                # add this entry to the look-up dictionary.
+                entries[entry.name.decode('utf-8')] = NameLUTEntry(
+                        cu_ofs = hdr_cu_ofs,
+                        die_ofs = hdr_cu_ofs + entry.die_ofs)
+                # get the DIE offset entry alone.
+                die_ofs = struct_parse(die_ofs_struct, self._stream)
+                # if it is zero, then we are done.
+                if die_ofs == 0:
+                    break
+                else:
+                    # else this is a valid DIE, get the name as well and 
+                    # construct the entry
+                    entry.name = struct_parse(CString('name'), self._stream)
+                    entry.die_ofs = die_ofs
+
+        # return the entries parsed so far.
+        return (entries, cu_headers)
index f3b6ef3b673fd0e47c26a64417c691d0c84bf293..6dde82b865afc1e57ff5d590af7fa844f7178994 100644 (file)
@@ -34,6 +34,9 @@ class DWARFStructs(object):
             Dwarf_offset:
                 32-bit or 64-bit word, depending on dwarf_format
 
+            Dwarf_length:
+                32-bit or 64-bit word, depending on dwarf_format
+
             Dwarf_target_addr:
                 32-bit or 64-bit word, depending on address size
 
@@ -105,6 +108,7 @@ class DWARFStructs(object):
             self.Dwarf_uint32 = ULInt32
             self.Dwarf_uint64 = ULInt64
             self.Dwarf_offset = ULInt32 if self.dwarf_format == 32 else ULInt64
+            self.Dwarf_length = ULInt32 if self.dwarf_format == 32 else ULInt64
             self.Dwarf_target_addr = (
                 ULInt32 if self.address_size == 4 else ULInt64)
             self.Dwarf_int8 = SLInt8
@@ -117,6 +121,7 @@ class DWARFStructs(object):
             self.Dwarf_uint32 = UBInt32
             self.Dwarf_uint64 = UBInt64
             self.Dwarf_offset = UBInt32 if self.dwarf_format == 32 else UBInt64
+            self.Dwarf_length = UBInt32 if self.dwarf_format == 32 else UBInt64
             self.Dwarf_target_addr = (
                 UBInt32 if self.address_size == 4 else UBInt64)
             self.Dwarf_int8 = SBInt8
@@ -132,6 +137,7 @@ class DWARFStructs(object):
         self._create_lineprog_header()
         self._create_callframe_entry_headers()
         self._create_aranges_header()
+        self._create_nameLUT_header()
 
     def _create_initial_length(self):
         def _InitialLength(name):
@@ -218,6 +224,14 @@ class DWARFStructs(object):
             self.Dwarf_uint8('segment_size')
             )
 
+    def _create_nameLUT_header(self):
+        self.Dwarf_nameLUT_header = Struct("Dwarf_nameLUT_header",
+            self.Dwarf_initial_length('unit_length'),
+            self.Dwarf_uint16('version'),
+            self.Dwarf_offset('debug_info_offset'), 
+            self.Dwarf_length('debug_info_length')
+            )
+
     def _create_lineprog_header(self):
         # A file entry is terminated by a NULL byte, so we don't want to parse
         # past it. Therefore an If is used.
index 6c029488f42450f25eb677e758f0264178355dcd..537a647f8e2556e747d26e6f77ba29000c60a153 100644 (file)
@@ -167,7 +167,8 @@ class ELFFile(object):
 
         section_names = ('.debug_info', '.debug_aranges', '.debug_abbrev',
                          '.debug_str', '.debug_line', '.debug_frame',
-                         '.debug_loc', '.debug_ranges')
+                         '.debug_loc', '.debug_ranges', '.debug_pubtypes', 
+                         '.debug_pubnames')
 
         compressed = bool(self.get_section_by_name('.zdebug_info'))
         if compressed:
@@ -178,8 +179,8 @@ class ELFFile(object):
 
         (debug_info_sec_name, debug_aranges_sec_name, debug_abbrev_sec_name,
          debug_str_sec_name, debug_line_sec_name, debug_frame_sec_name,
-         debug_loc_sec_name, debug_ranges_sec_name,
-         eh_frame_sec_name) = section_names
+         debug_loc_sec_name, debug_ranges_sec_name, debug_pubtypes_name,
+         debug_pubnames_name, eh_frame_sec_name) = section_names
 
         debug_sections = {}
         for secname in section_names:
@@ -207,7 +208,10 @@ class ELFFile(object):
                 debug_str_sec=debug_sections[debug_str_sec_name],
                 debug_loc_sec=debug_sections[debug_loc_sec_name],
                 debug_ranges_sec=debug_sections[debug_ranges_sec_name],
-                debug_line_sec=debug_sections[debug_line_sec_name])
+                debug_line_sec=debug_sections[debug_line_sec_name],
+                debug_pubtypes_sec = debug_sections[debug_pubtypes_name],
+                debug_pubnames_sec = debug_sections[debug_pubnames_name]
+                )
 
     def get_machine_arch(self):
         """ Return the machine architecture, as detected from the ELF header.
diff --git a/examples/dwarf_pubnames_types.py b/examples/dwarf_pubnames_types.py
new file mode 100644 (file)
index 0000000..9a32ce1
--- /dev/null
@@ -0,0 +1,114 @@
+#-------------------------------------------------------------------------------
+# elftools example: dwarf_pubnames_types.py
+#
+# Dump the contents of .debug_pubnames and .debug_pubtypes sections from the
+# ELF file.
+#
+# Note: sample_exe64.elf doesn't have a .debug_pubtypes section.
+#
+# Vijay Ramasami (rvijayc@gmail.com)
+# This code is in the public domain
+#-------------------------------------------------------------------------------
+from __future__ import print_function
+import sys
+
+# If pyelftools is not installed, the example can also run from the root or
+# examples/ dir of the source distribution.
+sys.path[0:0] = ['.', '..']
+
+from elftools.elf.elffile import ELFFile
+from elftools.common.py3compat import bytes2str
+
+def process_file(filename):
+    print('Processing file:', filename)
+    with open(filename, 'rb') as f:
+        elffile = ELFFile(f)
+
+        if not elffile.has_dwarf_info():
+            print('  file has no DWARF info')
+            return
+
+        # get_dwarf_info returns a DWARFInfo context object, which is the
+        # starting point for all DWARF-based processing in pyelftools.
+        dwarfinfo = elffile.get_dwarf_info()
+
+        # get .debug_pubnames section.
+        pubnames = dwarfinfo.get_pubnames()
+        if pubnames is None:
+            print('ERROR: No .debug_pubnames section found in ELF.')
+        else:
+            print('%d entries found in .debug_pubnames' % len(pubnames))
+            
+            # try getting information on a global symbol.
+            print('Trying pubnames example ...')
+            sym_name = 'main'
+            try:
+                entry = pubnames[sym_name]
+            except KeyError:
+                print('ERROR: No pubname entry found for ' + sym_name)
+            else:
+                print('%s: cu_ofs = %d, die_ofs = %d' %
+                        (sym_name, entry.cu_ofs, entry.die_ofs))
+
+                # get the actual CU/DIE that has this information.
+                print('Fetching the actual die for %s ...' % sym_name)
+                for cu in dwarfinfo.iter_CUs():
+                    if cu.cu_offset == entry.cu_ofs:
+                        for die in cu.iter_DIEs():
+                            if die.offset == entry.die_ofs:
+                                print('Die Name: %s' % 
+                                        bytes2str(die.attributes['DW_AT_name'].value))
+        
+            # dump all entries in .debug_pubnames section.
+            print('Dumping .debug_pubnames table ...')
+            print('-' * 66)
+            print('%50s%8s%8s' % ('Symbol', 'CU_OFS', 'DIE_OFS'))
+            print('-' * 66)
+            for (name, entry) in pubnames.items():
+                print('%50s%8d%8d' % (name, entry.cu_ofs, entry.die_ofs))
+            print('-' * 66)
+
+        # get .debug_pubtypes section.
+        pubtypes = dwarfinfo.get_pubtypes()
+        if pubtypes is None:
+            print('ERROR: No .debug_pubtypes section found in ELF')
+        else:
+            print('%d entries found in .debug_pubtypes' % len(pubtypes))
+
+            # try getting information on a global type.
+            sym_name = 'char'
+            # note: using the .get() API (pubtypes[key] will also work).
+            entry = pubtypes.get(sym_name)
+            if entry is None:
+                print('ERROR: No pubtype entry for %s' % sym_name)
+            else:
+                print('%s: cu_ofs %d, die_ofs %d' %
+                        (sym_name, entry.cu_ofs, entry.die_ofs))
+
+                # get the actual CU/DIE that has this information.
+                print('Fetching the actual die for %s ...' % sym_name)
+                for cu in dwarfinfo.iter_CUs():
+                    if cu.cu_offset == entry.cu_ofs:
+                        for die in cu.iter_DIEs():
+                            if die.offset == entry.die_ofs:
+                                print('Die Name: %s' % 
+                                        bytes2str(die.attributes['DW_AT_name'].value))
+        
+            # dump all entries in .debug_pubtypes section.
+            print('Dumping .debug_pubtypes table ...')
+            print('-' * 66)
+            print('%50s%8s%8s' % ('Symbol', 'CU_OFS', 'DIE_OFS'))
+            print('-' * 66)
+            for (name, entry) in pubtypes.items():
+                print('%50s%8d%8d' % (name, entry.cu_ofs, entry.die_ofs))
+            print('-' * 66)
+
+if __name__ == '__main__':
+    if sys.argv[1] == '--test':
+        process_file(sys.argv[2])
+        sys.exit(0)
+
+    if len(sys.argv) < 2:
+        print('Expected usage: {0} <executable>'.format(sys.argv[0]))
+        sys.exit(1)
+    process_file(sys.argv[1])
diff --git a/examples/reference_output/dwarf_pubnames_types.out b/examples/reference_output/dwarf_pubnames_types.out
new file mode 100644 (file)
index 0000000..3ed3d26
--- /dev/null
@@ -0,0 +1,17 @@
+Processing file: ./examples/sample_exe64.elf
+5 entries found in .debug_pubnames
+Trying pubnames example ...
+main: cu_ofs = 258, die_ofs = 303
+Fetching the actual die for main ...
+Die Name: main
+Dumping .debug_pubnames table ...
+------------------------------------------------------------------
+                                            Symbol  CU_OFS DIE_OFS
+------------------------------------------------------------------
+                                    _IO_stdin_used     119     230
+                                              main     258     303
+                                              glob     258     395
+                                   __libc_csu_fini     418     495
+                                   __libc_csu_init     418     523
+------------------------------------------------------------------
+ERROR: No .debug_pubtypes section found in ELF
index 087218a129e9ae0ec0302fbed52782d2e7d5d8bd..dfcd98d77015d0a329d034c0b88a57392205a768 100755 (executable)
 import argparse
 import os, sys
 import string
+import itertools
+# Note: zip has different behaviour between Python 2.x and 3.x.
+# - Using izip ensures compatibility.
+try:
+    from itertools import izip
+except:
+    izip = zip
 
 # For running from development directory. It should take precedence over the
 # installed pyelftools.
@@ -457,7 +464,7 @@ class ReadElf(object):
                 for note in section.iter_notes():
                       self._emitline("\nDisplaying notes found in: {}".format(
                           section.name))
-                      self._emitline('  Owner                 Data size        Description')
+                      self._emitline('  Owner                 Data size Description')
                       self._emitline('  %s %s\t%s' % (
                           note['n_name'].ljust(20),
                           self._format_hex(note['n_descsz'], fieldsize=8),
@@ -753,6 +760,8 @@ class ReadElf(object):
             self._dump_debug_frames_interp()
         elif dump_what == 'aranges':
             self._dump_debug_aranges()
+        elif dump_what in { 'pubtypes', 'pubnames' }:
+            self._dump_debug_namelut(dump_what)
         else:
             self._emitline('debug dump not yet supported for "%s"' % dump_what)
 
@@ -1106,6 +1115,40 @@ class ReadElf(object):
                     self._dwarfinfo.debug_frame_sec,
                     self._dwarfinfo.CFI_entries())
 
+    def _dump_debug_namelut(self, what):
+        """
+        Dump the debug pubnames or pubtypes section, as selected by `what`.
+        """
+        if what == 'pubnames':
+            namelut = self._dwarfinfo.get_pubnames()
+            section = self._dwarfinfo.debug_pubnames_sec
+        else:
+            namelut = self._dwarfinfo.get_pubtypes()
+            section = self._dwarfinfo.debug_pubtypes_sec
+
+        # readelf prints nothing if the section is not present.
+        if namelut is None or len(namelut) == 0:    
+            return
+        
+        self._emitline('Contents of the %s section:' % section.name)
+        self._emitline()
+        
+        cu_headers = namelut.get_cu_headers()
+
+        # go over CU-by-CU first and item-by-item next.
+        for (cu_hdr, (cu_ofs, items)) in izip(cu_headers, itertools.groupby(
+            namelut.items(), key = lambda x: x[1].cu_ofs)):
+
+            self._emitline('  Length:                              %d'   % cu_hdr.unit_length)
+            self._emitline('  Version:                             %d'   % cu_hdr.version)
+            self._emitline('  Offset into .debug_info section:     0x%x' % cu_hdr.debug_info_offset)
+            self._emitline('  Size of area in .debug_info section: %d'   % cu_hdr.debug_info_length) 
+            self._emitline()
+            self._emitline('    Offset  Name')
+            for item in items:
+                self._emitline('    %x          %s' % (item[1].die_ofs - cu_ofs, item[0]))
+        self._emitline()
+
     def _dump_debug_aranges(self):
         """ Dump the aranges table
         """
index 7f4631b646e7804db9a830958f94b0bf1cb630be..76ecd72dcf0a64de43ebc3718b74126c5adf6782 100755 (executable)
@@ -58,7 +58,9 @@ def run_test_on_file(filename, verbose=False):
             '-e', '-d', '-s', '-n', '-r', '-x.text', '-p.shstrtab', '-V',
             '--debug-dump=info', '--debug-dump=decodedline',
             '--debug-dump=frames', '--debug-dump=frames-interp',
-            '--debug-dump=aranges']:
+            '--debug-dump=aranges', '--debug-dump=pubtypes',
+            '--debug-dump=pubnames'
+            ]:
         if verbose: testlog.info("..option='%s'" % option)
 
         # TODO(zlobober): this is a dirty hack to make tests work for ELF core