From 519a234fc1316ac0e9e5628b34308f0d47995f71 Mon Sep 17 00:00:00 2001 From: "Milton D. Miller II" Date: Wed, 22 Apr 2020 12:57:32 +0000 Subject: [PATCH] Cached random access to CUs and DIEs (#264) * dwarf/compileunit: Lookup DIE from a reference Accept a resolved reference address for a DIE in a compile unit and parse the DIE at that location. Insert into the _diemap / _dielist cache shared with iter_DIE_children() for fast repeated lookups. This can be used to follow attribute references to a DIE that be referenced several times (eg for a DW_AT_type reference) or find a DIE referenced in a lookup table. * dwarf/dwarfinfo: Cache CUs, direct parse or search from known units Maintain a cache of compile units parsed and a map of their offsets similar to the one mainained of DIEs by compile units. Add the ability to parse a random compile unit when the offset of the compile unit header is known. Add the ability to search for a compile unit containing (spanning) a given refaddr, such as that obtained from a DIE reference class attribute, starting from the closest previous cached compile unit. * dwarf/die: search for parents on demand Add a function to set the _parent link of known chldren, iterating down each parent of a target DIE. Walk all children of a given parent and set each child's ._parent to avoid O(n^2) walking. A future commit will add other methods to instatiate a DIE that will not set the _parent link as the DIE is instantiated. This walk uses the knowledge that in a flattened tree a parents offset will always be less than the childs. The call to die.set_parent in compile_unit iter_DIE_children could be removed to make the method private,, but it is free to set starting from the top DIE. Alternativly make it an optional argument to DIE creation. * dwarf/dwarfinfo: APIs to lookup DIEs Add APIs to lookup a DIE from: (a) a DIE reference class attribute taking into account the attribute form, (b) from a lookup table entry (NameLUTEntry) from a .pub_types or .pub_names section, or (c) directly from a reference addresss (.debug_info offset) regardless of how it was obtained. Add a test that will lookup dies from pubnames and follow die by ref. This is a simple test that exercises the new cache lookup methods and provides a starting point on how to determine a variables type. For now raise NotImplemented exception for type signature lookup and supplemental dwarf object files. --- elftools/dwarf/compileunit.py | 79 ++++++++++++++---- elftools/dwarf/die.py | 69 +++++++++++++++- elftools/dwarf/dwarfinfo.py | 123 +++++++++++++++++++++++++++- test/test_dwarf_cu_and_die_cache.py | 58 +++++++++++++ 4 files changed, 304 insertions(+), 25 deletions(-) create mode 100644 test/test_dwarf_cu_and_die_cache.py diff --git a/elftools/dwarf/compileunit.py b/elftools/dwarf/compileunit.py index cabeb8c..eb66c57 100644 --- a/elftools/dwarf/compileunit.py +++ b/elftools/dwarf/compileunit.py @@ -6,8 +6,9 @@ # Eli Bendersky (eliben@gmail.com) # This code is in the public domain #------------------------------------------------------------------------------- -from bisect import bisect_left +from bisect import bisect_right from .die import DIE +from ..common.utils import dwarf_assert class CompileUnit(object): @@ -98,6 +99,28 @@ class CompileUnit(object): return top + @property + def size(self): + return self['unit_length'] + self.structs.initial_length_field_size() + + def get_DIE_from_refaddr(self, refaddr): + """ Obtain a DIE contained in this CU from a reference. + + refaddr: + The offset into the .debug_info section, which must be + contained in this CU or a DWARFError will be raised. + + When using a reference class attribute with a form that is + relative to the compile unit, add unit add the compile unit's + .cu_addr before calling this function. + """ + # All DIEs are after the cu header and within the unit + dwarf_assert( + self.cu_die_offset <= refaddr < self.cu_offset + self.size, + 'refaddr %s not in DIE range of CU %s' % (refaddr, self.cu_offset)) + + return self._get_cached_DIE(refaddr) + def iter_DIEs(self): """ Iterate over all the DIEs in the CU, in order of their appearance. Note that null DIEs will also be returned. @@ -113,25 +136,12 @@ class CompileUnit(object): if not die.has_children: return - # `cur_offset` tracks the offset past our current DIE as we iterate - # over children, providing the pivot as we bisect `self._diemap` - # and ensuring that we insert our children (and child offsets) - # in the correct order within both `self._dielist` and `self._diemap`. + # `cur_offset` tracks the stream offset of the next DIE to yield + # as we iterate over our children, cur_offset = die.offset + die.size while True: - i = bisect_left(self._diemap, cur_offset) - # Note that `self._diemap` cannot be empty because a `die`, the argument, - # is already parsed. - if i < len(self._diemap) and cur_offset == self._diemap[i]: - child = self._dielist[i] - else: - child = DIE( - cu=self, - stream=die.stream, - offset=cur_offset) - self._dielist.insert(i, child) - self._diemap.insert(i, cur_offset) + child = self._get_cached_DIE(cur_offset) child.set_parent(die) @@ -179,3 +189,38 @@ class CompileUnit(object): for d in self._iter_DIE_subtree(c): yield d yield die._terminator + + def _get_cached_DIE(self, offset): + """ Given a DIE offset, look it up in the cache. If not present, + parse the DIE and insert it into the cache. + + offset: + The offset of the DIE in the debug_info section to retrieve. + + The stream reference is copied from the top DIE. The top die will + also be parsed and cached if needed. + + See also get_DIE_from_refaddr(self, refaddr). + """ + # The top die must be in the cache if any DIE is in the cache. + # The stream is the same for all DIEs in this CU, so populate + # the top DIE and obtain a reference to its stream. + top_die_stream = self.get_top_DIE().stream + + # `offset` is the offset in the stream of the DIE we want to return. + # The map is maintined as a parallel array to the list. We call + # bisect each time to ensure new DIEs are inserted in the correct + # order within both `self._dielist` and `self._diemap`. + i = bisect_right(self._diemap, offset) + + # Note that `self._diemap` cannot be empty because a the top DIE + # was inserted by the call to .get_top_DIE(). Also it has the minimal + # offset, so the bisect_right insert point will always be at least 1. + if offset == self._diemap[i - 1]: + die = self._dielist[i - 1] + else: + die = DIE(cu=self, stream=top_die_stream, offset=offset) + self._dielist.insert(i, die) + self._diemap.insert(i, offset) + + return die diff --git a/elftools/dwarf/die.py b/elftools/dwarf/die.py index 1835fd8..455a2e3 100755 --- a/elftools/dwarf/die.py +++ b/elftools/dwarf/die.py @@ -98,10 +98,34 @@ class DIE(object): """ return self.tag is None + def get_DIE_from_attribute(self, name): + """ Return the DIE referenced by the named attribute of this DIE. + The attribute must be in the reference attribute class. + + name: + The name of the attribute in the reference class. + """ + attr = self.attributes[name] + if attr.form in ('DW_FORM_ref1', 'DW_FORM_ref2', 'DW_FORM_ref4', + 'DW_FORM_ref8', 'DW_FORM_ref'): + refaddr = self.cu.cu_offset + attr.raw_value + return self.cu.get_DIE_from_refaddr(refaddr) + elif attr.form in ('DW_FORM_refaddr'): + return self.cu.dwarfinfo.get_DIE_from_refaddr(attr.raw_value) + elif attr.form in ('DW_FORM_ref_sig8'): + # Implement search type units for matching signature + raise NotImplementedError('%s (type unit by signature)' % attr.form) + elif attr.form in ('DW_FORM_ref_sup4', 'DW_FORM_ref_sup8'): + raise NotImplementedError('%s to dwo' % attr.form) + else: + raise DWARFError('%s is not a reference class form attribute' % attr) + def get_parent(self): - """ The parent DIE of this DIE. None if the DIE has no parent (i.e. a - top-level DIE). + """ Return the parent DIE of this DIE, or None if the DIE has no + parent (i.e. is a top-level DIE). """ + if self._parent is None: + self._search_ancestor_offspring() return self._parent def get_full_path(self): @@ -126,8 +150,9 @@ class DIE(object): def iter_siblings(self): """ Yield all siblings of this DIE """ - if self._parent: - for sibling in self._parent.iter_children(): + parent = self.get_parent() + if parent: + for sibling in parent.iter_children(): if sibling is not self: yield sibling else: @@ -142,6 +167,42 @@ class DIE(object): #------ PRIVATE ------# + def _search_ancestor_offspring(self): + """ Search our ancestors identifying their offspring to find our parent. + + DIEs are stored as a flattened tree. The top DIE is the ancestor + of all DIEs in the unit. Each parent is guaranteed to be at + an offset less than their children. In each generation of children + the sibling with the closest offset not greater than our offset is + our ancestor. + """ + # This code is called when get_parent notices that the _parent has + # not been identified. To avoid execution for each sibling record all + # the children of any parent iterated. Assuming get_parent will also be + # called for siblings, it is more efficient if siblings references are + # provided and no worse than a single walk if they are missing, while + # stopping iteration early could result in O(n^2) walks. + search = self.cu.get_top_DIE() + while search.offset < self.offset: + prev = search + for child in search.iter_children(): + child.set_parent(search) + if child.offset <= self.offset: + prev = child + + # We also need to check the offset of the terminator DIE + if search.has_children and search._terminator.offset <= self.offset: + prev = search._terminator + + # If we didn't find a closer parent, give up, don't loop. + # Either we mis-parsed an ancestor or someone created a DIE + # by an offset that was not actually the start of a DIE. + if prev is search: + raise ValueError("offset %s not in CU %s DIE tree" % + (self.offset, self.cu.cu_offset)) + + search = prev + def __repr__(self): s = 'DIE %s, size=%s, has_children=%s\n' % ( self.tag, self.size, self.has_children) diff --git a/elftools/dwarf/dwarfinfo.py b/elftools/dwarf/dwarfinfo.py index 70ebe04..896c292 100644 --- a/elftools/dwarf/dwarfinfo.py +++ b/elftools/dwarf/dwarfinfo.py @@ -7,6 +7,7 @@ # This code is in the public domain #------------------------------------------------------------------------------- from collections import namedtuple +from bisect import bisect_right from ..common.exceptions import DWARFError from ..common.utils import (struct_parse, dwarf_assert, @@ -103,6 +104,11 @@ class DWARFInfo(object): # Cache for abbrev tables: a dict keyed by offset self._abbrevtable_cache = {} + # Cache of compile units and map of their offsets for bisect lookup. + # Access with .iter_CUs(), .get_CU_containing(), and/or .get_CU_at(). + self._cu_cache = [] + self._cu_offsets_map = [] + @property def has_debug_info(self): """ Return whether this contains debug information. @@ -112,6 +118,84 @@ class DWARFInfo(object): """ return bool(self.debug_info_sec) + def get_DIE_from_lut_entry(self, lut_entry): + """ Get the DIE from the pubnames or putbtypes lookup table entry. + + lut_entry: + A NameLUTEntry object from a NameLUT instance (see + .get_pubmames and .get_pubtypes methods). + """ + cu = self.get_CU_at(lut_entry.cu_ofs) + return self.get_DIE_from_refaddr(lut_entry.die_ofs, cu) + + def get_DIE_from_refaddr(self, refaddr, cu=None): + """ Given a .debug_info section offset of a DIE, return the DIE. + + refaddr: + The refaddr may come from a DW_FORM_ref_addr attribute. + + cu: + The compile unit object, if known. If None a search + from the closest offset less than refaddr will be performed. + """ + if cu is None: + cu = self.get_CU_containing(refaddr) + return cu.get_DIE_from_refaddr(refaddr) + + def get_CU_containing(self, refaddr): + """ Find the CU that includes the given reference address in the + .debug_info section. + + refaddr: + Either a refaddr of a DIE (possibly from a DW_FORM_refaddr + attribute) or the section offset of a CU (possibly from an + aranges table). + + This function will parse and cache CUs until the search criteria + is met, starting from the closest known offset lessthan or equal + to the given address. + """ + dwarf_assert( + self.has_debug_info, + 'CU lookup but no debug info section') + dwarf_assert( + 0 <= refaddr < self.debug_info_sec.size, + "refaddr %s beyond .debug_info size" % refaddr) + + # The CU containing the DIE we desire will be to the right of the + # DIE insert point. If we have a CU address, then it will be a + # match but the right insert minus one will still be the item. + # The first CU starts at offset 0, so start there if cache is empty. + i = bisect_right(self._cu_offsets_map, refaddr) + start = self._cu_offsets_map[i - 1] if i > 0 else 0 + + # parse CUs until we find one containing the desired address + for cu in self._parse_CUs_iter(start): + if cu.cu_offset <= refaddr < cu.cu_offset + cu.size: + return cu + + raise ValueError("CU for reference address %s not found" % refaddr) + + def get_CU_at(self, offset): + """ Given a CU header offset, return the parsed CU. + + offset: + The offset may be from an accelerated access table such as + the public names, public types, address range table, or + prior use. + + This function will directly parse the CU doing no validation of + the offset beyond checking the size of the .debug_info section. + """ + dwarf_assert( + self.has_debug_info, + 'CU lookup but no debug info section') + dwarf_assert( + 0 <= offset < self.debug_info_sec.size, + "offset %s beyond .debug_info size" % offset) + + return self._cached_CU_at_offset(offset) + def iter_CUs(self): """ Yield all the compile units (CompileUnit objects) in the debug info """ @@ -253,15 +337,20 @@ class DWARFInfo(object): #------ PRIVATE ------# - def _parse_CUs_iter(self): - """ Parse CU entries from debug_info. Yield CUs in order of appearance. + def _parse_CUs_iter(self, offset=0): + """ Iterate CU objects in order of appearance in the debug_info section. + + offset: + The offset of the first CU to yield. Additional iterations + will return the sequential unit objects. + + See .iter_CUs(), .get_CU_containing(), and .get_CU_at(). """ if self.debug_info_sec is None: return - offset = 0 while offset < self.debug_info_sec.size: - cu = self._parse_CU_at_offset(offset) + cu = self._cached_CU_at_offset(offset) # Compute the offset of the next CU in the section. The unit_length # field of the CU header contains its size not including the length # field itself. @@ -270,6 +359,32 @@ class DWARFInfo(object): cu.structs.initial_length_field_size()) yield cu + def _cached_CU_at_offset(self, offset): + """ Return the CU with unit header at the given offset into the + debug_info section from the cache. If not present, the unit is + header is parsed and the object is installed in the cache. + + offset: + The offset of the unit header in the .debug_info section + to of the unit to fetch from the cache. + + See get_CU_at(). + """ + # Find the insert point for the requested offset. With bisect_right, + # if this entry is present in the cache it will be the prior entry. + i = bisect_right(self._cu_offsets_map, offset) + if i >= 1 and offset == self._cu_offsets_map[i - 1]: + return self._cu_cache[i - 1] + + # Parse the CU and insert the offset and object into the cache. + # The ._cu_offsets_map[] contains just the numeric offsets for the + # bisect_right search while the parallel indexed ._cu_cache[] holds + # the object references. + cu = self._parse_CU_at_offset(offset) + self._cu_offsets_map.insert(i, offset) + self._cu_cache.insert(i, cu) + return cu + def _parse_CU_at_offset(self, offset): """ Parse and return a CU at the given offset in the debug_info stream. """ diff --git a/test/test_dwarf_cu_and_die_cache.py b/test/test_dwarf_cu_and_die_cache.py new file mode 100644 index 0000000..bf7f4d7 --- /dev/null +++ b/test/test_dwarf_cu_and_die_cache.py @@ -0,0 +1,58 @@ +#------------------------------------------------------------------------------- +# elftools tests +# +# Eli Bendersky (eliben@gmail.com), Milton Miller +# This code is in the public domain +#------------------------------------------------------------------------------- +import os +import unittest + +from elftools.elf.elffile import ELFFile +from elftools.common.py3compat import bytes2str + +class TestCacheLUTandDIEref(unittest.TestCase): + def dprint(self, list): + if False: + self.oprint(list) + + def oprint(self, list): + if False: + print(list) + + def test_die_from_LUTentry(self): + lines = [''] + with open(os.path.join('test', 'testfiles_for_unittests', + 'lambda.elf'), 'rb') as f: + elffile = ELFFile(f) + self.assertTrue(elffile.has_dwarf_info()) + + dwarf = elffile.get_dwarf_info() + pt = dwarf.get_pubnames() + for (k, v) in pt.items(): + ndie = dwarf.get_DIE_from_lut_entry(v) + self.dprint(ndie) + if not 'DW_AT_type' in ndie.attributes: + continue + if not 'DW_AT_name' in ndie.attributes: + continue + name = bytes2str(ndie.attributes['DW_AT_name'].value) + tlist = [] + tdie = ndie + while True: + tdie = tdie.get_DIE_from_attribute('DW_AT_type') + self.dprint(ndie) + ttag = tdie.tag + if isinstance(ttag, int): + ttag = 'TAG(0x%x)' % ttag + tlist.append(ttag) + if 'DW_AT_name' in tdie.attributes: + break + tlist.append(bytes2str(tdie.attributes['DW_AT_name'].value)) + tname = ' '.join(tlist) + line = "%s DIE at %s is of type %s" % ( + ndie.tag, ndie.offset, tname) + lines.append(line) + self.dprint(line) + + self.oprint('\n'.join(lines)) + self.assertGreater(len(lines), 1) -- 2.30.2