From: Milton D. Miller II <miltonm@us.ibm.com>
Date: Wed, 22 Apr 2020 12:57:32 +0000 (+0000)
Subject: Cached random access to CUs and DIEs (#264)
X-Git-Tag: v0.27~31
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=519a234fc1316ac0e9e5628b34308f0d47995f71;p=pyelftools.git

Cached random access to CUs and DIEs (#264)

* dwarf/compileunit: Lookup DIE from a reference

Accept a resolved reference address for a DIE in a compile unit and
parse the DIE at that location.  Insert into the _diemap / _dielist
cache shared with iter_DIE_children() for fast repeated lookups.

This can be used to follow attribute references to a DIE that be
referenced several times (eg for a DW_AT_type reference) or find
a DIE referenced in a lookup table.

* dwarf/dwarfinfo: Cache CUs, direct parse or search from known units

Maintain a cache of compile units parsed and a map of their offsets
similar to the one mainained of DIEs by compile units.

Add the ability to parse a random compile unit when the offset of
the compile unit header is known.

Add the ability to search for a compile unit containing (spanning)
a given refaddr, such as that obtained from a DIE reference class
attribute, starting from the closest previous cached compile unit.

* dwarf/die: search for parents on demand

Add a function to set the _parent link of known chldren, iterating
down each parent of a target DIE.  Walk all children of a given
parent and set each child's ._parent to avoid O(n^2) walking.

A future commit will add other methods to instatiate a DIE that will
not set the _parent link as the DIE is instantiated.

This walk uses the knowledge that in a flattened tree a parents offset
will always be less than the childs.

The call to die.set_parent in compile_unit iter_DIE_children could be
removed to make the method private,, but it is free to set starting
from the top DIE.  Alternativly make it an optional argument to
DIE creation.

* dwarf/dwarfinfo: APIs to lookup DIEs

Add APIs to lookup a DIE from: (a) a DIE reference class attribute
taking into account the attribute form, (b) from a lookup table entry
(NameLUTEntry) from a .pub_types or .pub_names section, or (c) directly
from a reference addresss (.debug_info offset) regardless of how it
was obtained.

Add a test that will lookup dies from pubnames and follow die by ref.

	This is a simple test that exercises the new cache lookup
	methods and provides a starting point on how to determine a
	variables type.

For now raise NotImplemented exception for type signature lookup
and supplemental dwarf object files.
---

diff --git a/elftools/dwarf/compileunit.py b/elftools/dwarf/compileunit.py
index cabeb8c..eb66c57 100644
--- a/elftools/dwarf/compileunit.py
+++ b/elftools/dwarf/compileunit.py
@@ -6,8 +6,9 @@
 # Eli Bendersky (eliben@gmail.com)
 # This code is in the public domain
 #-------------------------------------------------------------------------------
-from bisect import bisect_left
+from bisect import bisect_right
 from .die import DIE
+from ..common.utils import dwarf_assert
 
 
 class CompileUnit(object):
@@ -98,6 +99,28 @@ class CompileUnit(object):
 
         return top
 
+    @property
+    def size(self):
+        return self['unit_length'] + self.structs.initial_length_field_size()
+
+    def get_DIE_from_refaddr(self, refaddr):
+        """ Obtain a DIE contained in this CU from a reference.
+
+            refaddr:
+                The offset into the .debug_info section, which must be
+                contained in this CU or a DWARFError will be raised.
+
+            When using a reference class attribute with a form that is
+            relative to the compile unit, add unit add the compile unit's
+            .cu_addr before calling this function.
+        """
+        # All DIEs are after the cu header and within the unit
+        dwarf_assert(
+            self.cu_die_offset <= refaddr < self.cu_offset + self.size,
+            'refaddr %s not in DIE range of CU %s' % (refaddr, self.cu_offset))
+
+        return self._get_cached_DIE(refaddr)
+
     def iter_DIEs(self):
         """ Iterate over all the DIEs in the CU, in order of their appearance.
             Note that null DIEs will also be returned.
@@ -113,25 +136,12 @@ class CompileUnit(object):
         if not die.has_children:
             return
 
-        # `cur_offset` tracks the offset past our current DIE as we iterate
-        # over children, providing the pivot as we bisect `self._diemap`
-        # and ensuring that we insert our children (and child offsets)
-        # in the correct order within both `self._dielist` and `self._diemap`.
+        # `cur_offset` tracks the stream offset of the next DIE to yield
+        # as we iterate over our children,
         cur_offset = die.offset + die.size
 
         while True:
-            i = bisect_left(self._diemap, cur_offset)
-            # Note that `self._diemap` cannot be empty because a `die`, the argument,
-            # is already parsed.
-            if i < len(self._diemap) and cur_offset == self._diemap[i]:
-                child = self._dielist[i]
-            else:
-                child = DIE(
-                        cu=self,
-                        stream=die.stream,
-                        offset=cur_offset)
-                self._dielist.insert(i, child)
-                self._diemap.insert(i, cur_offset)
+            child = self._get_cached_DIE(cur_offset)
 
             child.set_parent(die)
 
@@ -179,3 +189,38 @@ class CompileUnit(object):
                 for d in self._iter_DIE_subtree(c):
                     yield d
             yield die._terminator
+
+    def _get_cached_DIE(self, offset):
+        """ Given a DIE offset, look it up in the cache.  If not present,
+            parse the DIE and insert it into the cache.
+
+            offset:
+                The offset of the DIE in the debug_info section to retrieve.
+
+            The stream reference is copied from the top DIE.  The top die will
+            also be parsed and cached if needed.
+
+            See also get_DIE_from_refaddr(self, refaddr).
+        """
+        # The top die must be in the cache if any DIE is in the cache.
+        # The stream is the same for all DIEs in this CU, so populate
+        # the top DIE and obtain a reference to its stream.
+        top_die_stream = self.get_top_DIE().stream
+
+        # `offset` is the offset in the stream of the DIE we want to return.
+        # The map is maintined as a parallel array to the list.  We call
+        # bisect each time to ensure new DIEs are inserted in the correct
+        # order within both `self._dielist` and `self._diemap`.
+        i = bisect_right(self._diemap, offset)
+
+        # Note that `self._diemap` cannot be empty because a the top DIE
+        # was inserted by the call to .get_top_DIE().  Also it has the minimal
+        # offset, so the bisect_right insert point will always be at least 1.
+        if offset == self._diemap[i - 1]:
+            die = self._dielist[i - 1]
+        else:
+            die = DIE(cu=self, stream=top_die_stream, offset=offset)
+            self._dielist.insert(i, die)
+            self._diemap.insert(i, offset)
+
+        return die
diff --git a/elftools/dwarf/die.py b/elftools/dwarf/die.py
index 1835fd8..455a2e3 100755
--- a/elftools/dwarf/die.py
+++ b/elftools/dwarf/die.py
@@ -98,10 +98,34 @@ class DIE(object):
         """
         return self.tag is None
 
+    def get_DIE_from_attribute(self, name):
+        """ Return the DIE referenced by the named attribute of this DIE.
+            The attribute must be in the reference attribute class.
+
+            name:
+                The name of the attribute in the reference class.
+        """
+        attr = self.attributes[name]
+        if attr.form in ('DW_FORM_ref1', 'DW_FORM_ref2', 'DW_FORM_ref4',
+                         'DW_FORM_ref8', 'DW_FORM_ref'):
+            refaddr = self.cu.cu_offset + attr.raw_value
+            return self.cu.get_DIE_from_refaddr(refaddr)
+        elif attr.form in ('DW_FORM_refaddr'):
+            return self.cu.dwarfinfo.get_DIE_from_refaddr(attr.raw_value)
+        elif attr.form in ('DW_FORM_ref_sig8'):
+            # Implement search type units for matching signature
+            raise NotImplementedError('%s (type unit by signature)' % attr.form)
+        elif attr.form in ('DW_FORM_ref_sup4', 'DW_FORM_ref_sup8'):
+            raise NotImplementedError('%s to dwo' % attr.form)
+        else:
+            raise DWARFError('%s is not a reference class form attribute' % attr)
+
     def get_parent(self):
-        """ The parent DIE of this DIE. None if the DIE has no parent (i.e. a
-            top-level DIE).
+        """ Return the parent DIE of this DIE, or None if the DIE has no
+            parent (i.e. is a top-level DIE).
         """
+        if self._parent is None:
+            self._search_ancestor_offspring()
         return self._parent
 
     def get_full_path(self):
@@ -126,8 +150,9 @@ class DIE(object):
     def iter_siblings(self):
         """ Yield all siblings of this DIE
         """
-        if self._parent:
-            for sibling in self._parent.iter_children():
+        parent = self.get_parent()
+        if parent:
+            for sibling in parent.iter_children():
                 if sibling is not self:
                     yield sibling
         else:
@@ -142,6 +167,42 @@ class DIE(object):
 
     #------ PRIVATE ------#
 
+    def _search_ancestor_offspring(self):
+        """ Search our ancestors identifying their offspring to find our parent.
+
+            DIEs are stored as a flattened tree.  The top DIE is the ancestor
+            of all DIEs in the unit.  Each parent is guaranteed to be at
+            an offset less than their children.  In each generation of children
+            the sibling with the closest offset not greater than our offset is
+            our ancestor.
+        """
+        # This code is called when get_parent notices that the _parent has
+        # not been identified.  To avoid execution for each sibling record all
+        # the children of any parent iterated.  Assuming get_parent will also be
+        # called for siblings, it is more efficient if siblings references are
+        # provided and no worse than a single walk if they are missing, while
+        # stopping iteration early could result in O(n^2) walks.
+        search = self.cu.get_top_DIE()
+        while search.offset < self.offset:
+            prev = search
+            for child in search.iter_children():
+                child.set_parent(search)
+                if child.offset <= self.offset:
+                    prev = child
+
+            # We also need to check the offset of the terminator DIE
+            if search.has_children and search._terminator.offset <= self.offset:
+                    prev = search._terminator
+
+            # If we didn't find a closer parent, give up, don't loop.
+            # Either we mis-parsed an ancestor or someone created a DIE
+            # by an offset that was not actually the start of a DIE.
+            if prev is search:
+                raise ValueError("offset %s not in CU %s DIE tree" %
+                    (self.offset, self.cu.cu_offset))
+
+            search = prev
+
     def __repr__(self):
         s = 'DIE %s, size=%s, has_children=%s\n' % (
             self.tag, self.size, self.has_children)
diff --git a/elftools/dwarf/dwarfinfo.py b/elftools/dwarf/dwarfinfo.py
index 70ebe04..896c292 100644
--- a/elftools/dwarf/dwarfinfo.py
+++ b/elftools/dwarf/dwarfinfo.py
@@ -7,6 +7,7 @@
 # This code is in the public domain
 #-------------------------------------------------------------------------------
 from collections import namedtuple
+from bisect import bisect_right
 
 from ..common.exceptions import DWARFError
 from ..common.utils import (struct_parse, dwarf_assert,
@@ -103,6 +104,11 @@ class DWARFInfo(object):
         # Cache for abbrev tables: a dict keyed by offset
         self._abbrevtable_cache = {}
 
+        # Cache of compile units and map of their offsets for bisect lookup.
+        # Access with .iter_CUs(), .get_CU_containing(), and/or .get_CU_at().
+        self._cu_cache = []
+        self._cu_offsets_map = []
+
     @property
     def has_debug_info(self):
         """ Return whether this contains debug information.
@@ -112,6 +118,84 @@ class DWARFInfo(object):
         """
         return bool(self.debug_info_sec)
 
+    def get_DIE_from_lut_entry(self, lut_entry):
+        """ Get the DIE from the pubnames or putbtypes lookup table entry.
+
+            lut_entry:
+                A NameLUTEntry object from a NameLUT instance (see
+                .get_pubmames and .get_pubtypes methods).
+        """
+        cu = self.get_CU_at(lut_entry.cu_ofs)
+        return self.get_DIE_from_refaddr(lut_entry.die_ofs, cu)
+
+    def get_DIE_from_refaddr(self, refaddr, cu=None):
+        """ Given a .debug_info section offset of a DIE, return the DIE.
+
+            refaddr:
+                The refaddr may come from a DW_FORM_ref_addr attribute.
+
+            cu:
+                The compile unit object, if known.  If None a search
+                from the closest offset less than refaddr will be performed.
+        """
+        if cu is None:
+            cu = self.get_CU_containing(refaddr)
+        return cu.get_DIE_from_refaddr(refaddr)
+
+    def get_CU_containing(self, refaddr):
+        """ Find the CU that includes the given reference address in the
+            .debug_info section.
+
+            refaddr:
+                Either a refaddr of a DIE (possibly from a DW_FORM_refaddr
+                attribute) or the section offset of a CU (possibly from an
+                aranges table).
+
+           This function will parse and cache CUs until the search criteria
+           is met, starting from the closest known offset lessthan or equal
+           to the given address.
+        """
+        dwarf_assert(
+            self.has_debug_info,
+            'CU lookup but no debug info section')
+        dwarf_assert(
+            0 <= refaddr < self.debug_info_sec.size,
+            "refaddr %s beyond .debug_info size" % refaddr)
+
+        # The CU containing the DIE we desire will be to the right of the
+        # DIE insert point.  If we have a CU address, then it will be a
+        # match but the right insert minus one will still be the item.
+        # The first CU starts at offset 0, so start there if cache is empty.
+        i = bisect_right(self._cu_offsets_map, refaddr)
+        start = self._cu_offsets_map[i - 1] if i > 0 else 0
+
+        # parse CUs until we find one containing the desired address
+        for cu in self._parse_CUs_iter(start):
+            if cu.cu_offset <= refaddr < cu.cu_offset + cu.size:
+                return cu
+
+        raise ValueError("CU for reference address %s not found" % refaddr)
+
+    def get_CU_at(self, offset):
+        """ Given a CU header offset, return the parsed CU.
+
+            offset:
+                The offset may be from an accelerated access table such as
+                the public names, public types, address range table, or
+                prior use.
+
+            This function will directly parse the CU doing no validation of
+            the offset beyond checking the size of the .debug_info section.
+        """
+        dwarf_assert(
+            self.has_debug_info,
+            'CU lookup but no debug info section')
+        dwarf_assert(
+            0 <= offset < self.debug_info_sec.size,
+            "offset %s beyond .debug_info size" % offset)
+
+        return self._cached_CU_at_offset(offset)
+
     def iter_CUs(self):
         """ Yield all the compile units (CompileUnit objects) in the debug info
         """
@@ -253,15 +337,20 @@ class DWARFInfo(object):
 
     #------ PRIVATE ------#
 
-    def _parse_CUs_iter(self):
-        """ Parse CU entries from debug_info. Yield CUs in order of appearance.
+    def _parse_CUs_iter(self, offset=0):
+        """ Iterate CU objects in order of appearance in the debug_info section.
+
+            offset:
+                The offset of the first CU to yield.  Additional iterations
+                will return the sequential unit objects.
+
+            See .iter_CUs(), .get_CU_containing(), and .get_CU_at().
         """
         if self.debug_info_sec is None:
             return
 
-        offset = 0
         while offset < self.debug_info_sec.size:
-            cu = self._parse_CU_at_offset(offset)
+            cu = self._cached_CU_at_offset(offset)
             # Compute the offset of the next CU in the section. The unit_length
             # field of the CU header contains its size not including the length
             # field itself.
@@ -270,6 +359,32 @@ class DWARFInfo(object):
                         cu.structs.initial_length_field_size())
             yield cu
 
+    def _cached_CU_at_offset(self, offset):
+        """ Return the CU with unit header at the given offset into the
+            debug_info section from the cache.  If not present, the unit is
+            header is parsed and the object is installed in the cache.
+
+            offset:
+                The offset of the unit header in the .debug_info section
+                to of the unit to fetch from the cache.
+
+            See get_CU_at().
+        """
+        # Find the insert point for the requested offset.  With bisect_right,
+        # if this entry is present in the cache it will be the prior entry.
+        i = bisect_right(self._cu_offsets_map, offset)
+        if i >= 1 and offset == self._cu_offsets_map[i - 1]:
+            return self._cu_cache[i - 1]
+
+        # Parse the CU and insert the offset and object into the cache.
+        # The ._cu_offsets_map[] contains just the numeric offsets for the
+        # bisect_right search while the parallel indexed ._cu_cache[] holds
+        # the object references.
+        cu = self._parse_CU_at_offset(offset)
+        self._cu_offsets_map.insert(i, offset)
+        self._cu_cache.insert(i, cu)
+        return cu
+
     def _parse_CU_at_offset(self, offset):
         """ Parse and return a CU at the given offset in the debug_info stream.
         """
diff --git a/test/test_dwarf_cu_and_die_cache.py b/test/test_dwarf_cu_and_die_cache.py
new file mode 100644
index 0000000..bf7f4d7
--- /dev/null
+++ b/test/test_dwarf_cu_and_die_cache.py
@@ -0,0 +1,58 @@
+#-------------------------------------------------------------------------------
+# elftools tests
+#
+# Eli Bendersky (eliben@gmail.com), Milton Miller <miltonm@us.ibm.com>
+# This code is in the public domain
+#-------------------------------------------------------------------------------
+import os
+import unittest
+
+from elftools.elf.elffile import ELFFile
+from elftools.common.py3compat import bytes2str
+
+class TestCacheLUTandDIEref(unittest.TestCase):
+    def dprint(self, list):
+        if False:
+            self.oprint(list)
+
+    def oprint(self, list):
+        if False:
+            print(list)
+
+    def test_die_from_LUTentry(self):
+        lines = ['']
+        with open(os.path.join('test', 'testfiles_for_unittests',
+                               'lambda.elf'), 'rb') as f:
+            elffile = ELFFile(f)
+            self.assertTrue(elffile.has_dwarf_info())
+
+            dwarf = elffile.get_dwarf_info()
+            pt = dwarf.get_pubnames()
+            for (k, v) in pt.items():
+                ndie = dwarf.get_DIE_from_lut_entry(v)
+                self.dprint(ndie)
+                if not 'DW_AT_type' in ndie.attributes:
+                    continue
+                if not 'DW_AT_name' in ndie.attributes:
+                    continue
+                name = bytes2str(ndie.attributes['DW_AT_name'].value)
+                tlist = []
+                tdie = ndie
+                while True:
+                    tdie = tdie.get_DIE_from_attribute('DW_AT_type')
+                    self.dprint(ndie)
+                    ttag = tdie.tag
+                    if isinstance(ttag, int):
+                        ttag = 'TAG(0x%x)' % ttag
+                    tlist.append(ttag)
+                    if 'DW_AT_name' in tdie.attributes:
+                        break
+                tlist.append(bytes2str(tdie.attributes['DW_AT_name'].value))
+                tname = ' '.join(tlist)
+                line = "%s DIE at %s is of type %s" % (
+                        ndie.tag, ndie.offset, tname)
+                lines.append(line)
+                self.dprint(line)
+
+        self.oprint('\n'.join(lines))
+        self.assertGreater(len(lines), 1)