Lazy DIE parsing (#249)
authorWilliam Woodruff <william@trailofbits.com>
Fri, 8 Nov 2019 03:07:28 +0000 (22:07 -0500)
committerEli Bendersky <eliben@users.noreply.github.com>
Fri, 8 Nov 2019 03:07:28 +0000 (19:07 -0800)
Supersedes/closes #216.

elftools/dwarf/compileunit.py
elftools/dwarf/die.py

index 8b4030f4f99541bb5554b63944511adcdf07cf7e..cabeb8c6f06911a9c296b1827b9bf00f068e1c1c 100644 (file)
@@ -6,6 +6,7 @@
 # Eli Bendersky (eliben@gmail.com)
 # This code is in the public domain
 #-------------------------------------------------------------------------------
+from bisect import bisect_left
 from .die import DIE
 
 
@@ -53,8 +54,16 @@ class CompileUnit(object):
         # requested.
         self._abbrev_table = None
 
-        # A list of DIEs belonging to this CU. Lazily parsed.
+        # A list of DIEs belonging to this CU.
+        # This list is lazily constructed as DIEs are iterated over.
         self._dielist = []
+        # A list of file offsets, corresponding (by index) to the DIEs
+        # in `self._dielist`. This list exists separately from
+        # `self._dielist` to make it binary searchable, enabling the
+        # DIE population strategy used in `iter_DIE_children`.
+        # Like `self._dielist`, this list is lazily constructed
+        # as DIEs are iterated over.
+        self._diemap = []
 
     def dwarf_format(self):
         """ Get the DWARF format (32 or 64) for this CU
@@ -73,14 +82,85 @@ class CompileUnit(object):
         """ Get the top DIE (which is either a DW_TAG_compile_unit or
             DW_TAG_partial_unit) of this CU
         """
-        return self._get_DIE(0)
+
+        # Note that a top DIE always has minimal offset and is therefore
+        # at the beginning of our lists, so no bisect is required.
+        if len(self._diemap) > 0:
+            return self._dielist[0]
+
+        top = DIE(
+                cu=self,
+                stream=self.dwarfinfo.debug_info_sec.stream,
+                offset=self.cu_die_offset)
+
+        self._dielist.insert(0, top)
+        self._diemap.insert(0, self.cu_die_offset)
+
+        return top
 
     def iter_DIEs(self):
         """ Iterate over all the DIEs in the CU, in order of their appearance.
             Note that null DIEs will also be returned.
         """
-        self._parse_DIEs()
-        return iter(self._dielist)
+        return self._iter_DIE_subtree(self.get_top_DIE())
+
+    def iter_DIE_children(self, die):
+        """ Given a DIE, yields either its children, without null DIE list
+            terminator, or nothing, if that DIE has no children.
+
+            The null DIE terminator is saved in that DIE when iteration ended.
+        """
+        if not die.has_children:
+            return
+
+        # `cur_offset` tracks the offset past our current DIE as we iterate
+        # over children, providing the pivot as we bisect `self._diemap`
+        # and ensuring that we insert our children (and child offsets)
+        # in the correct order within both `self._dielist` and `self._diemap`.
+        cur_offset = die.offset + die.size
+
+        while True:
+            i = bisect_left(self._diemap, cur_offset)
+            # Note that `self._diemap` cannot be empty because a `die`, the argument,
+            # is already parsed.
+            if i < len(self._diemap) and cur_offset == self._diemap[i]:
+                child = self._dielist[i]
+            else:
+                child = DIE(
+                        cu=self,
+                        stream=die.stream,
+                        offset=cur_offset)
+                self._dielist.insert(i, child)
+                self._diemap.insert(i, cur_offset)
+
+            child.set_parent(die)
+
+            if child.is_null():
+                die._terminator = child
+                return
+
+            yield child
+
+            if not child.has_children:
+                cur_offset += child.size
+            elif "DW_AT_sibling" in child.attributes:
+                sibling = child.attributes["DW_AT_sibling"]
+                cur_offset = sibling.value + self.cu_offset
+            else:
+                # If no DW_AT_sibling attribute is provided by the producer
+                # then the whole child subtree must be parsed to find its next
+                # sibling. There is one zero byte representing null DIE
+                # terminating children list. It is used to locate child subtree
+                # bounds.
+
+                # If children are not parsed yet, this instruction will manage
+                # to recursive call of this function which will result in
+                # setting of `_terminator` attribute of the `child`.
+                if child._terminator is None:
+                    for _ in self.iter_DIE_children(child):
+                        pass
+
+                cur_offset = child._terminator.offset + child._terminator.size
 
     #------ PRIVATE ------#
 
@@ -89,64 +169,13 @@ class CompileUnit(object):
         """
         return self.header[name]
 
-    def _get_DIE(self, index):
-        """ Get the DIE at the given index
-        """
-        self._parse_DIEs()
-        return self._dielist[index]
-
-    def _parse_DIEs(self):
-        """ Parse all the DIEs pertaining to this CU from the stream and shove
-            them sequentially into self._dielist.
-            Also set the child/sibling/parent links in the DIEs according
-            (unflattening the prefix-order of the DIE tree).
+    def _iter_DIE_subtree(self, die):
+        """ Given a DIE, this yields it with its subtree including null DIEs
+            (child list terminators).
         """
-        if len(self._dielist) > 0:
-            return
-
-        # Compute the boundary (one byte past the bounds) of this CU in the
-        # stream
-        cu_boundary = ( self.cu_offset +
-                        self['unit_length'] +
-                        self.structs.initial_length_field_size())
-
-        # First pass: parse all DIEs and place them into self._dielist
-        die_offset = self.cu_die_offset
-        while die_offset < cu_boundary:
-            die = DIE(
-                    cu=self,
-                    stream=self.dwarfinfo.debug_info_sec.stream,
-                    offset=die_offset)
-            self._dielist.append(die)
-            die_offset += die.size
-
-        # Second pass - unflatten the DIE tree
-        self._unflatten_tree()
-
-    def _unflatten_tree(self):
-        """ "Unflatten" the DIE tree from it serial representation, by setting
-            the child/sibling/parent links of DIEs.
-
-            Assumes self._dielist was already populated by a linear list of DIEs
-            read from the stream section
-        """
-        # the first DIE in the list is the root node
-        root = self._dielist[0]
-        parentstack = [root]
-
-        for die in self._dielist[1:]:
-            if not die.is_null():
-                cur_parent = parentstack[-1]
-                # This DIE is a child of the current parent
-                cur_parent.add_child(die)
-                die.set_parent(cur_parent)
-                if die.has_children:
-                    parentstack.append(die)
-            else:
-                # parentstack should not be really empty here. However, some
-                # compilers generate DWARF that has extra NULLs in the end and
-                # we don't want pyelftools to fail parsing them just because of
-                # this.
-                if len(parentstack) > 0:
-                    # end of children for the current parent
-                    parentstack.pop()
+        yield die
+        if die.has_children:
+            for c in die.iter_children():
+                for d in self._iter_DIE_subtree(c):
+                    yield d
+            yield die._terminator
index ec50fb1d3d55e1d8b69459ca438b3d3c70917524..1835fd87312ce6a6fe6aad1146adcc6f2d54eeef 100755 (executable)
@@ -86,7 +86,9 @@ class DIE(object):
         self.has_children = None
         self.abbrev_code = None
         self.size = 0
-        self._children = []
+        # Null DIE terminator. It can be used to obtain offset range occupied
+        # by this DIE including its whole subtree.
+        self._terminator = None
         self._parent = None
 
         self._parse_DIE()
@@ -117,9 +119,9 @@ class DIE(object):
         return os.path.join(comp_dir, fname)
 
     def iter_children(self):
-        """ Yield all children of this DIE
+        """ Iterates all children of this DIE
         """
-        return iter(self._children)
+        return self.cu.iter_DIE_children(self)
 
     def iter_siblings(self):
         """ Yield all siblings of this DIE
@@ -134,8 +136,6 @@ class DIE(object):
     # The following methods are used while creating the DIE and should not be
     # interesting to consumers
     #
-    def add_child(self, die):
-        self._children.append(die)
 
     def set_parent(self, die):
         self._parent = die