From 670079afe5a472aab74cec74acc15ae7c50cdb83 Mon Sep 17 00:00:00 2001 From: William Woodruff Date: Thu, 7 Nov 2019 22:07:28 -0500 Subject: [PATCH] Lazy DIE parsing (#249) Supersedes/closes #216. --- elftools/dwarf/compileunit.py | 157 ++++++++++++++++++++-------------- elftools/dwarf/die.py | 10 +-- 2 files changed, 98 insertions(+), 69 deletions(-) diff --git a/elftools/dwarf/compileunit.py b/elftools/dwarf/compileunit.py index 8b4030f..cabeb8c 100644 --- a/elftools/dwarf/compileunit.py +++ b/elftools/dwarf/compileunit.py @@ -6,6 +6,7 @@ # Eli Bendersky (eliben@gmail.com) # This code is in the public domain #------------------------------------------------------------------------------- +from bisect import bisect_left from .die import DIE @@ -53,8 +54,16 @@ class CompileUnit(object): # requested. self._abbrev_table = None - # A list of DIEs belonging to this CU. Lazily parsed. + # A list of DIEs belonging to this CU. + # This list is lazily constructed as DIEs are iterated over. self._dielist = [] + # A list of file offsets, corresponding (by index) to the DIEs + # in `self._dielist`. This list exists separately from + # `self._dielist` to make it binary searchable, enabling the + # DIE population strategy used in `iter_DIE_children`. + # Like `self._dielist`, this list is lazily constructed + # as DIEs are iterated over. + self._diemap = [] def dwarf_format(self): """ Get the DWARF format (32 or 64) for this CU @@ -73,14 +82,85 @@ class CompileUnit(object): """ Get the top DIE (which is either a DW_TAG_compile_unit or DW_TAG_partial_unit) of this CU """ - return self._get_DIE(0) + + # Note that a top DIE always has minimal offset and is therefore + # at the beginning of our lists, so no bisect is required. + if len(self._diemap) > 0: + return self._dielist[0] + + top = DIE( + cu=self, + stream=self.dwarfinfo.debug_info_sec.stream, + offset=self.cu_die_offset) + + self._dielist.insert(0, top) + self._diemap.insert(0, self.cu_die_offset) + + return top def iter_DIEs(self): """ Iterate over all the DIEs in the CU, in order of their appearance. Note that null DIEs will also be returned. """ - self._parse_DIEs() - return iter(self._dielist) + return self._iter_DIE_subtree(self.get_top_DIE()) + + def iter_DIE_children(self, die): + """ Given a DIE, yields either its children, without null DIE list + terminator, or nothing, if that DIE has no children. + + The null DIE terminator is saved in that DIE when iteration ended. + """ + if not die.has_children: + return + + # `cur_offset` tracks the offset past our current DIE as we iterate + # over children, providing the pivot as we bisect `self._diemap` + # and ensuring that we insert our children (and child offsets) + # in the correct order within both `self._dielist` and `self._diemap`. + cur_offset = die.offset + die.size + + while True: + i = bisect_left(self._diemap, cur_offset) + # Note that `self._diemap` cannot be empty because a `die`, the argument, + # is already parsed. + if i < len(self._diemap) and cur_offset == self._diemap[i]: + child = self._dielist[i] + else: + child = DIE( + cu=self, + stream=die.stream, + offset=cur_offset) + self._dielist.insert(i, child) + self._diemap.insert(i, cur_offset) + + child.set_parent(die) + + if child.is_null(): + die._terminator = child + return + + yield child + + if not child.has_children: + cur_offset += child.size + elif "DW_AT_sibling" in child.attributes: + sibling = child.attributes["DW_AT_sibling"] + cur_offset = sibling.value + self.cu_offset + else: + # If no DW_AT_sibling attribute is provided by the producer + # then the whole child subtree must be parsed to find its next + # sibling. There is one zero byte representing null DIE + # terminating children list. It is used to locate child subtree + # bounds. + + # If children are not parsed yet, this instruction will manage + # to recursive call of this function which will result in + # setting of `_terminator` attribute of the `child`. + if child._terminator is None: + for _ in self.iter_DIE_children(child): + pass + + cur_offset = child._terminator.offset + child._terminator.size #------ PRIVATE ------# @@ -89,64 +169,13 @@ class CompileUnit(object): """ return self.header[name] - def _get_DIE(self, index): - """ Get the DIE at the given index - """ - self._parse_DIEs() - return self._dielist[index] - - def _parse_DIEs(self): - """ Parse all the DIEs pertaining to this CU from the stream and shove - them sequentially into self._dielist. - Also set the child/sibling/parent links in the DIEs according - (unflattening the prefix-order of the DIE tree). + def _iter_DIE_subtree(self, die): + """ Given a DIE, this yields it with its subtree including null DIEs + (child list terminators). """ - if len(self._dielist) > 0: - return - - # Compute the boundary (one byte past the bounds) of this CU in the - # stream - cu_boundary = ( self.cu_offset + - self['unit_length'] + - self.structs.initial_length_field_size()) - - # First pass: parse all DIEs and place them into self._dielist - die_offset = self.cu_die_offset - while die_offset < cu_boundary: - die = DIE( - cu=self, - stream=self.dwarfinfo.debug_info_sec.stream, - offset=die_offset) - self._dielist.append(die) - die_offset += die.size - - # Second pass - unflatten the DIE tree - self._unflatten_tree() - - def _unflatten_tree(self): - """ "Unflatten" the DIE tree from it serial representation, by setting - the child/sibling/parent links of DIEs. - - Assumes self._dielist was already populated by a linear list of DIEs - read from the stream section - """ - # the first DIE in the list is the root node - root = self._dielist[0] - parentstack = [root] - - for die in self._dielist[1:]: - if not die.is_null(): - cur_parent = parentstack[-1] - # This DIE is a child of the current parent - cur_parent.add_child(die) - die.set_parent(cur_parent) - if die.has_children: - parentstack.append(die) - else: - # parentstack should not be really empty here. However, some - # compilers generate DWARF that has extra NULLs in the end and - # we don't want pyelftools to fail parsing them just because of - # this. - if len(parentstack) > 0: - # end of children for the current parent - parentstack.pop() + yield die + if die.has_children: + for c in die.iter_children(): + for d in self._iter_DIE_subtree(c): + yield d + yield die._terminator diff --git a/elftools/dwarf/die.py b/elftools/dwarf/die.py index ec50fb1..1835fd8 100755 --- a/elftools/dwarf/die.py +++ b/elftools/dwarf/die.py @@ -86,7 +86,9 @@ class DIE(object): self.has_children = None self.abbrev_code = None self.size = 0 - self._children = [] + # Null DIE terminator. It can be used to obtain offset range occupied + # by this DIE including its whole subtree. + self._terminator = None self._parent = None self._parse_DIE() @@ -117,9 +119,9 @@ class DIE(object): return os.path.join(comp_dir, fname) def iter_children(self): - """ Yield all children of this DIE + """ Iterates all children of this DIE """ - return iter(self._children) + return self.cu.iter_DIE_children(self) def iter_siblings(self): """ Yield all siblings of this DIE @@ -134,8 +136,6 @@ class DIE(object): # The following methods are used while creating the DIE and should not be # interesting to consumers # - def add_child(self, die): - self._children.append(die) def set_parent(self, die): self._parent = die -- 2.30.2