Fix for mixed version loclists, tests (#521)
[pyelftools.git] / elftools / dwarf / dwarfinfo.py
index 330a23879c4972ba94522aac6fb66bb5fc81921e..406146286902fdc221fb5444d344d2eb78a7a3b9 100644 (file)
@@ -6,8 +6,11 @@
 # Eli Bendersky (eliben@gmail.com)
 # This code is in the public domain
 #-------------------------------------------------------------------------------
+import os
 from collections import namedtuple
+from bisect import bisect_right
 
+from ..construct.lib.container import Container
 from ..common.exceptions import DWARFError
 from ..common.utils import (struct_parse, dwarf_assert,
                             parse_cstring_from_stream)
@@ -16,9 +19,11 @@ from .compileunit import CompileUnit
 from .abbrevtable import AbbrevTable
 from .lineprogram import LineProgram
 from .callframe import CallFrameInfo
-from .locationlists import LocationLists
-from .ranges import RangeLists
+from .locationlists import LocationLists, LocationListsPair
+from .ranges import RangeLists, RangeListsPair
 from .aranges import ARanges
+from .namelut import NameLUT
+from .dwarf_util import _get_base_offset
 
 
 # Describes a debug section
@@ -27,12 +32,14 @@ from .aranges import ARanges
 # name: section name in the container file
 # global_offset: the global offset of the section in its container file
 # size: the size of the section's data, in bytes
+# address: the virtual address for the section's data
 #
 # 'name' and 'global_offset' are for descriptional purposes only and
-# aren't strictly required for the DWARF parsing to work.
+# aren't strictly required for the DWARF parsing to work. 'address' is required
+# to properly decode the special '.eh_frame' format.
 #
 DebugSectionDescriptor = namedtuple('DebugSectionDescriptor',
-    'stream name global_offset size')
+    'stream name global_offset size address')
 
 
 # Some configuration parameters for the DWARF reader. This exists to allow
@@ -53,7 +60,7 @@ DwarfConfig = namedtuple('DwarfConfig',
 
 class DWARFInfo(object):
     """ Acts also as a "context" to other major objects, bridging between
-        various parts of the debug infromation.
+        various parts of the debug information.
     """
     def __init__(self,
             config,
@@ -65,7 +72,17 @@ class DWARFInfo(object):
             debug_str_sec,
             debug_loc_sec,
             debug_ranges_sec,
-            debug_line_sec):
+            debug_line_sec,
+            debug_pubtypes_sec,
+            debug_pubnames_sec,
+            debug_addr_sec,
+            debug_str_offsets_sec,
+            debug_line_str_sec,
+            debug_loclists_sec,
+            debug_rnglists_sec,
+            debug_sup_sec,
+            gnu_debugaltlink_sec
+            ):
         """ config:
                 A DwarfConfig object
 
@@ -84,6 +101,20 @@ class DWARFInfo(object):
         self.debug_loc_sec = debug_loc_sec
         self.debug_ranges_sec = debug_ranges_sec
         self.debug_line_sec = debug_line_sec
+        self.debug_addr_sec = debug_addr_sec
+        self.debug_str_offsets_sec = debug_str_offsets_sec
+        self.debug_line_str_sec = debug_line_str_sec
+        self.debug_pubtypes_sec = debug_pubtypes_sec
+        self.debug_pubnames_sec = debug_pubnames_sec
+        self.debug_loclists_sec = debug_loclists_sec
+        self.debug_rnglists_sec = debug_rnglists_sec
+        self.debug_sup_sec = debug_sup_sec
+        self.gnu_debugaltlink_sec = gnu_debugaltlink_sec
+
+        # Sets the supplementary_dwarfinfo to None. Client code can set this
+        # to something else, typically a DWARFInfo file read from an ELFFile
+        # which path is stored in the debug_sup_sec or gnu_debugaltlink_sec.
+        self.supplementary_dwarfinfo = None
 
         # This is the DWARFStructs the context uses, so it doesn't depend on
         # DWARF format and address_size (these are determined per CU) - set them
@@ -95,6 +126,100 @@ class DWARFInfo(object):
 
         # Cache for abbrev tables: a dict keyed by offset
         self._abbrevtable_cache = {}
+        # Cache for program lines tables: a dict keyed by offset
+        self._linetable_cache = {}
+        # Cache of compile units and map of their offsets for bisect lookup.
+        # Access with .iter_CUs(), .get_CU_containing(), and/or .get_CU_at().
+        self._cu_cache = []
+        self._cu_offsets_map = []
+
+    @property
+    def has_debug_info(self):
+        """ Return whether this contains debug information.
+
+        It can be not the case when the ELF only contains .eh_frame, which is
+        encoded DWARF but not actually for debugging.
+        """
+        return bool(self.debug_info_sec)
+
+    def get_DIE_from_lut_entry(self, lut_entry):
+        """ Get the DIE from the pubnames or putbtypes lookup table entry.
+
+            lut_entry:
+                A NameLUTEntry object from a NameLUT instance (see
+                .get_pubmames and .get_pubtypes methods).
+        """
+        cu = self.get_CU_at(lut_entry.cu_ofs)
+        return self.get_DIE_from_refaddr(lut_entry.die_ofs, cu)
+
+    def get_DIE_from_refaddr(self, refaddr, cu=None):
+        """ Given a .debug_info section offset of a DIE, return the DIE.
+
+            refaddr:
+                The refaddr may come from a DW_FORM_ref_addr attribute.
+
+            cu:
+                The compile unit object, if known.  If None a search
+                from the closest offset less than refaddr will be performed.
+        """
+        if cu is None:
+            cu = self.get_CU_containing(refaddr)
+        return cu.get_DIE_from_refaddr(refaddr)
+
+    def get_CU_containing(self, refaddr):
+        """ Find the CU that includes the given reference address in the
+            .debug_info section.
+
+            refaddr:
+                Either a refaddr of a DIE (possibly from a DW_FORM_ref_addr
+                attribute) or the section offset of a CU (possibly from an
+                aranges table).
+
+           This function will parse and cache CUs until the search criteria
+           is met, starting from the closest known offset lessthan or equal
+           to the given address.
+        """
+        dwarf_assert(
+            self.has_debug_info,
+            'CU lookup but no debug info section')
+        dwarf_assert(
+            0 <= refaddr < self.debug_info_sec.size,
+            "refaddr %s beyond .debug_info size" % refaddr)
+
+        # The CU containing the DIE we desire will be to the right of the
+        # DIE insert point.  If we have a CU address, then it will be a
+        # match but the right insert minus one will still be the item.
+        # The first CU starts at offset 0, so start there if cache is empty.
+        i = bisect_right(self._cu_offsets_map, refaddr)
+        start = self._cu_offsets_map[i - 1] if i > 0 else 0
+
+        # parse CUs until we find one containing the desired address
+        for cu in self._parse_CUs_iter(start):
+            if cu.cu_offset <= refaddr < cu.cu_offset + cu.size:
+                return cu
+
+        raise ValueError("CU for reference address %s not found" % refaddr)
+
+    def get_CU_at(self, offset):
+        """ Given a CU header offset, return the parsed CU.
+
+            offset:
+                The offset may be from an accelerated access table such as
+                the public names, public types, address range table, or
+                prior use.
+
+            This function will directly parse the CU doing no validation of
+            the offset beyond checking the size of the .debug_info section.
+        """
+        dwarf_assert(
+            self.has_debug_info,
+            'CU lookup but no debug info section')
+        dwarf_assert(
+            0 <= offset < self.debug_info_sec.size,
+            "offset %s beyond .debug_info size" % offset)
+
+        return self._cached_CU_at_offset(offset)
 
     def iter_CUs(self):
         """ Yield all the compile units (CompileUnit objects) in the debug info
@@ -129,10 +254,24 @@ class DWARFInfo(object):
         """
         return parse_cstring_from_stream(self.debug_str_sec.stream, offset)
 
+    def get_string_from_linetable(self, offset):
+        """ Obtain a string from the string table section, given an offset
+            relative to the section.
+        """
+        return parse_cstring_from_stream(self.debug_line_str_sec.stream, offset)
+
     def line_program_for_CU(self, CU):
         """ Given a CU object, fetch the line program it points to from the
             .debug_line section.
             If the CU doesn't point to a line program, return None.
+
+            Note about directory and file names. They are returned as two collections
+            in the lineprogram object's header - include_directory and file_entry.
+
+            In DWARFv5, they have introduced a different, extensible format for those
+            collections. So in a lineprogram v5+, there are two more collections in
+            the header - directories and file_names. Those might contain extra DWARFv5
+            information that is not exposed in include_directory and file_entry.
         """
         # The line program is pointed to by the DW_AT_stmt_list attribute of
         # the top DIE of a CU.
@@ -154,6 +293,7 @@ class DWARFInfo(object):
         cfi = CallFrameInfo(
             stream=self.debug_frame_sec.stream,
             size=self.debug_frame_sec.size,
+            address=self.debug_frame_sec.address,
             base_structs=self.structs)
         return cfi.get_entries()
 
@@ -168,49 +308,109 @@ class DWARFInfo(object):
         cfi = CallFrameInfo(
             stream=self.eh_frame_sec.stream,
             size=self.eh_frame_sec.size,
-            base_structs=self.structs)
+            address=self.eh_frame_sec.address,
+            base_structs=self.structs,
+            for_eh_frame=True)
         return cfi.get_entries()
 
+    def get_pubtypes(self):
+        """
+        Returns a NameLUT object that contains information read from the
+        .debug_pubtypes section in the ELF file.
+
+        NameLUT is essentially a dictionary containing the CU/DIE offsets of
+        each symbol. See the NameLUT doc string for more details.
+        """
+
+        if self.debug_pubtypes_sec:
+            return NameLUT(self.debug_pubtypes_sec.stream,
+                    self.debug_pubtypes_sec.size,
+                    self.structs)
+        else:
+            return None
+
+    def get_pubnames(self):
+        """
+        Returns a NameLUT object that contains information read from the
+        .debug_pubnames section in the ELF file.
+
+        NameLUT is essentially a dictionary containing the CU/DIE offsets of
+        each symbol. See the NameLUT doc string for more details.
+        """
+
+        if self.debug_pubnames_sec:
+            return NameLUT(self.debug_pubnames_sec.stream,
+                    self.debug_pubnames_sec.size,
+                    self.structs)
+        else:
+            return None
+
     def get_aranges(self):
         """ Get an ARanges object representing the .debug_aranges section of
             the DWARF data, or None if the section doesn't exist
         """
         if self.debug_aranges_sec:
-            return ARanges(self.debug_aranges_sec.stream, 
-                self.debug_aranges_sec.size, 
+            return ARanges(self.debug_aranges_sec.stream,
+                self.debug_aranges_sec.size,
                 self.structs)
         else:
             return None
 
     def location_lists(self):
-        """ Get a LocationLists object representing the .debug_loc section of
+        """ Get a LocationLists object representing the .debug_loc/debug_loclists section of
             the DWARF data, or None if this section doesn't exist.
+
+            If both sections exist, it returns a LocationListsPair.
         """
-        if self.debug_loc_sec:
-            return LocationLists(self.debug_loc_sec.stream, self.structs)
+        if self.debug_loclists_sec and self.debug_loc_sec is None:
+            return LocationLists(self.debug_loclists_sec.stream, self.structs, 5, self)
+        elif self.debug_loc_sec and self.debug_loclists_sec is None:
+            return LocationLists(self.debug_loc_sec.stream, self.structs, 4, self)
+        elif self.debug_loc_sec and self.debug_loclists_sec:
+            return LocationListsPair(self.debug_loc_sec.stream, self.debug_loclists_sec.stream, self.structs, self)
         else:
             return None
 
     def range_lists(self):
-        """ Get a RangeLists object representing the .debug_ranges section of
+        """ Get a RangeLists object representing the .debug_ranges/.debug_rnglists section of
             the DWARF data, or None if this section doesn't exist.
+
+            If both sections exist, it returns a RangeListsPair.
         """
-        if self.debug_ranges_sec:
-            return RangeLists(self.debug_ranges_sec.stream, self.structs)
+        if self.debug_rnglists_sec and self.debug_ranges_sec is None:
+            return RangeLists(self.debug_rnglists_sec.stream, self.structs, 5, self)
+        elif self.debug_ranges_sec and self.debug_rnglists_sec is None:
+            return RangeLists(self.debug_ranges_sec.stream, self.structs, 4, self)
+        elif self.debug_ranges_sec and self.debug_rnglists_sec:
+            return RangeListsPair(self.debug_ranges_sec.stream, self.debug_rnglists_sec.stream, self.structs, self)
         else:
             return None
 
+    def get_addr(self, cu, addr_index):
+        """Provided a CU and an index, retrieves an address from the debug_addr section
+        """
+        if not self.debug_addr_sec:
+            raise DWARFError('The file does not contain a debug_addr section for indirect address access')
+        # Selectors are not supported, but no assert on that. TODO?
+        cu_addr_base = _get_base_offset(cu, 'DW_AT_addr_base')
+        return struct_parse(cu.structs.Dwarf_target_addr(''), self.debug_addr_sec.stream, cu_addr_base + addr_index*cu.header.address_size)            
+
     #------ PRIVATE ------#
 
-    def _parse_CUs_iter(self):
-        """ Parse CU entries from debug_info. Yield CUs in order of appearance.
+    def _parse_CUs_iter(self, offset=0):
+        """ Iterate CU objects in order of appearance in the debug_info section.
+
+            offset:
+                The offset of the first CU to yield.  Additional iterations
+                will return the sequential unit objects.
+
+            See .iter_CUs(), .get_CU_containing(), and .get_CU_at().
         """
         if self.debug_info_sec is None:
             return
 
-        offset = 0
         while offset < self.debug_info_sec.size:
-            cu = self._parse_CU_at_offset(offset)
+            cu = self._cached_CU_at_offset(offset)
             # Compute the offset of the next CU in the section. The unit_length
             # field of the CU header contains its size not including the length
             # field itself.
@@ -219,6 +419,32 @@ class DWARFInfo(object):
                         cu.structs.initial_length_field_size())
             yield cu
 
+    def _cached_CU_at_offset(self, offset):
+        """ Return the CU with unit header at the given offset into the
+            debug_info section from the cache.  If not present, the unit is
+            header is parsed and the object is installed in the cache.
+
+            offset:
+                The offset of the unit header in the .debug_info section
+                to of the unit to fetch from the cache.
+
+            See get_CU_at().
+        """
+        # Find the insert point for the requested offset.  With bisect_right,
+        # if this entry is present in the cache it will be the prior entry.
+        i = bisect_right(self._cu_offsets_map, offset)
+        if i >= 1 and offset == self._cu_offsets_map[i - 1]:
+            return self._cu_cache[i - 1]
+
+        # Parse the CU and insert the offset and object into the cache.
+        # The ._cu_offsets_map[] contains just the numeric offsets for the
+        # bisect_right search while the parallel indexed ._cu_cache[] holds
+        # the object references.
+        cu = self._parse_CU_at_offset(offset)
+        self._cu_offsets_map.insert(i, offset)
+        self._cu_cache.insert(i, cu)
+        return cu
+
     def _parse_CU_at_offset(self, offset):
         """ Parse and return a CU at the given offset in the debug_info stream.
         """
@@ -234,24 +460,25 @@ class DWARFInfo(object):
             self.structs.Dwarf_uint32(''), self.debug_info_sec.stream, offset)
         dwarf_format = 64 if initial_length == 0xFFFFFFFF else 32
 
-        # At this point we still haven't read the whole header, so we don't
-        # know the address_size. Therefore, we're going to create structs
-        # with a default address_size=4. If, after parsing the header, we
-        # find out address_size is actually 8, we just create a new structs
-        # object for this CU.
+
+        # Temporary structs for parsing the header
+        # The structs for the rest of the CU depend on the header data.
         #
         cu_structs = DWARFStructs(
             little_endian=self.config.little_endian,
             dwarf_format=dwarf_format,
-            address_size=4)
+            address_size=4,
+            dwarf_version=2)
 
         cu_header = struct_parse(
             cu_structs.Dwarf_CU_header, self.debug_info_sec.stream, offset)
-        if cu_header['address_size'] == 8:
-            cu_structs = DWARFStructs(
-                little_endian=self.config.little_endian,
-                dwarf_format=dwarf_format,
-                address_size=8)
+
+        # structs for the rest of the CU, taking into account bitness and DWARF version
+        cu_structs = DWARFStructs(
+            little_endian=self.config.little_endian,
+            dwarf_format=dwarf_format,
+            address_size=cu_header['address_size'],
+            dwarf_version=cu_header['version'])
 
         cu_die_offset = self.debug_info_sec.stream.tell()
         dwarf_assert(
@@ -267,26 +494,84 @@ class DWARFInfo(object):
     def _is_supported_version(self, version):
         """ DWARF version supported by this parser
         """
-        return 2 <= version <= 4
+        return 2 <= version <= 5
 
-    def _parse_line_program_at_offset(self, debug_line_offset, structs):
+    def _parse_line_program_at_offset(self, offset, structs):
         """ Given an offset to the .debug_line section, parse the line program
             starting at this offset in the section and return it.
             structs is the DWARFStructs object used to do this parsing.
         """
+
+        if offset in self._linetable_cache:
+            return self._linetable_cache[offset]
+
         lineprog_header = struct_parse(
             structs.Dwarf_lineprog_header,
             self.debug_line_sec.stream,
-            debug_line_offset)
+            offset)
+
+        # DWARF5: resolve names
+        def resolve_strings(self, lineprog_header, format_field, data_field):
+            if lineprog_header.get(format_field, False):
+                data = lineprog_header[data_field]
+                for field in lineprog_header[format_field]:
+                    def replace_value(data, content_type, replacer):
+                        for entry in data:
+                            entry[content_type] = replacer(entry[content_type])
+
+                    if field.form == 'DW_FORM_line_strp':
+                        replace_value(data, field.content_type, self.get_string_from_linetable)
+                    elif field.form == 'DW_FORM_strp':
+                        replace_value(data, field.content_type, self.get_string_from_table)
+                    elif field.form in ('DW_FORM_strp_sup', 'DW_FORM_GNU_strp_alt'):
+                        if self.supplementary_dwarfinfo:
+                            replace_value(data, field.content_type, self.supplementary_dwarfinfo.get_string_fromtable)
+                        else:
+                            replace_value(data, field.content_type, lambda x: str(x))
+                    elif field.form in ('DW_FORM_strp_sup', 'DW_FORM_strx', 'DW_FORM_strx1', 'DW_FORM_strx2', 'DW_FORM_strx3', 'DW_FORM_strx4'):
+                        raise NotImplementedError()
+
+        resolve_strings(self, lineprog_header, 'directory_entry_format', 'directories')
+        resolve_strings(self, lineprog_header, 'file_name_entry_format', 'file_names')
+
+        # DWARF5: provide compatible file/directory name arrays for legacy lineprogram consumers
+        if lineprog_header.get('directories', False):
+            lineprog_header.include_directory = tuple(d.DW_LNCT_path for d in lineprog_header.directories)
+        if lineprog_header.get('file_names', False):
+            lineprog_header.file_entry = tuple(
+                Container(**{
+                    'name':e.get('DW_LNCT_path'),
+                    'dir_index': e.get('DW_LNCT_directory_index'),
+                    'mtime': e.get('DW_LNCT_timestamp'),
+                    'length': e.get('DW_LNCT_size')})
+                for e in lineprog_header.file_names)
 
         # Calculate the offset to the next line program (see DWARF 6.2.4)
-        end_offset = (  debug_line_offset + lineprog_header['unit_length'] +
+        end_offset = (  offset + lineprog_header['unit_length'] +
                         structs.initial_length_field_size())
 
-        return LineProgram(
+        lineprogram = LineProgram(
             header=lineprog_header,
             stream=self.debug_line_sec.stream,
             structs=structs,
             program_start_offset=self.debug_line_sec.stream.tell(),
             program_end_offset=end_offset)
 
+        self._linetable_cache[offset] = lineprogram
+        return lineprogram
+
+    def parse_debugsupinfo(self):
+        """
+        Extract a filename from either .debug_sup or .gnu_debualtlink sections.
+        """
+        if self.debug_sup_sec is not None:
+            self.debug_sup_sec.stream.seek(0)
+            suplink = self.structs.Dwarf_debugsup.parse_stream(self.debug_sup_sec.stream)
+            if suplink.is_supplementary == 0:
+                return suplink.sup_filename
+        if self.gnu_debugaltlink_sec is not None:
+            self.gnu_debugaltlink_sec.stream.seek(0)
+            suplink = self.structs.Dwarf_debugaltlink.parse_stream(self.gnu_debugaltlink_sec.stream)
+            return suplink.sup_filename
+        return None
+