ref_addr size changed between v2 and v3 - take 2 (#273)
author Seva Alekseyev <sevaa@yarxi.ru>
Sat, 7 Mar 2020 13:34:29 +0000 (08:34 -0500)
committer GitHub <noreply@github.com>
Sat, 7 Mar 2020 13:34:29 +0000 (05:34 -0800)
In DWARF 2, the size of a DW_FORM_ref_addr value matches the target address size, while in DWARF 3+ it matches the DWARF format (32-bit or 64-bit) of the CU. Here are the relevant fragments from the spec, part 7:

v2:

    The second type of reference is the address of any debugging information entry within the same executable or shared object; it may refer to an entry in a different compilation unit from the unit containing the reference. This type of reference (DW_FORM_ref_addr) is the size of an address on the target architecture; it is relocatable in a relocatable object file and relocated in an executable file or shared object.

v3:

    The second type of reference can identify any debugging information entry within a program; in particular, it may refer to an entry in a different compilation unit from the unit containing the reference, and may refer to an entry in a different shared object. This type of reference (DW_FORM_ref_addr) is an offset from the beginning of the .debug_info section of the target executable or shared object; it is relocatable in a relocatable object file and frequently relocated in an executable file or shared object. For references from one shared object or static executable file to another, the relocation and identification of the target object must be performed by the consumer. In the 32-bit DWARF format, this offset is a 4-byte unsigned value; in the 64-bit DWARF format, it is an 8-byte unsigned value (see Section 7.4).

Without this fix, when elftools encounters 32-bit DWARF v2 data targeting a 64-bit architecture, it reads DW_FORM_ref_addr as 4 bytes instead of 8, misparses the rest of the DIE, and crashes downstream.

I encountered this in an iOS binary from 2017, built for ARM64 with a version of Xcode that is now several releases old. This probably never came up before because, by the time 64-bit code became relevant, most toolchains were generating DWARF 3 or newer.

Co-authored-by: Seva Alekseyev <sevaa@nih.gov>
elftools/dwarf/dwarfinfo.py
elftools/dwarf/structs.py
test/test_refaddr_bitness.py [new file with mode: 0644]
test/testfiles_for_unittests/arm64_on_dwarfv2.abbrev.dat [new file with mode: 0644]
test/testfiles_for_unittests/arm64_on_dwarfv2.info.dat [new file with mode: 0644]
test/testfiles_for_unittests/arm64_on_dwarfv2.str.dat [new file with mode: 0644]

index 45903ec868f90d2dd23c1f0cb633a21181794520..ce1bce88f63a5dce5ac5c5c247c168b3cb800487 100644 (file)
@@ -285,24 +285,25 @@ class DWARFInfo(object):
             self.structs.Dwarf_uint32(''), self.debug_info_sec.stream, offset)
         dwarf_format = 64 if initial_length == 0xFFFFFFFF else 32
 
-        # At this point we still haven't read the whole header, so we don't
-        # know the address_size. Therefore, we're going to create structs
-        # with a default address_size=4. If, after parsing the header, we
-        # find out address_size is actually 8, we just create a new structs
-        # object for this CU.
+        
+        # Temporary structs for parsing the header
+        # The structs for the rest of the CU depend on the header data. 
         #
         cu_structs = DWARFStructs(
             little_endian=self.config.little_endian,
             dwarf_format=dwarf_format,
-            address_size=4)
+            address_size=4,
+            dwarf_version=2)
 
         cu_header = struct_parse(
             cu_structs.Dwarf_CU_header, self.debug_info_sec.stream, offset)
-        if cu_header['address_size'] == 8:
-            cu_structs = DWARFStructs(
-                little_endian=self.config.little_endian,
-                dwarf_format=dwarf_format,
-                address_size=8)
+
+        # structs for the rest of the CU, taking into account bitness and DWARF version
+        cu_structs = DWARFStructs(
+            little_endian=self.config.little_endian,
+            dwarf_format=dwarf_format,
+            address_size=cu_header['address_size'],
+            dwarf_version=cu_header['version'])
 
         cu_die_offset = self.debug_info_sec.stream.tell()
         dwarf_assert(
index 6dde82b865afc1e57ff5d590af7fa844f7178994..a2d6c09a1558ab3af8adf3bc37d69439dbf0ba35 100644 (file)
@@ -200,7 +200,7 @@ class DWARFStructs(object):
             DW_FORM_ref4=self.Dwarf_uint32(''),
             DW_FORM_ref8=self.Dwarf_uint64(''),
             DW_FORM_ref_udata=self.Dwarf_uleb128(''),
-            DW_FORM_ref_addr=self.Dwarf_offset(''),
+            DW_FORM_ref_addr=self.Dwarf_target_addr('') if self.dwarf_version == 2 else self.Dwarf_offset(''),
 
             DW_FORM_indirect=self.Dwarf_uleb128(''),
 
diff --git a/test/test_refaddr_bitness.py b/test/test_refaddr_bitness.py
new file mode 100644 (file)
index 0000000..67f3e6a
--- /dev/null
@@ -0,0 +1,62 @@
+#------------------------------------------------------------------------------
+# elftools tests
+#
+# Seva Alekseyev (sevaa@sprynet.com)
+# This code is in the public domain
+#
+# The error that motivated this fix was in an iOS binary in Mach-O format.
+# It had v2 DWARF data, but it was targeting a 64 bit architecture.
+# Before the fix, pyelftools would assume that DW_FORM_ref_addr attribute took 4 bytes
+# and misparse the DWARF data in the binary.
+#
+# Since pyelftools doesn't work with Mach-O files, I've taken a sample binary apart,
+# and saved the three relevant sections - info, abbrev, and str as flat files.
+# The metadata (the fact that it's targeting ARM64) is hard-coded, since the Mach-O header
+# isn't preserved.
+#------------------------------------------------------------------------------
+
+import unittest
+import os, sys, io
+
+from elftools.dwarf.dwarfinfo import DWARFInfo, DebugSectionDescriptor, DwarfConfig
+
+class TestRefAddrOnDWARFv2With64BitTarget(unittest.TestCase):
+    def test_main(self):
+        # Read the three saved sections as bytestreams
+        with open(os.path.join('test', 'testfiles_for_unittests', 'arm64_on_dwarfv2.info.dat'), 'rb') as f:
+            info = f.read()
+        with open(os.path.join('test', 'testfiles_for_unittests', 'arm64_on_dwarfv2.abbrev.dat'), 'rb') as f:
+            abbrev = f.read()
+        with open(os.path.join('test', 'testfiles_for_unittests', 'arm64_on_dwarfv2.str.dat'), 'rb') as f:
+            str = f.read()            
+
+        # Parse the DWARF info
+        di = DWARFInfo(
+            config = DwarfConfig(little_endian = True, default_address_size = 8, machine_arch = "ARM64"),
+            debug_info_sec = DebugSectionDescriptor(io.BytesIO(info), '__debug_info', None, len(info), 0),
+            debug_aranges_sec = None,
+            debug_abbrev_sec = DebugSectionDescriptor(io.BytesIO(abbrev), '__debug_abbrev', None, len(abbrev), 0),
+            debug_frame_sec = None,
+            eh_frame_sec = None,
+            debug_str_sec = DebugSectionDescriptor(io.BytesIO(str), '__debug_str', None, len(str), 0),
+            debug_loc_sec = None,
+            debug_ranges_sec = None,
+            debug_line_sec = None,
+            debug_pubtypes_sec = None,
+            debug_pubnames_sec = None
+        )
+
+        CUs = [cu for cu in di.iter_CUs()]
+        # Locate a CU that I know has a reference in DW_FORM_ref_addr form
+        CU = CUs[21]
+        self.assertEqual(CU['version'], 2)
+        # Make sure pyelftools appreciates the difference between the target address size and DWARF inter-DIE offset size
+        self.assertEqual(CU.structs.dwarf_format, 32)
+        self.assertEqual(CU['address_size'], 8)
+        DIEs = [die for die in CU.iter_DIEs()]
+        # Before the patch, DIE #2 is misparsed, the current offset is off, the rest are misparsed too
+        self.assertEqual(len(DIEs), 15)
+        # It was 9 before the patch, which was wrong.
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/test/testfiles_for_unittests/arm64_on_dwarfv2.abbrev.dat b/test/testfiles_for_unittests/arm64_on_dwarfv2.abbrev.dat
new file mode 100644 (file)
index 0000000..16b2768
Binary files /dev/null and b/test/testfiles_for_unittests/arm64_on_dwarfv2.abbrev.dat differ
diff --git a/test/testfiles_for_unittests/arm64_on_dwarfv2.info.dat b/test/testfiles_for_unittests/arm64_on_dwarfv2.info.dat
new file mode 100644 (file)
index 0000000..b87d3b9
Binary files /dev/null and b/test/testfiles_for_unittests/arm64_on_dwarfv2.info.dat differ
diff --git a/test/testfiles_for_unittests/arm64_on_dwarfv2.str.dat b/test/testfiles_for_unittests/arm64_on_dwarfv2.str.dat
new file mode 100644 (file)
index 0000000..3bea27a
Binary files /dev/null and b/test/testfiles_for_unittests/arm64_on_dwarfv2.str.dat differ