Improve symbol table handling in DynamicSegment (#219)

author Andreas Ziegler <ziegler@einserver.de>

Tue, 19 Mar 2019 01:48:19 +0000 (02:48 +0100)

committer Eli Bendersky <eliben@users.noreply.github.com>

Tue, 19 Mar 2019 01:48:19 +0000 (18:48 -0700)
author Andreas Ziegler <ziegler@einserver.de>
Tue, 19 Mar 2019 01:48:19 +0000 (02:48 +0100)
committer Eli Bendersky <eliben@users.noreply.github.com>
Tue, 19 Mar 2019 01:48:19 +0000 (18:48 -0700)
diff --git a/elftools/elf/dynamic.py b/elftools/elf/dynamic.py

index 202a1f495569322265502f7b71a67a54e4644334..fcdec4e6639e78731657bd13a3352886d1387d6c 100644 (file)
--- a/elftools/elf/dynamic.py
+++ b/elftools/elf/dynamic.py
@@ -8,6 +8,8 @@
  #-------------------------------------------------------------------------------
  import itertools
  
+from collections import defaultdict
+from .hash import HashSection, GNUHashSection
  from .sections import Section, Symbol
  from .enums import ENUM_D_TAG
  from .segments import Segment
@@ -226,6 +228,36 @@ class DynamicSegment(Segment, Dynamic):
                  break
          Segment.__init__(self, header, stream)
          Dynamic.__init__(self, stream, elffile, stringtable, self['p_offset'])
+        self._symbol_list = None
+        self._symbol_name_map = None
+
+    def num_symbols(self):
+        """ Number of symbols in the table recovered from DT_SYMTAB
+        """
+        if self._symbol_list is None:
+            self._symbol_list = list(self.iter_symbols())
+        return len(self._symbol_list)
+
+    def get_symbol(self, index):
+        """ Get the symbol at index #index from the table (Symbol object)
+        """
+        if self._symbol_list is None:
+            self._symbol_list = list(self.iter_symbols())
+        return self._symbol_list[index]
+
+    def get_symbol_by_name(self, name):
+        """ Get the symbol(s) with the given name as a list of Symbol objects.
+            Return None if no symbol by the given name exists.
+        """
+        # The first time this method is called, construct a mapping from
+        # symbol name to the list of indices of symbols with that name
+        #
+        if self._symbol_name_map is None:
+            self._symbol_name_map = defaultdict(list)
+            for i, sym in enumerate(self.iter_symbols()):
+                self._symbol_name_map[sym.name].append(i)
+        symnums = self._symbol_name_map.get(name)
+        return [self.get_symbol(i) for i in symnums] if symnums else None
  
      def iter_symbols(self):
          """ Yield all symbols in this dynamic segment. The symbols are usually
@@ -239,33 +271,56 @@ class DynamicSegment(Segment, Dynamic):
  
          symbol_size = self.elfstructs.Elf_Sym.sizeof()
  
-        # Find closest higher pointer than tab_ptr. We'll use that to mark the
-        # end of the symbol table.
-        nearest_ptr = None
-        for tag in self.iter_tags():
-            tag_ptr = tag['d_ptr']
-            if tag['d_tag'] == 'DT_SYMENT':
-                if symbol_size != tag['d_val']:
-                    # DT_SYMENT is the size of one symbol entry. It must be the
-                    # same as returned by Elf_Sym.sizeof.
-                    raise ELFError('DT_SYMENT (%d) != Elf_Sym (%d).' %
-                                   (tag['d_val'], symbol_size))
-            if (tag_ptr > tab_ptr and
-                    (nearest_ptr is None or nearest_ptr > tag_ptr)):
-                nearest_ptr = tag_ptr
-
-        if nearest_ptr is None:
-            # Use the end of segment that contains DT_SYMTAB.
-            for segment in self.elffile.iter_segments():
-                if (segment['p_vaddr'] <= tab_ptr and
-                        tab_ptr <= (segment['p_vaddr'] + segment['p_filesz'])):
-                    nearest_ptr = segment['p_vaddr'] + segment['p_filesz']
-
-        if nearest_ptr is None:
+        end_ptr = None
+
+        # Check if a DT_GNU_HASH tag exists and recover the number of symbols
+        # from the corresponding section
+        _, gnu_hash_offset = self.get_table_offset('DT_GNU_HASH')
+        if gnu_hash_offset is not None:
+            hash_section = GNUHashSection(self.stream, gnu_hash_offset,
+                                          self.elffile)
+            end_ptr = tab_ptr + \
+                hash_section.get_number_of_symbols() * symbol_size
+
+        # If DT_GNU_HASH did not exist, maybe we can use DT_HASH
+        if end_ptr is None:
+            _, hash_offset = self.get_table_offset('DT_HASH')
+            if hash_offset is not None:
+                hash_section = HashSection(self.stream, hash_offset,
+                                           self.elffile)
+                end_ptr = tab_ptr + \
+                    hash_section.get_number_of_symbols() * symbol_size
+
+        if end_ptr is None:
+            # Find closest higher pointer than tab_ptr. We'll use that to mark
+            # the end of the symbol table.
+            nearest_ptr = None
+            for tag in self.iter_tags():
+                tag_ptr = tag['d_ptr']
+                if tag['d_tag'] == 'DT_SYMENT':
+                    if symbol_size != tag['d_val']:
+                        # DT_SYMENT is the size of one symbol entry. It must be
+                        # the same as returned by Elf_Sym.sizeof.
+                        raise ELFError('DT_SYMENT (%d) != Elf_Sym (%d).' %
+                                    (tag['d_val'], symbol_size))
+                if (tag_ptr > tab_ptr and
+                        (nearest_ptr is None or nearest_ptr > tag_ptr)):
+                    nearest_ptr = tag_ptr
+
+            if nearest_ptr is None:
+                # Use the end of segment that contains DT_SYMTAB.
+                for segment in self.elffile.iter_segments():
+                    if (segment['p_vaddr'] <= tab_ptr and
+                            tab_ptr <= (segment['p_vaddr'] + segment['p_filesz'])):
+                        nearest_ptr = segment['p_vaddr'] + segment['p_filesz']
+
+            end_ptr = nearest_ptr
+
+        if end_ptr is None:
              raise ELFError('Cannot determine the end of DT_SYMTAB.')
  
          string_table = self._get_stringtable()
-        for i in range((nearest_ptr - tab_ptr) // symbol_size):
+        for i in range((end_ptr - tab_ptr) // symbol_size):
              symbol = struct_parse(self.elfstructs.Elf_Sym, self._stream,
                                    i * symbol_size + tab_offset)
              symbol_name = string_table.get_string(symbol['st_name'])
diff --git a/elftools/elf/hash.py b/elftools/elf/hash.py

new file mode 100644 (file)

index 0000000..3c39f8b
--- /dev/null
+++ b/elftools/elf/hash.py
@@ -0,0 +1,77 @@
+#-------------------------------------------------------------------------------
+# elftools: elf/hash.py
+#
+# ELF hash table sections
+#
+# Andreas Ziegler (andreas.ziegler@fau.de)
+# This code is in the public domain
+#-------------------------------------------------------------------------------
+
+from ..common.utils import struct_parse
+
+
+class HashSection(object):
+    """ Minimal part of an ELF hash section to find the number of symbols in the
+        symbol table - useful for super-stripped binaries without section
+        headers where only the start of the symbol table is known from the
+        dynamic segment. The layout and contents are nicely described at
+        https://flapenguin.me/2017/04/24/elf-lookup-dt-hash/.
+    """
+    def __init__(self, stream, offset, elffile):
+        self._stream = stream
+        self._offset = offset
+        self._elffile = elffile
+        self.params = struct_parse(self._elffile.structs.Elf_Hash,
+                                   self._stream,
+                                   self._offset)
+
+    def get_number_of_symbols(self):
+        """ Get the number of symbols from the hash table parameters.
+        """
+        return self.params['nchains']
+
+
+class GNUHashSection(object):
+    """ Minimal part of a GNU hash section to find the number of symbols in the
+        symbol table - useful for super-stripped binaries without section
+        headers where only the start of the symbol table is known from the
+        dynamic segment. The layout and contents are nicely described at
+        https://flapenguin.me/2017/05/10/elf-lookup-dt-gnu-hash/.
+    """
+    def __init__(self, stream, offset, elffile):
+        self._stream = stream
+        self._offset = offset
+        self._elffile = elffile
+        self.params = struct_parse(self._elffile.structs.Gnu_Hash,
+                                   self._stream,
+                                   self._offset)
+
+    def get_number_of_symbols(self):
+        """ Get the number of symbols in the symbol table by finding the bucket
+            with the highest symbol index and walking to the end of its chain.
+        """
+        # Element sizes in the hash table
+        wordsize = self._elffile.structs.Elf_word('').sizeof()
+        xwordsize = self._elffile.structs.Elf_xword('').sizeof()
+
+        # Find highest index in buckets array
+        max_idx = max(self.params['buckets'])
+        if max_idx < self.params['symoffset']:
+            return self.params['symoffset']
+
+        # Compute the stream offset of the chain entry for that symbol index
+        chain_pos = self._offset + 4 * wordsize + \
+            self.params['bloom_size'] * xwordsize + \
+            self.params['nbuckets'] * wordsize + \
+            (max_idx - self.params['symoffset']) * wordsize
+
+        # Walk the chain to its end (lowest bit is set)
+        while True:
+            cur_hash = struct_parse(self._elffile.structs.Elf_word('elem'),
+                                    self._stream,
+                                    chain_pos)
+            if cur_hash & 1:
+                return max_idx + 1
+
+            max_idx += 1
+            chain_pos += wordsize
diff --git a/elftools/elf/structs.py b/elftools/elf/structs.py

index 660f6872f1a32b0550d8894357b09a34e43360c7..6b5610b7fc311b4d63bd5502c3d7d9417ad5a347 100644 (file)
--- a/elftools/elf/structs.py
+++ b/elftools/elf/structs.py
@@ -90,6 +90,8 @@ class ELFStructs(object):
          self._create_note(e_type)
          self._create_stabs()
          self._create_arm_attributes()
+        self._create_elf_hash()
+        self._create_gnu_hash()
  
      #-------------------------------- PRIVATE --------------------------------#
  
@@ -398,3 +400,29 @@ class ELFStructs(object):
                                          Enum(self.Elf_uleb128('tag'),
                                               **ENUM_ATTR_TAG_ARM)
          )
+
+    def _create_elf_hash(self):
+        # Structure of the old SYSV-style hash table header. It is documented
+        # in the Oracle "Linker and Libraries Guide", Part IV ELF Application
+        # Binary Interface, Chapter 14 Object File Format, Section Hash Table
+        # Section:
+        # https://docs.oracle.com/cd/E53394_01/html/E54813/chapter6-48031.html
+
+        self.Elf_Hash = Struct('Elf_Hash',
+                               self.Elf_word('nbuckets'),
+                               self.Elf_word('nchains'),
+                               Array(lambda ctx: ctx['nbuckets'], self.Elf_word('buckets')),
+                               Array(lambda ctx: ctx['nchains'], self.Elf_word('chains')))
+
+    def _create_gnu_hash(self):
+        # Structure of the GNU-style hash table header. Documentation for this
+        # table is mostly in the GLIBC source code, a good explanation of the
+        # format can be found in this blog post:
+        # https://flapenguin.me/2017/05/10/elf-lookup-dt-gnu-hash/
+        self.Gnu_Hash = Struct('Gnu_Hash',
+                               self.Elf_word('nbuckets'),
+                               self.Elf_word('symoffset'),
+                               self.Elf_word('bloom_size'),
+                               self.Elf_word('bloom_shift'),
+                               Array(lambda ctx: ctx['bloom_size'], self.Elf_xword('bloom')),
+                               Array(lambda ctx: ctx['nbuckets'], self.Elf_word('buckets')))
diff --git a/test/test_dynamic.py b/test/test_dynamic.py

index c55fc2e3b95f55c3f829fdaeb5e7ae1664cd9066..1f48362e61bb7f30027734227fd151133b45e2bb 100644 (file)
--- a/test/test_dynamic.py
+++ b/test/test_dynamic.py
@@ -54,8 +54,9 @@ class TestDynamic(unittest.TestCase):
          exp = ['libc.so.6']
          self.assertEqual(libs, exp)
  
-    def test_reading_symbols(self):
-        """Verify we can read symbol table without SymbolTableSection"""
+    def test_reading_symbols_elf_hash(self):
+        """ Verify we can read symbol table without SymbolTableSection but with
+            a SYSV-style symbol hash table"""
          with open(os.path.join('test', 'testfiles_for_unittests',
                                 'aarch64_super_stripped.elf'), 'rb') as f:
              elf = ELFFile(f)
@@ -63,10 +64,41 @@ class TestDynamic(unittest.TestCase):
                  if segment.header.p_type != 'PT_DYNAMIC':
                      continue
  
+                num_symbols = segment.num_symbols()
                  symbol_names = [x.name for x in segment.iter_symbols()]
+                symbol_at_index_3 = segment.get_symbol(3)
+                symbols_abort = segment.get_symbol_by_name('abort')
  
+        self.assertEqual(num_symbols, 4)
          exp = ['', '__libc_start_main', '__gmon_start__', 'abort']
          self.assertEqual(symbol_names, exp)
+        self.assertEqual(symbol_at_index_3.name, 'abort')
+        self.assertIsNotNone(symbols_abort)
+        self.assertEqual(symbols_abort[0], symbol_at_index_3)
+
+    def test_reading_symbols_gnu_hash(self):
+        """ Verify we can read symbol table without SymbolTableSection but with
+            a GNU symbol hash table"""
+        with open(os.path.join('test', 'testfiles_for_unittests',
+                               'android_dyntags.elf'), 'rb') as f:
+            elf = ELFFile(f)
+            for segment in elf.iter_segments():
+                if segment.header.p_type != 'PT_DYNAMIC':
+                    continue
+
+                num_symbols = segment.num_symbols()
+                symbol_names = [x.name for x in segment.iter_symbols()]
+                symbol_at_index_3 = segment.get_symbol(3)
+                symbols_atfork = segment.get_symbol_by_name('__register_atfork')
+
+        self.assertEqual(num_symbols, 212)
+        exp = ['', '__cxa_finalize' , '__cxa_atexit', '__register_atfork',
+               '__stack_chk_fail', '_ZNK7android7RefBase9decStrongEPKv',
+               '_ZN7android7RefBaseD2Ev', '_ZdlPv', 'pthread_mutex_lock']
+        self.assertEqual(symbol_names[:9], exp)
+        self.assertEqual(symbol_at_index_3.name, '__register_atfork')
+        self.assertIsNotNone(symbols_atfork)
+        self.assertEqual(symbols_atfork[0], symbol_at_index_3)
  
      def test_sunw_tags(self):
          def extract_sunw(filename):
diff --git a/test/test_hash.py b/test/test_hash.py

new file mode 100644 (file)

index 0000000..9fab30e
--- /dev/null
+++ b/test/test_hash.py
@@ -0,0 +1,47 @@
+#-------------------------------------------------------------------------------
+# elftools tests
+#
+# Andreas Ziegler (andreas.ziegler@fau.de)
+# This code is in the public domain
+#-------------------------------------------------------------------------------
+import unittest
+import os
+
+from elftools.elf.elffile import ELFFile
+from elftools.common.exceptions import ELFError
+from elftools.elf.hash import HashSection, GNUHashSection
+
+class TestELFHash(unittest.TestCase):
+    def test_get_number_of_syms(self):
+        """ Verify we can get the number of symbols from an ELF hash
+            section.
+        """
+
+        with open(os.path.join('test', 'testfiles_for_unittests',
+                               'aarch64_super_stripped.elf'), 'rb') as f:
+            elf = ELFFile(f)
+            for segment in elf.iter_segments():
+                if segment.header.p_type != 'PT_DYNAMIC':
+                    continue
+
+                _, hash_offset = segment.get_table_offset('DT_HASH')
+            hash_section = HashSection(elf.stream, hash_offset, elf)
+            self.assertEqual(hash_section.get_number_of_symbols(), 4)
+
+
+class TestGNUHash(unittest.TestCase):
+    def test_get_number_of_syms(self):
+        """ Verify we can get the number of symbols from a GNU hash
+            section.
+        """
+
+        with open(os.path.join('test', 'testfiles_for_unittests',
+                               'lib_versioned64.so.1.elf'), 'rb') as f:
+            elf = ELFFile(f)
+            for segment in elf.iter_segments():
+                if segment.header.p_type != 'PT_DYNAMIC':
+                    continue
+
+                _, hash_offset = segment.get_table_offset('DT_GNU_HASH')
+            hash_section = GNUHashSection(elf.stream, hash_offset, elf)
+            self.assertEqual(hash_section.get_number_of_symbols(), 24)
author	Andreas Ziegler <ziegler@einserver.de>
	Tue, 19 Mar 2019 01:48:19 +0000 (02:48 +0100)
committer	Eli Bendersky <eliben@users.noreply.github.com>
	Tue, 19 Mar 2019 01:48:19 +0000 (18:48 -0700)
elftools/elf/dynamic.py		patch \| blob \| history
elftools/elf/hash.py	[new file with mode: 0644]	patch \| blob
elftools/elf/structs.py		patch \| blob \| history
test/test_dynamic.py		patch \| blob \| history
test/test_hash.py	[new file with mode: 0644]	patch \| blob