{GNU,}HashSection: Implement symbol lookup (#290)
authorAndreas Ziegler <andreas.ziegler@fau.de>
Mon, 9 Mar 2020 12:39:23 +0000 (13:39 +0100)
committerGitHub <noreply@github.com>
Mon, 9 Mar 2020 12:39:23 +0000 (05:39 -0700)
In super-stripped binaries, symbol tables can not be accessed
directly as we do not have section headers to find them. In
this case, we can already use the mandatory DynamicSegment
which provides methods for individual access and iteration
over symbols via a minimal implementation of symbol hash
sections which only provided the number of symbols so far.

As we can also directly look up symbols via the hash table,
let's implement this functionality as well.

The code is based on @rhelmot's implementation as discussed
in #219, with some changes around reading the hash parameters.

For supporting individual symbol lookup, we also need the
corresponding symbol table to get the Symbol objects if the
matching hash was found in the hash section. In regular ELF
files, the symbol table is denoted by the section index
provided in the sh_link field of the hash section and
automatically created when building the hash section, for
super-stripped binaries we can use the DynamicSegment (which
needs to be present in any case) as the symbol table as it
also provides a get_symbol() method relying on other ways to
determine the list of symbols. Both of these variants can be
seen in the improved test_hash.py file.

The hash tables are implemented in a base class which does not
derive from the Section class in order to allow instantiation
even if no section headers are present in the underlying file.

elftools/elf/dynamic.py
elftools/elf/elffile.py
elftools/elf/hash.py
test/test_hash.py

index a445ae98026cd2a1b11ac2eb7fd2ed4bdf9186da..c0458d845db8b00c5f42c4ebdbd7aa65ca1cc3ee 100644 (file)
@@ -9,7 +9,7 @@
 import itertools
 
 from collections import defaultdict
-from .hash import HashSection, GNUHashSection
+from .hash import ELFHashTable, GNUHashTable
 from .sections import Section, Symbol
 from .enums import ENUM_D_TAG
 from .segments import Segment
@@ -298,11 +298,10 @@ class DynamicSegment(Segment, Dynamic):
         end_ptr = None
 
         # Check if a DT_GNU_HASH tag exists and recover the number of symbols
-        # from the corresponding section
+        # from the corresponding hash table
         _, gnu_hash_offset = self.get_table_offset('DT_GNU_HASH')
         if gnu_hash_offset is not None:
-            hash_section = GNUHashSection(self.stream, gnu_hash_offset,
-                                          self.elffile)
+            hash_section = GNUHashTable(self.elffile, gnu_hash_offset, self)
             end_ptr = tab_ptr + \
                 hash_section.get_number_of_symbols() * symbol_size
 
@@ -310,8 +309,8 @@ class DynamicSegment(Segment, Dynamic):
         if end_ptr is None:
             _, hash_offset = self.get_table_offset('DT_HASH')
             if hash_offset is not None:
-                hash_section = HashSection(self.stream, hash_offset,
-                                           self.elffile)
+                # Get the hash table from the DT_HASH offset
+                hash_section = ELFHashTable(self.elffile, hash_offset, self)
                 end_ptr = tab_ptr + \
                     hash_section.get_number_of_symbols() * symbol_size
 
index c5b66c7f7acaa2584a2ae34d6ab3cc014198fd35..f95fd54fb82d2538fb121eea17eb83c5a798ef1a 100644 (file)
@@ -33,7 +33,7 @@ from .gnuversions import (
         GNUVerSymSection)
 from .segments import Segment, InterpSegment, NoteSegment
 from ..dwarf.dwarfinfo import DWARFInfo, DebugSectionDescriptor, DwarfConfig
-
+from .hash import ELFHashSection, GNUHashSection
 
 class ELFFile(object):
     """ Creation: the constructor accepts a stream (file-like object) with the
@@ -504,6 +504,10 @@ class ELFFile(object):
             return StabSection(section_header, name, self)
         elif sectype == 'SHT_ARM_ATTRIBUTES':
             return ARMAttributesSection(section_header, name, self)
+        elif sectype == 'SHT_HASH':
+            return self._make_elf_hash_section(section_header, name)
+        elif sectype == 'SHT_GNU_HASH':
+            return self._make_gnu_hash_section(section_header, name)
         else:
             return Section(section_header, name, self)
 
@@ -557,6 +561,20 @@ class ELFFile(object):
             elffile=self,
             symboltable=strtab_section)
 
+    def _make_elf_hash_section(self, section_header, name):
+        linked_symtab_index = section_header['sh_link']
+        symtab_section = self.get_section(linked_symtab_index)
+        return ELFHashSection(
+            section_header, name, self, symtab_section
+        )
+
+    def _make_gnu_hash_section(self, section_header, name):
+        linked_symtab_index = section_header['sh_link']
+        symtab_section = self.get_section(linked_symtab_index)
+        return GNUHashSection(
+            section_header, name, self, symtab_section
+        )
+
     def _get_segment_header(self, n):
         """ Find the header of segment #n, parse it and return the struct
         """
index 3c39f8b3bb746211a6ed929a80bd491109f16ec9..2e8a6fed41d0afde64692db9c54f001de44b9c1a 100644 (file)
 # This code is in the public domain
 #-------------------------------------------------------------------------------
 
+import struct
+
 from ..common.utils import struct_parse
+from .sections import Section
 
 
-class HashSection(object):
-    """ Minimal part of an ELF hash section to find the number of symbols in the
+class ELFHashTable(object):
+    """ Representation of an ELF hash table to find symbols in the
         symbol table - useful for super-stripped binaries without section
         headers where only the start of the symbol table is known from the
         dynamic segment. The layout and contents are nicely described at
         https://flapenguin.me/2017/04/24/elf-lookup-dt-hash/.
+
+        The symboltable argument needs to implement a get_symbol() method -
+        in a regular ELF file, this will be the linked symbol table section
+        as indicated by the sh_link attribute. For super-stripped binaries,
+        one should use the DynamicSegment object as the symboltable as it
+        supports symbol lookup without access to a symbol table section.
     """
-    def __init__(self, stream, offset, elffile):
-        self._stream = stream
-        self._offset = offset
-        self._elffile = elffile
-        self.params = struct_parse(self._elffile.structs.Elf_Hash,
-                                   self._stream,
-                                   self._offset)
+
+    def __init__(self, elffile, start_offset, symboltable):
+        self.elffile = elffile
+        self._symboltable = symboltable
+        self.params = struct_parse(self.elffile.structs.Elf_Hash,
+                                   self.elffile.stream,
+                                   start_offset)
 
     def get_number_of_symbols(self):
         """ Get the number of symbols from the hash table parameters.
         """
         return self.params['nchains']
 
+    def get_symbol(self, name):
+        """ Look up a symbol from this hash table with the given name.
+        """
+        if self.params['nbuckets'] == 0:
+            return None
+        hval = self.elf_hash(name) % self.params['nbuckets']
+        symndx = self.params['buckets'][hval]
+        while symndx != 0:
+            sym = self._symboltable.get_symbol(symndx)
+            if sym.name == name:
+                return sym
+            symndx = self.params['chains'][symndx]
+        return None
+
+    @staticmethod
+    def elf_hash(name):
+        """ Compute the hash value for a given symbol name.
+        """
+        if not isinstance(name, bytes):
+            name = name.encode('utf-8')
+        h = 0
+        x = 0
+        for c in bytearray(name):
+            h = (h << 4) + c
+            x = h & 0xF0000000
+            if x != 0:
+                h ^= (x >> 24)
+            h &= ~x
+        return h
+
 
-class GNUHashSection(object):
-    """ Minimal part of a GNU hash section to find the number of symbols in the
+class ELFHashSection(Section, ELFHashTable):
+    """ Section representation of an ELF hash table. In regular ELF files, this
+        allows us to use the common functions defined on Section objects when
+        dealing with the hash table.
+    """
+    def __init__(self, header, name, elffile, symboltable):
+        Section.__init__(self, header, name, elffile)
+        ELFHashTable.__init__(self, elffile, self['sh_offset'], symboltable)
+
+
+class GNUHashTable(object):
+    """ Representation of a GNU hash table to find symbols in the
         symbol table - useful for super-stripped binaries without section
         headers where only the start of the symbol table is known from the
         dynamic segment. The layout and contents are nicely described at
         https://flapenguin.me/2017/05/10/elf-lookup-dt-gnu-hash/.
+
+        The symboltable argument needs to implement a get_symbol() method -
+        in a regular ELF file, this will be the linked symbol table section
+        as indicated by the sh_link attribute. For super-stripped binaries,
+        one should use the DynamicSegment object as the symboltable as it
+        supports symbol lookup without access to a symbol table section.
     """
-    def __init__(self, stream, offset, elffile):
-        self._stream = stream
-        self._offset = offset
-        self._elffile = elffile
-        self.params = struct_parse(self._elffile.structs.Gnu_Hash,
-                                   self._stream,
-                                   self._offset)
+    def __init__(self, elffile, start_offset, symboltable):
+        self.elffile = elffile
+        self._symboltable = symboltable
+        self.params = struct_parse(self.elffile.structs.Gnu_Hash,
+                                   self.elffile.stream,
+                                   start_offset)
+        # Element sizes in the hash table
+        self._wordsize = self.elffile.structs.Elf_word('').sizeof()
+        self._xwordsize = self.elffile.structs.Elf_xword('').sizeof()
+        self._chain_pos = start_offset + 4 * self._wordsize + \
+            self.params['bloom_size'] * self._xwordsize + \
+            self.params['nbuckets'] * self._wordsize
 
     def get_number_of_symbols(self):
         """ Get the number of symbols in the hash table by finding the bucket
             with the highest symbol index and walking to the end of its chain.
         """
-        # Element sizes in the hash table
-        wordsize = self._elffile.structs.Elf_word('').sizeof()
-        xwordsize = self._elffile.structs.Elf_xword('').sizeof()
-
         # Find highest index in buckets array
         max_idx = max(self.params['buckets'])
         if max_idx < self.params['symoffset']:
             return self.params['symoffset']
 
         # Position the stream at the start of the corresponding chain
-        chain_pos = self._offset + 4 * wordsize + \
-            self.params['bloom_size'] * xwordsize + \
-            self.params['nbuckets'] * wordsize + \
-            (max_idx - self.params['symoffset']) * wordsize
+        max_chain_pos = self._chain_pos + \
+            (max_idx - self.params['symoffset']) * self._wordsize
+        self.elffile.stream.seek(max_chain_pos)
 
         # Walk the chain to its end (lowest bit is set)
         while True:
-            cur_hash = struct_parse(self._elffile.structs.Elf_word('elem'),
-                                    self._stream,
-                                    chain_pos)
+            cur_hash = struct.unpack('I', self.elffile.stream.read(self._wordsize))[0]
             if cur_hash & 1:
                 return max_idx + 1
 
             max_idx += 1
-            chain_pos += wordsize
+
+    def _matches_bloom(self, H1):
+        """ Helper function to check if the given hash could be in the hash
+            table by testing it against the bloom filter.
+        """
+        arch_bits = self.elffile.elfclass
+        H2 = H1 >> self.params['bloom_shift']
+        word_idx = int(H1 / arch_bits) % self.params['bloom_size']
+        BITMASK = (1 << (H1 % arch_bits)) | (1 << (H2 % arch_bits))
+        return (self.params['bloom'][word_idx] & BITMASK) == BITMASK
+
+    def get_symbol(self, name):
+        """ Look up a symbol from this hash table with the given name.
+        """
+        namehash = self.gnu_hash(name)
+        if not self._matches_bloom(namehash):
+            return None
+
+        symidx = self.params['buckets'][namehash % self.params['nbuckets']]
+        if symidx < self.params['symoffset']:
+            return None
+
+        self.elffile.stream.seek(self._chain_pos + (symidx - self.params['symoffset']) * self._wordsize)
+        while True:
+            cur_hash = struct.unpack('I', self.elffile.stream.read(self._wordsize))[0]
+            if cur_hash | 1 == namehash | 1:
+                symbol = self._symboltable.get_symbol(symidx)
+                if name == symbol.name:
+                    return symbol
+
+            if cur_hash & 1:
+                break
+            symidx += 1
+        return None
+
+    @staticmethod
+    def gnu_hash(key):
+        """ Compute the GNU-style hash value for a given symbol name.
+        """
+        if not isinstance(key, bytes):
+            key = key.encode('utf-8')
+        h = 5381
+        for c in bytearray(key):
+            h = h * 33 + c
+        return h & 0xFFFFFFFF
+
+
+class GNUHashSection(Section, GNUHashTable):
+    """ Section representation of a GNU hash table. In regular ELF files, this
+        allows us to use the common functions defined on Section objects when
+        dealing with the hash table.
+    """
+    def __init__(self, header, name, elffile, symboltable):
+        Section.__init__(self, header, name, elffile)
+        GNUHashTable.__init__(self, elffile, self['sh_offset'], symboltable)
index 9fab30e058b9ecc415cb5015a5480fe9c287159f..7832a7698fffce17c8c40420a7233cfef4c4a67c 100644 (file)
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #-------------------------------------------------------------------------------
 # elftools tests
 #
@@ -8,28 +9,73 @@ import unittest
 import os
 
 from elftools.elf.elffile import ELFFile
-from elftools.common.exceptions import ELFError
-from elftools.elf.hash import HashSection, GNUHashSection
+from elftools.elf.hash import ELFHashTable, GNUHashTable
 
 class TestELFHash(unittest.TestCase):
+    """ Tests for the ELF hash table.
+    """
+
+    def test_elf_hash(self):
+        """ Verify correctness of ELF hashing function. The expected values
+            were computed with the C implementation from the glibc source code.
+        """
+        self.assertEqual(ELFHashTable.elf_hash(''), 0x00000000)
+        self.assertEqual(ELFHashTable.elf_hash('main'), 0x000737fe)
+        self.assertEqual(ELFHashTable.elf_hash('printf'), 0x077905a6)
+        self.assertEqual(ELFHashTable.elf_hash('exit'), 0x0006cf04)
+        self.assertEqual(ELFHashTable.elf_hash(u'ïó®123'), 0x0efddae3)
+        self.assertEqual(ELFHashTable.elf_hash(b'\xe4\xbd\xa0\xe5\xa5\xbd'),
+                         0x0f07f00d)
+
     def test_get_number_of_syms(self):
         """ Verify we can get get the number of symbols from an ELF hash
             section.
         """
-
         with open(os.path.join('test', 'testfiles_for_unittests',
                                'aarch64_super_stripped.elf'), 'rb') as f:
             elf = ELFFile(f)
+            dynamic_segment = None
             for segment in elf.iter_segments():
-                if segment.header.p_type != 'PT_DYNAMIC':
-                    continue
+                if segment.header.p_type == 'PT_DYNAMIC':
+                    dynamic_segment = segment
+                    break
+
+            _, hash_offset = dynamic_segment.get_table_offset('DT_HASH')
 
-                _, hash_offset = segment.get_table_offset('DT_HASH')
-            hash_section = HashSection(elf.stream, hash_offset, elf)
+            hash_section = ELFHashTable(elf, hash_offset, dynamic_segment)
+            self.assertIsNotNone(hash_section)
             self.assertEqual(hash_section.get_number_of_symbols(), 4)
 
+    def test_get_symbol(self):
+        """ Verify we can get a specific symbol from an ELF hash section.
+        """
+        path = os.path.join('test', 'testfiles_for_unittests',
+                            'simple_mipsel.elf')
+        with open(path, 'rb') as f:
+            elf = ELFFile(f)
+            hash_section = elf.get_section_by_name('.hash')
+            self.assertIsNotNone(hash_section)
+            symbol_main = hash_section.get_symbol('main')
+            self.assertIsNotNone(symbol_main)
+            self.assertEqual(symbol_main['st_value'], int(0x400790))
+
 
 class TestGNUHash(unittest.TestCase):
+    """ Tests for the GNU hash table.
+    """
+
+    def test_gnu_hash(self):
+        """ Verify correctness of GNU hashing function. The expected values
+            were computed with the C implementation from the glibc source code.
+        """
+        self.assertEqual(GNUHashTable.gnu_hash(''), 0x00001505)
+        self.assertEqual(GNUHashTable.gnu_hash('main'), 0x7c9a7f6a)
+        self.assertEqual(GNUHashTable.gnu_hash('printf'), 0x156b2bb8)
+        self.assertEqual(GNUHashTable.gnu_hash('exit'), 0x7c967e3f)
+        self.assertEqual(GNUHashTable.gnu_hash(u'ïó®123'), 0x8025a693)
+        self.assertEqual(GNUHashTable.gnu_hash(b'\xe4\xbd\xa0\xe5\xa5\xbd'),
+                         0x296eec2d)
+
     def test_get_number_of_syms(self):
         """ Verify we can get get the number of symbols from a GNU hash
             section.
@@ -38,10 +84,18 @@ class TestGNUHash(unittest.TestCase):
         with open(os.path.join('test', 'testfiles_for_unittests',
                                'lib_versioned64.so.1.elf'), 'rb') as f:
             elf = ELFFile(f)
-            for segment in elf.iter_segments():
-                if segment.header.p_type != 'PT_DYNAMIC':
-                    continue
-
-                _, hash_offset = segment.get_table_offset('DT_GNU_HASH')
-            hash_section = GNUHashSection(elf.stream, hash_offset, elf)
+            hash_section = elf.get_section_by_name('.gnu.hash')
+            self.assertIsNotNone(hash_section)
             self.assertEqual(hash_section.get_number_of_symbols(), 24)
+
+    def test_get_symbol(self):
+        """ Verify we can get a specific symbol from a GNU hash section.
+        """
+        with open(os.path.join('test', 'testfiles_for_unittests',
+                               'lib_versioned64.so.1.elf'), 'rb') as f:
+            elf = ELFFile(f)
+            hash_section = elf.get_section_by_name('.gnu.hash')
+            self.assertIsNotNone(hash_section)
+            symbol_f1 = hash_section.get_symbol('function1_ver1_1')
+            self.assertIsNotNone(symbol_f1)
+            self.assertEqual(symbol_f1['st_value'], int(0x9a2))