From: Andreas Ziegler Date: Tue, 19 Mar 2019 01:48:19 +0000 (+0100) Subject: Improve symbol table handling in DynamicSegment (#219) X-Git-Tag: v0.26~20 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=27941c50fef8cff8ef991419511664154c8cdf52;p=pyelftools.git Improve symbol table handling in DynamicSegment (#219) dynamic: parse DT_{GNU_}HASH for number of symbols In ultra-stripped binaries we can find the symbol table by parsing the dynamic segment and using the pointer in the DT_SYMTAB tag as the base address. However, we don't know anything about the number of symbols in the symbol table. Earlier, this code relied on finding the closest pointer value bigger than the base address of the symbol table. In PIE executables and shared libraries however this method could break as the pointer value for DT_SYMTAB is in the same range as things like DT_RELASZ or DT_STRSZ, leading to a too small number of symbols returned by iter_symbols(). The crashpad project has implemented a different strategy to find the number of symbols: parsing the symbol lookup hash tables (see [0]) as every symbol must have a corresponding entry in the hash table. This commit implements this behaviour for DynamicSegment, leaving the old code as a backup if neither DT_HASH or DT_GNU_HASH tags have been found. For DT_HASH type tables, it is quite easy as the header already contains the number of entries. For DT_GNU_HASH things are a bit more complicated as we need to work forward from the highest symbol referenced in the header (a good explanation of the format can be found at [1]). [0]: https://github.com/chromium/crashpad/commit/1f1657d573c789aa36b6022440e34d9ec30d894c [1]: https://flapenguin.me/2017/05/10/elf-lookup-dt-gnu-hash/ * dynamic: provide more functions for symbol access So far, the DynamicSegment only provided a method to iterate over all symbols but for some use cases it might be useful to use the recovered symbol table more like a normal SymbolTableSection. To this end, provide get_symbol(index) to fetch a symbol by its index, num_symbols() to get the total number of symbols and get_symbol_by_name(name) to look for a list of symbols with a given name. --- diff --git a/elftools/elf/dynamic.py b/elftools/elf/dynamic.py index 202a1f4..fcdec4e 100644 --- a/elftools/elf/dynamic.py +++ b/elftools/elf/dynamic.py @@ -8,6 +8,8 @@ #------------------------------------------------------------------------------- import itertools +from collections import defaultdict +from .hash import HashSection, GNUHashSection from .sections import Section, Symbol from .enums import ENUM_D_TAG from .segments import Segment @@ -226,6 +228,36 @@ class DynamicSegment(Segment, Dynamic): break Segment.__init__(self, header, stream) Dynamic.__init__(self, stream, elffile, stringtable, self['p_offset']) + self._symbol_list = None + self._symbol_name_map = None + + def num_symbols(self): + """ Number of symbols in the table recovered from DT_SYMTAB + """ + if self._symbol_list is None: + self._symbol_list = list(self.iter_symbols()) + return len(self._symbol_list) + + def get_symbol(self, index): + """ Get the symbol at index #index from the table (Symbol object) + """ + if self._symbol_list is None: + self._symbol_list = list(self.iter_symbols()) + return self._symbol_list[index] + + def get_symbol_by_name(self, name): + """ Get a symbol(s) by name. Return None if no symbol by the given name + exists. + """ + # The first time this method is called, construct a name to number + # mapping + # + if self._symbol_name_map is None: + self._symbol_name_map = defaultdict(list) + for i, sym in enumerate(self.iter_symbols()): + self._symbol_name_map[sym.name].append(i) + symnums = self._symbol_name_map.get(name) + return [self.get_symbol(i) for i in symnums] if symnums else None def iter_symbols(self): """ Yield all symbols in this dynamic segment. The symbols are usually @@ -239,33 +271,56 @@ class DynamicSegment(Segment, Dynamic): symbol_size = self.elfstructs.Elf_Sym.sizeof() - # Find closest higher pointer than tab_ptr. We'll use that to mark the - # end of the symbol table. - nearest_ptr = None - for tag in self.iter_tags(): - tag_ptr = tag['d_ptr'] - if tag['d_tag'] == 'DT_SYMENT': - if symbol_size != tag['d_val']: - # DT_SYMENT is the size of one symbol entry. It must be the - # same as returned by Elf_Sym.sizeof. - raise ELFError('DT_SYMENT (%d) != Elf_Sym (%d).' % - (tag['d_val'], symbol_size)) - if (tag_ptr > tab_ptr and - (nearest_ptr is None or nearest_ptr > tag_ptr)): - nearest_ptr = tag_ptr - - if nearest_ptr is None: - # Use the end of segment that contains DT_SYMTAB. - for segment in self.elffile.iter_segments(): - if (segment['p_vaddr'] <= tab_ptr and - tab_ptr <= (segment['p_vaddr'] + segment['p_filesz'])): - nearest_ptr = segment['p_vaddr'] + segment['p_filesz'] - - if nearest_ptr is None: + end_ptr = None + + # Check if a DT_GNU_HASH tag exists and recover the number of symbols + # from the corresponding section + _, gnu_hash_offset = self.get_table_offset('DT_GNU_HASH') + if gnu_hash_offset is not None: + hash_section = GNUHashSection(self.stream, gnu_hash_offset, + self.elffile) + end_ptr = tab_ptr + \ + hash_section.get_number_of_symbols() * symbol_size + + # If DT_GNU_HASH did not exist, maybe we can use DT_HASH + if end_ptr is None: + _, hash_offset = self.get_table_offset('DT_HASH') + if hash_offset is not None: + hash_section = HashSection(self.stream, hash_offset, + self.elffile) + end_ptr = tab_ptr + \ + hash_section.get_number_of_symbols() * symbol_size + + if end_ptr is None: + # Find closest higher pointer than tab_ptr. We'll use that to mark + # the end of the symbol table. + nearest_ptr = None + for tag in self.iter_tags(): + tag_ptr = tag['d_ptr'] + if tag['d_tag'] == 'DT_SYMENT': + if symbol_size != tag['d_val']: + # DT_SYMENT is the size of one symbol entry. It must be + # the same as returned by Elf_Sym.sizeof. + raise ELFError('DT_SYMENT (%d) != Elf_Sym (%d).' % + (tag['d_val'], symbol_size)) + if (tag_ptr > tab_ptr and + (nearest_ptr is None or nearest_ptr > tag_ptr)): + nearest_ptr = tag_ptr + + if nearest_ptr is None: + # Use the end of segment that contains DT_SYMTAB. + for segment in self.elffile.iter_segments(): + if (segment['p_vaddr'] <= tab_ptr and + tab_ptr <= (segment['p_vaddr'] + segment['p_filesz'])): + nearest_ptr = segment['p_vaddr'] + segment['p_filesz'] + + end_ptr = nearest_ptr + + if end_ptr is None: raise ELFError('Cannot determine the end of DT_SYMTAB.') string_table = self._get_stringtable() - for i in range((nearest_ptr - tab_ptr) // symbol_size): + for i in range((end_ptr - tab_ptr) // symbol_size): symbol = struct_parse(self.elfstructs.Elf_Sym, self._stream, i * symbol_size + tab_offset) symbol_name = string_table.get_string(symbol['st_name']) diff --git a/elftools/elf/hash.py b/elftools/elf/hash.py new file mode 100644 index 0000000..3c39f8b --- /dev/null +++ b/elftools/elf/hash.py @@ -0,0 +1,77 @@ +#------------------------------------------------------------------------------- +# elftools: elf/hash.py +# +# ELF hash table sections +# +# Andreas Ziegler (andreas.ziegler@fau.de) +# This code is in the public domain +#------------------------------------------------------------------------------- + +from ..common.utils import struct_parse + + +class HashSection(object): + """ Minimal part of an ELF hash section to find the number of symbols in the + symbol table - useful for super-stripped binaries without section + headers where only the start of the symbol table is known from the + dynamic segment. The layout and contents are nicely described at + https://flapenguin.me/2017/04/24/elf-lookup-dt-hash/. + """ + def __init__(self, stream, offset, elffile): + self._stream = stream + self._offset = offset + self._elffile = elffile + self.params = struct_parse(self._elffile.structs.Elf_Hash, + self._stream, + self._offset) + + def get_number_of_symbols(self): + """ Get the number of symbols from the hash table parameters. + """ + return self.params['nchains'] + + +class GNUHashSection(object): + """ Minimal part of a GNU hash section to find the number of symbols in the + symbol table - useful for super-stripped binaries without section + headers where only the start of the symbol table is known from the + dynamic segment. The layout and contents are nicely described at + https://flapenguin.me/2017/05/10/elf-lookup-dt-gnu-hash/. + """ + def __init__(self, stream, offset, elffile): + self._stream = stream + self._offset = offset + self._elffile = elffile + self.params = struct_parse(self._elffile.structs.Gnu_Hash, + self._stream, + self._offset) + + def get_number_of_symbols(self): + """ Get the number of symbols in the hash table by finding the bucket + with the highest symbol index and walking to the end of its chain. + """ + # Element sizes in the hash table + wordsize = self._elffile.structs.Elf_word('').sizeof() + xwordsize = self._elffile.structs.Elf_xword('').sizeof() + + # Find highest index in buckets array + max_idx = max(self.params['buckets']) + if max_idx < self.params['symoffset']: + return self.params['symoffset'] + + # Position the stream at the start of the corresponding chain + chain_pos = self._offset + 4 * wordsize + \ + self.params['bloom_size'] * xwordsize + \ + self.params['nbuckets'] * wordsize + \ + (max_idx - self.params['symoffset']) * wordsize + + # Walk the chain to its end (lowest bit is set) + while True: + cur_hash = struct_parse(self._elffile.structs.Elf_word('elem'), + self._stream, + chain_pos) + if cur_hash & 1: + return max_idx + 1 + + max_idx += 1 + chain_pos += wordsize diff --git a/elftools/elf/structs.py b/elftools/elf/structs.py index 660f687..6b5610b 100644 --- a/elftools/elf/structs.py +++ b/elftools/elf/structs.py @@ -90,6 +90,8 @@ class ELFStructs(object): self._create_note(e_type) self._create_stabs() self._create_arm_attributes() + self._create_elf_hash() + self._create_gnu_hash() #-------------------------------- PRIVATE --------------------------------# @@ -398,3 +400,29 @@ class ELFStructs(object): Enum(self.Elf_uleb128('tag'), **ENUM_ATTR_TAG_ARM) ) + + def _create_elf_hash(self): + # Structure of the old SYSV-style hash table header. It is documented + # in the Oracle "Linker and Libraries Guide", Part IV ELF Application + # Binary Interface, Chapter 14 Object File Format, Section Hash Table + # Section: + # https://docs.oracle.com/cd/E53394_01/html/E54813/chapter6-48031.html + + self.Elf_Hash = Struct('Elf_Hash', + self.Elf_word('nbuckets'), + self.Elf_word('nchains'), + Array(lambda ctx: ctx['nbuckets'], self.Elf_word('buckets')), + Array(lambda ctx: ctx['nchains'], self.Elf_word('chains'))) + + def _create_gnu_hash(self): + # Structure of the GNU-style hash table header. Documentation for this + # table is mostly in the GLIBC source code, a good explanation of the + # format can be found in this blog post: + # https://flapenguin.me/2017/05/10/elf-lookup-dt-gnu-hash/ + self.Gnu_Hash = Struct('Gnu_Hash', + self.Elf_word('nbuckets'), + self.Elf_word('symoffset'), + self.Elf_word('bloom_size'), + self.Elf_word('bloom_shift'), + Array(lambda ctx: ctx['bloom_size'], self.Elf_xword('bloom')), + Array(lambda ctx: ctx['nbuckets'], self.Elf_word('buckets'))) diff --git a/test/test_dynamic.py b/test/test_dynamic.py index c55fc2e..1f48362 100644 --- a/test/test_dynamic.py +++ b/test/test_dynamic.py @@ -54,8 +54,9 @@ class TestDynamic(unittest.TestCase): exp = ['libc.so.6'] self.assertEqual(libs, exp) - def test_reading_symbols(self): - """Verify we can read symbol table without SymbolTableSection""" + def test_reading_symbols_elf_hash(self): + """ Verify we can read symbol table without SymbolTableSection but with + a SYSV-style symbol hash table""" with open(os.path.join('test', 'testfiles_for_unittests', 'aarch64_super_stripped.elf'), 'rb') as f: elf = ELFFile(f) @@ -63,10 +64,41 @@ class TestDynamic(unittest.TestCase): if segment.header.p_type != 'PT_DYNAMIC': continue + num_symbols = segment.num_symbols() symbol_names = [x.name for x in segment.iter_symbols()] + symbol_at_index_3 = segment.get_symbol(3) + symbols_abort = segment.get_symbol_by_name('abort') + self.assertEqual(num_symbols, 4) exp = ['', '__libc_start_main', '__gmon_start__', 'abort'] self.assertEqual(symbol_names, exp) + self.assertEqual(symbol_at_index_3.name, 'abort') + self.assertIsNotNone(symbols_abort) + self.assertEqual(symbols_abort[0], symbol_at_index_3) + + def test_reading_symbols_gnu_hash(self): + """ Verify we can read symbol table without SymbolTableSection but with + a GNU symbol hash table""" + with open(os.path.join('test', 'testfiles_for_unittests', + 'android_dyntags.elf'), 'rb') as f: + elf = ELFFile(f) + for segment in elf.iter_segments(): + if segment.header.p_type != 'PT_DYNAMIC': + continue + + num_symbols = segment.num_symbols() + symbol_names = [x.name for x in segment.iter_symbols()] + symbol_at_index_3 = segment.get_symbol(3) + symbols_atfork = segment.get_symbol_by_name('__register_atfork') + + self.assertEqual(num_symbols, 212) + exp = ['', '__cxa_finalize' , '__cxa_atexit', '__register_atfork', + '__stack_chk_fail', '_ZNK7android7RefBase9decStrongEPKv', + '_ZN7android7RefBaseD2Ev', '_ZdlPv', 'pthread_mutex_lock'] + self.assertEqual(symbol_names[:9], exp) + self.assertEqual(symbol_at_index_3.name, '__register_atfork') + self.assertIsNotNone(symbols_atfork) + self.assertEqual(symbols_atfork[0], symbol_at_index_3) def test_sunw_tags(self): def extract_sunw(filename): diff --git a/test/test_hash.py b/test/test_hash.py new file mode 100644 index 0000000..9fab30e --- /dev/null +++ b/test/test_hash.py @@ -0,0 +1,47 @@ +#------------------------------------------------------------------------------- +# elftools tests +# +# Andreas Ziegler (andreas.ziegler@fau.de) +# This code is in the public domain +#------------------------------------------------------------------------------- +import unittest +import os + +from elftools.elf.elffile import ELFFile +from elftools.common.exceptions import ELFError +from elftools.elf.hash import HashSection, GNUHashSection + +class TestELFHash(unittest.TestCase): + def test_get_number_of_syms(self): + """ Verify we can get get the number of symbols from an ELF hash + section. + """ + + with open(os.path.join('test', 'testfiles_for_unittests', + 'aarch64_super_stripped.elf'), 'rb') as f: + elf = ELFFile(f) + for segment in elf.iter_segments(): + if segment.header.p_type != 'PT_DYNAMIC': + continue + + _, hash_offset = segment.get_table_offset('DT_HASH') + hash_section = HashSection(elf.stream, hash_offset, elf) + self.assertEqual(hash_section.get_number_of_symbols(), 4) + + +class TestGNUHash(unittest.TestCase): + def test_get_number_of_syms(self): + """ Verify we can get get the number of symbols from a GNU hash + section. + """ + + with open(os.path.join('test', 'testfiles_for_unittests', + 'lib_versioned64.so.1.elf'), 'rb') as f: + elf = ELFFile(f) + for segment in elf.iter_segments(): + if segment.header.p_type != 'PT_DYNAMIC': + continue + + _, hash_offset = segment.get_table_offset('DT_GNU_HASH') + hash_section = GNUHashSection(elf.stream, hash_offset, elf) + self.assertEqual(hash_section.get_number_of_symbols(), 24)