dynamic: parse DT_{GNU_}HASH for number of symbols
In ultra-stripped binaries we can find the symbol table by
parsing the dynamic segment and using the pointer in the
DT_SYMTAB tag as the base address. However, we don't know
anything about the number of symbols in the symbol table.
Earlier, this code relied on finding the closest pointer
value bigger than the base address of the symbol table. In
PIE executables and shared libraries however this method
could break as the pointer value for DT_SYMTAB is in the same
range as things like DT_RELASZ or DT_STRSZ, leading to a too
small number of symbols returned by iter_symbols().
The crashpad project has implemented a different strategy to
find the number of symbols: parsing the symbol lookup hash
tables (see [0]) as every symbol must have a corresponding
entry in the hash table. This commit implements this
behaviour for DynamicSegment, leaving the old code as a
backup if neither DT_HASH or DT_GNU_HASH tags have been
found.
For DT_HASH type tables, it is quite easy as the header
already contains the number of entries. For DT_GNU_HASH
things are a bit more complicated as we need to work forward
from the highest symbol referenced in the header (a good
explanation of the format can be found at [1]).
[0]: https://github.com/chromium/crashpad/commit/
1f1657d573c789aa36b6022440e34d9ec30d894c
[1]: https://flapenguin.me/2017/05/10/elf-lookup-dt-gnu-hash/
* dynamic: provide more functions for symbol access
So far, the DynamicSegment only provided a method to iterate
over all symbols but for some use cases it might be useful to
use the recovered symbol table more like a normal
SymbolTableSection.
To this end, provide get_symbol(index) to fetch a symbol by
its index, num_symbols() to get the total number of symbols
and get_symbol_by_name(name) to look for a list of symbols
with a given name.
#-------------------------------------------------------------------------------
import itertools
+from collections import defaultdict
+from .hash import HashSection, GNUHashSection
from .sections import Section, Symbol
from .enums import ENUM_D_TAG
from .segments import Segment
break
Segment.__init__(self, header, stream)
Dynamic.__init__(self, stream, elffile, stringtable, self['p_offset'])
+ self._symbol_list = None
+ self._symbol_name_map = None
+
+ def num_symbols(self):
+ """ Number of symbols in the table recovered from DT_SYMTAB
+ """
+ if self._symbol_list is None:
+ self._symbol_list = list(self.iter_symbols())
+ return len(self._symbol_list)
+
+ def get_symbol(self, index):
+ """ Get the symbol at index #index from the table (Symbol object)
+ """
+ if self._symbol_list is None:
+ self._symbol_list = list(self.iter_symbols())
+ return self._symbol_list[index]
+
+ def get_symbol_by_name(self, name):
+ """ Get a symbol(s) by name. Return None if no symbol by the given name
+ exists.
+ """
+ # The first time this method is called, construct a name to number
+ # mapping
+ #
+ if self._symbol_name_map is None:
+ self._symbol_name_map = defaultdict(list)
+ for i, sym in enumerate(self.iter_symbols()):
+ self._symbol_name_map[sym.name].append(i)
+ symnums = self._symbol_name_map.get(name)
+ return [self.get_symbol(i) for i in symnums] if symnums else None
def iter_symbols(self):
""" Yield all symbols in this dynamic segment. The symbols are usually
symbol_size = self.elfstructs.Elf_Sym.sizeof()
- # Find closest higher pointer than tab_ptr. We'll use that to mark the
- # end of the symbol table.
- nearest_ptr = None
- for tag in self.iter_tags():
- tag_ptr = tag['d_ptr']
- if tag['d_tag'] == 'DT_SYMENT':
- if symbol_size != tag['d_val']:
- # DT_SYMENT is the size of one symbol entry. It must be the
- # same as returned by Elf_Sym.sizeof.
- raise ELFError('DT_SYMENT (%d) != Elf_Sym (%d).' %
- (tag['d_val'], symbol_size))
- if (tag_ptr > tab_ptr and
- (nearest_ptr is None or nearest_ptr > tag_ptr)):
- nearest_ptr = tag_ptr
-
- if nearest_ptr is None:
- # Use the end of segment that contains DT_SYMTAB.
- for segment in self.elffile.iter_segments():
- if (segment['p_vaddr'] <= tab_ptr and
- tab_ptr <= (segment['p_vaddr'] + segment['p_filesz'])):
- nearest_ptr = segment['p_vaddr'] + segment['p_filesz']
-
- if nearest_ptr is None:
+ end_ptr = None
+
+ # Check if a DT_GNU_HASH tag exists and recover the number of symbols
+ # from the corresponding section
+ _, gnu_hash_offset = self.get_table_offset('DT_GNU_HASH')
+ if gnu_hash_offset is not None:
+ hash_section = GNUHashSection(self.stream, gnu_hash_offset,
+ self.elffile)
+ end_ptr = tab_ptr + \
+ hash_section.get_number_of_symbols() * symbol_size
+
+ # If DT_GNU_HASH did not exist, maybe we can use DT_HASH
+ if end_ptr is None:
+ _, hash_offset = self.get_table_offset('DT_HASH')
+ if hash_offset is not None:
+ hash_section = HashSection(self.stream, hash_offset,
+ self.elffile)
+ end_ptr = tab_ptr + \
+ hash_section.get_number_of_symbols() * symbol_size
+
+ if end_ptr is None:
+ # Find closest higher pointer than tab_ptr. We'll use that to mark
+ # the end of the symbol table.
+ nearest_ptr = None
+ for tag in self.iter_tags():
+ tag_ptr = tag['d_ptr']
+ if tag['d_tag'] == 'DT_SYMENT':
+ if symbol_size != tag['d_val']:
+ # DT_SYMENT is the size of one symbol entry. It must be
+ # the same as returned by Elf_Sym.sizeof.
+ raise ELFError('DT_SYMENT (%d) != Elf_Sym (%d).' %
+ (tag['d_val'], symbol_size))
+ if (tag_ptr > tab_ptr and
+ (nearest_ptr is None or nearest_ptr > tag_ptr)):
+ nearest_ptr = tag_ptr
+
+ if nearest_ptr is None:
+ # Use the end of segment that contains DT_SYMTAB.
+ for segment in self.elffile.iter_segments():
+ if (segment['p_vaddr'] <= tab_ptr and
+ tab_ptr <= (segment['p_vaddr'] + segment['p_filesz'])):
+ nearest_ptr = segment['p_vaddr'] + segment['p_filesz']
+
+ end_ptr = nearest_ptr
+
+ if end_ptr is None:
raise ELFError('Cannot determine the end of DT_SYMTAB.')
string_table = self._get_stringtable()
- for i in range((nearest_ptr - tab_ptr) // symbol_size):
+ for i in range((end_ptr - tab_ptr) // symbol_size):
symbol = struct_parse(self.elfstructs.Elf_Sym, self._stream,
i * symbol_size + tab_offset)
symbol_name = string_table.get_string(symbol['st_name'])
--- /dev/null
+#-------------------------------------------------------------------------------
+# elftools: elf/hash.py
+#
+# ELF hash table sections
+#
+# Andreas Ziegler (andreas.ziegler@fau.de)
+# This code is in the public domain
+#-------------------------------------------------------------------------------
+
+from ..common.utils import struct_parse
+
+
+class HashSection(object):
+ """ Minimal part of an ELF hash section to find the number of symbols in the
+ symbol table - useful for super-stripped binaries without section
+ headers where only the start of the symbol table is known from the
+ dynamic segment. The layout and contents are nicely described at
+ https://flapenguin.me/2017/04/24/elf-lookup-dt-hash/.
+ """
+ def __init__(self, stream, offset, elffile):
+ self._stream = stream
+ self._offset = offset
+ self._elffile = elffile
+ self.params = struct_parse(self._elffile.structs.Elf_Hash,
+ self._stream,
+ self._offset)
+
+ def get_number_of_symbols(self):
+ """ Get the number of symbols from the hash table parameters.
+ """
+ return self.params['nchains']
+
+
+class GNUHashSection(object):
+ """ Minimal part of a GNU hash section to find the number of symbols in the
+ symbol table - useful for super-stripped binaries without section
+ headers where only the start of the symbol table is known from the
+ dynamic segment. The layout and contents are nicely described at
+ https://flapenguin.me/2017/05/10/elf-lookup-dt-gnu-hash/.
+ """
+ def __init__(self, stream, offset, elffile):
+ self._stream = stream
+ self._offset = offset
+ self._elffile = elffile
+ self.params = struct_parse(self._elffile.structs.Gnu_Hash,
+ self._stream,
+ self._offset)
+
+ def get_number_of_symbols(self):
+ """ Get the number of symbols in the hash table by finding the bucket
+ with the highest symbol index and walking to the end of its chain.
+ """
+ # Element sizes in the hash table
+ wordsize = self._elffile.structs.Elf_word('').sizeof()
+ xwordsize = self._elffile.structs.Elf_xword('').sizeof()
+
+ # Find highest index in buckets array
+ max_idx = max(self.params['buckets'])
+ if max_idx < self.params['symoffset']:
+ return self.params['symoffset']
+
+ # Position the stream at the start of the corresponding chain
+ chain_pos = self._offset + 4 * wordsize + \
+ self.params['bloom_size'] * xwordsize + \
+ self.params['nbuckets'] * wordsize + \
+ (max_idx - self.params['symoffset']) * wordsize
+
+ # Walk the chain to its end (lowest bit is set)
+ while True:
+ cur_hash = struct_parse(self._elffile.structs.Elf_word('elem'),
+ self._stream,
+ chain_pos)
+ if cur_hash & 1:
+ return max_idx + 1
+
+ max_idx += 1
+ chain_pos += wordsize
self._create_note(e_type)
self._create_stabs()
self._create_arm_attributes()
+ self._create_elf_hash()
+ self._create_gnu_hash()
#-------------------------------- PRIVATE --------------------------------#
Enum(self.Elf_uleb128('tag'),
**ENUM_ATTR_TAG_ARM)
)
+
+ def _create_elf_hash(self):
+ # Structure of the old SYSV-style hash table header. It is documented
+ # in the Oracle "Linker and Libraries Guide", Part IV ELF Application
+ # Binary Interface, Chapter 14 Object File Format, Section Hash Table
+ # Section:
+ # https://docs.oracle.com/cd/E53394_01/html/E54813/chapter6-48031.html
+
+ self.Elf_Hash = Struct('Elf_Hash',
+ self.Elf_word('nbuckets'),
+ self.Elf_word('nchains'),
+ Array(lambda ctx: ctx['nbuckets'], self.Elf_word('buckets')),
+ Array(lambda ctx: ctx['nchains'], self.Elf_word('chains')))
+
+ def _create_gnu_hash(self):
+ # Structure of the GNU-style hash table header. Documentation for this
+ # table is mostly in the GLIBC source code, a good explanation of the
+ # format can be found in this blog post:
+ # https://flapenguin.me/2017/05/10/elf-lookup-dt-gnu-hash/
+ self.Gnu_Hash = Struct('Gnu_Hash',
+ self.Elf_word('nbuckets'),
+ self.Elf_word('symoffset'),
+ self.Elf_word('bloom_size'),
+ self.Elf_word('bloom_shift'),
+ Array(lambda ctx: ctx['bloom_size'], self.Elf_xword('bloom')),
+ Array(lambda ctx: ctx['nbuckets'], self.Elf_word('buckets')))
exp = ['libc.so.6']
self.assertEqual(libs, exp)
- def test_reading_symbols(self):
- """Verify we can read symbol table without SymbolTableSection"""
+ def test_reading_symbols_elf_hash(self):
+ """ Verify we can read symbol table without SymbolTableSection but with
+ a SYSV-style symbol hash table"""
with open(os.path.join('test', 'testfiles_for_unittests',
'aarch64_super_stripped.elf'), 'rb') as f:
elf = ELFFile(f)
if segment.header.p_type != 'PT_DYNAMIC':
continue
+ num_symbols = segment.num_symbols()
symbol_names = [x.name for x in segment.iter_symbols()]
+ symbol_at_index_3 = segment.get_symbol(3)
+ symbols_abort = segment.get_symbol_by_name('abort')
+ self.assertEqual(num_symbols, 4)
exp = ['', '__libc_start_main', '__gmon_start__', 'abort']
self.assertEqual(symbol_names, exp)
+ self.assertEqual(symbol_at_index_3.name, 'abort')
+ self.assertIsNotNone(symbols_abort)
+ self.assertEqual(symbols_abort[0], symbol_at_index_3)
+
+ def test_reading_symbols_gnu_hash(self):
+ """ Verify we can read symbol table without SymbolTableSection but with
+ a GNU symbol hash table"""
+ with open(os.path.join('test', 'testfiles_for_unittests',
+ 'android_dyntags.elf'), 'rb') as f:
+ elf = ELFFile(f)
+ for segment in elf.iter_segments():
+ if segment.header.p_type != 'PT_DYNAMIC':
+ continue
+
+ num_symbols = segment.num_symbols()
+ symbol_names = [x.name for x in segment.iter_symbols()]
+ symbol_at_index_3 = segment.get_symbol(3)
+ symbols_atfork = segment.get_symbol_by_name('__register_atfork')
+
+ self.assertEqual(num_symbols, 212)
+ exp = ['', '__cxa_finalize' , '__cxa_atexit', '__register_atfork',
+ '__stack_chk_fail', '_ZNK7android7RefBase9decStrongEPKv',
+ '_ZN7android7RefBaseD2Ev', '_ZdlPv', 'pthread_mutex_lock']
+ self.assertEqual(symbol_names[:9], exp)
+ self.assertEqual(symbol_at_index_3.name, '__register_atfork')
+ self.assertIsNotNone(symbols_atfork)
+ self.assertEqual(symbols_atfork[0], symbol_at_index_3)
def test_sunw_tags(self):
def extract_sunw(filename):
--- /dev/null
+#-------------------------------------------------------------------------------
+# elftools tests
+#
+# Andreas Ziegler (andreas.ziegler@fau.de)
+# This code is in the public domain
+#-------------------------------------------------------------------------------
+import unittest
+import os
+
+from elftools.elf.elffile import ELFFile
+from elftools.common.exceptions import ELFError
+from elftools.elf.hash import HashSection, GNUHashSection
+
+class TestELFHash(unittest.TestCase):
+ def test_get_number_of_syms(self):
+ """ Verify we can get get the number of symbols from an ELF hash
+ section.
+ """
+
+ with open(os.path.join('test', 'testfiles_for_unittests',
+ 'aarch64_super_stripped.elf'), 'rb') as f:
+ elf = ELFFile(f)
+ for segment in elf.iter_segments():
+ if segment.header.p_type != 'PT_DYNAMIC':
+ continue
+
+ _, hash_offset = segment.get_table_offset('DT_HASH')
+ hash_section = HashSection(elf.stream, hash_offset, elf)
+ self.assertEqual(hash_section.get_number_of_symbols(), 4)
+
+
+class TestGNUHash(unittest.TestCase):
+ def test_get_number_of_syms(self):
+ """ Verify we can get get the number of symbols from a GNU hash
+ section.
+ """
+
+ with open(os.path.join('test', 'testfiles_for_unittests',
+ 'lib_versioned64.so.1.elf'), 'rb') as f:
+ elf = ELFFile(f)
+ for segment in elf.iter_segments():
+ if segment.header.p_type != 'PT_DYNAMIC':
+ continue
+
+ _, hash_offset = segment.get_table_offset('DT_GNU_HASH')
+ hash_section = GNUHashSection(elf.stream, hash_offset, elf)
+ self.assertEqual(hash_section.get_number_of_symbols(), 24)