#include <string.h>
#include "ac_binary.h"
+#include "ac_gpu_info.h"
+#include "util/u_dynarray.h"
#include "util/u_math.h"
// Old distributions may not have this enum constant
#define MY_EM_AMDGPU 224
+#ifndef STT_AMDGPU_LDS
+#define STT_AMDGPU_LDS 13 // this is deprecated -- remove
+#endif
+
+#ifndef SHN_AMDGPU_LDS
+#define SHN_AMDGPU_LDS 0xff00
+#endif
+
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE 0
#define R_AMDGPU_ABS32_LO 1
static void report_erroraf(const char *fmt, va_list va)
{
char *msg;
- int ret = asprintf(&msg, fmt, va);
+ int ret = vasprintf(&msg, fmt, va);
if (ret < 0)
- msg = "(asprintf failed)";
+ msg = "(vasprintf failed)";
fprintf(stderr, "ac_rtld error: %s\n", msg);
fprintf(stderr, "ELF error: %s\n", elf_errmsg(elf_errno()));
}
+/**
+ * Look up a symbol in a dynarray of struct ac_rtld_symbol by \p name and
+ * shader \p part_idx.
+ *
+ * An entry matches when its name compares equal to \p name and its part_idx
+ * is either \p part_idx or ~0u (an entry with part_idx ~0u matches every
+ * part). Entries are scanned in array order and the first match is returned.
+ *
+ * \param symbols  dynarray whose elements are struct ac_rtld_symbol
+ * \param name     NUL-terminated symbol name to search for
+ * \param part_idx index of the shader part performing the lookup
+ * \return pointer to the matching element inside \p symbols, or NULL if no
+ *         element matches
+ */
+static const struct ac_rtld_symbol *find_symbol(const struct util_dynarray *symbols,
+                                                const char *name, unsigned part_idx)
+{
+    util_dynarray_foreach(symbols, struct ac_rtld_symbol, symbol) {
+        bool part_matches = symbol->part_idx == ~0u || symbol->part_idx == part_idx;
+
+        /* Only compare names once the part filter has passed. */
+        if (part_matches && strcmp(name, symbol->name) == 0)
+            return symbol;
+    }
+    return NULL;
+}
+
+/**
+ * qsort() comparator that orders struct ac_rtld_symbol entries by decreasing
+ * alignment.
+ *
+ * \return -1 if \p lhsp has the larger align, 1 if \p rhsp has the larger
+ *         align, 0 if both are equal
+ */
+static int compare_symbol_by_align(const void *lhsp, const void *rhsp)
+{
+    const struct ac_rtld_symbol *first = lhsp;
+    const struct ac_rtld_symbol *second = rhsp;
+
+    /* Each comparison evaluates to 0 or 1, so the difference is -1, 0 or 1. */
+    return (second->align > first->align) - (second->align < first->align);
+}
+
+/**
+ * Sort the given symbol list by decreasing alignment and assign offsets.
+ *
+ * Symbols are placed one after another, starting at the offset passed in
+ * through \p ptotal_size. Each symbol's offset is the running size passed
+ * through align64() with the symbol's alignment.
+ *
+ * \param symbols      array of symbols; it is reordered in place and the
+ *                     offset field of every placed entry is written
+ * \param num_symbols  number of entries in \p symbols (may be 0)
+ * \param ptotal_size  in: offset at which placement starts;
+ *                     out: end offset after the last symbol. Only written
+ *                     when the function returns true.
+ * \return true on success, false (after reporting an error) if the running
+ *         size would wrap around
+ */
+static bool layout_symbols(struct ac_rtld_symbol *symbols, unsigned num_symbols,
+                           uint64_t *ptotal_size)
+{
+    /* Nothing to place: the total is unchanged. Returning early also avoids
+     * handing qsort() a possibly-NULL base pointer for an empty list. */
+    if (num_symbols == 0)
+        return true;
+
+    qsort(symbols, num_symbols, sizeof(*symbols), compare_symbol_by_align);
+
+    uint64_t total_size = *ptotal_size;
+
+    for (unsigned i = 0; i < num_symbols; ++i) {
+        struct ac_rtld_symbol *s = &symbols[i];
+        assert(util_is_power_of_two_nonzero(s->align));
+
+        uint64_t offset = align64(total_size, s->align);
+
+        /* Both the alignment step and the size addition can wrap around. */
+        if (offset < total_size || offset + s->size < offset) {
+            report_errorf("%s: size overflow", __FUNCTION__);
+            return false;
+        }
+
+        s->offset = offset;
+        total_size = offset + s->size;
+    }
+
+    *ptotal_size = total_size;
+    return true;
+}
+
+/**
+ * Read LDS symbols from the given \p section of the ELF of \p part and append
+ * them to the LDS symbols list.
+ *
+ * Shared LDS symbols are filtered out.
+ *
+ * A symbol table entry is treated as an LDS symbol when its type is
+ * STT_AMDGPU_LDS (alignment taken as 1 << (st_other >> 3)) or its section
+ * index is SHN_AMDGPU_LDS (alignment taken from st_value and required to be
+ * a non-zero power of two). Alignments are clamped to 1 << 16 and sizes
+ * above 1 << 29 are rejected. All other entries are ignored.
+ *
+ * The symbol named "__lds_end" is never appended: it must have size 0, and
+ * its alignment is folded into \p lds_end_align by taking the maximum.
+ *
+ * A symbol that find_symbol() already locates in binary->lds_symbols for
+ * this part is not appended again; it must not ask for a larger alignment
+ * or size than the existing entry.
+ *
+ * \param binary         binary whose lds_symbols list is extended
+ * \param part_idx       index into binary->parts of the part being read
+ * \param section        ELF section holding Elf64_Sym entries
+ * \param lds_end_align  in/out: running maximum alignment of "__lds_end"
+ * \return true on success, false after reporting an error
+ */
+static bool read_private_lds_symbols(struct ac_rtld_binary *binary,
+                                     unsigned part_idx,
+                                     Elf_Scn *section,
+                                     uint32_t *lds_end_align)
+{
+#define report_if(cond) \
+    do { \
+        if ((cond)) { \
+            report_errorf(#cond); \
+            return false; \
+        } \
+    } while (false)
+#define report_elf_if(cond) \
+    do { \
+        if ((cond)) { \
+            report_elf_errorf(#cond); \
+            return false; \
+        } \
+    } while (false)
+
+    struct ac_rtld_part *part = &binary->parts[part_idx];
+
+    /* elf64_getshdr() returns NULL on failure; check before reading sh_link. */
+    Elf64_Shdr *shdr = elf64_getshdr(section);
+    report_elf_if(!shdr);
+    uint32_t strtabidx = shdr->sh_link;
+
+    Elf_Data *symbols_data = elf_getdata(section, NULL);
+    report_elf_if(!symbols_data);
+
+    const Elf64_Sym *symbol = symbols_data->d_buf;
+    size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
+
+    for (size_t j = 0; j < num_symbols; ++j, ++symbol) {
+        struct ac_rtld_symbol s = {0};
+
+        if (ELF64_ST_TYPE(symbol->st_info) == STT_AMDGPU_LDS) {
+            /* old-style LDS symbols from initial prototype -- remove eventually */
+            s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16);
+        } else if (symbol->st_shndx == SHN_AMDGPU_LDS) {
+            s.align = MIN2(symbol->st_value, 1u << 16);
+            report_if(!util_is_power_of_two_nonzero(s.align));
+        } else {
+            continue;
+        }
+
+        report_if(symbol->st_size > 1u << 29);
+
+        /* elf_strptr() returns NULL for an invalid string table index or
+         * offset; the name is passed to strcmp() below, so bail out here. */
+        s.name = elf_strptr(part->elf, strtabidx, symbol->st_name);
+        report_elf_if(!s.name);
+        s.size = symbol->st_size;
+        s.part_idx = part_idx;
+
+        /* The checks below validate symbol contents; no libelf call has
+         * failed, so they use report_if rather than report_elf_if. */
+        if (!strcmp(s.name, "__lds_end")) {
+            report_if(s.size != 0);
+            *lds_end_align = MAX2(*lds_end_align, s.align);
+            continue;
+        }
+
+        const struct ac_rtld_symbol *shared =
+            find_symbol(&binary->lds_symbols, s.name, part_idx);
+        if (shared) {
+            report_if(s.align > shared->align);
+            report_if(s.size > shared->size);
+            continue;
+        }
+
+        util_dynarray_append(&binary->lds_symbols, struct ac_rtld_symbol, s);
+    }
+
+    return true;
+
+#undef report_if
+#undef report_elf_if
+}
+
/**
* Open a binary consisting of one or more shader parts.
*
* \param binary the uninitialized struct
- * \param num_parts number of shader parts
- * \param elf_ptrs pointers to the in-memory ELF objects for each shader part
- * \param elf_sizes sizes (in bytes) of the in-memory ELF objects
+ * \param i binary opening parameters
*/
-bool ac_rtld_open(struct ac_rtld_binary *binary, unsigned num_parts,
- const char * const *elf_ptrs,
- const size_t *elf_sizes)
+bool ac_rtld_open(struct ac_rtld_binary *binary,
+ struct ac_rtld_open_info i)
{
/* One of the libelf implementations
* (http://www.mr511.de/software/english.htm) requires calling
elf_version(EV_CURRENT);
memset(binary, 0, sizeof(*binary));
- binary->num_parts = num_parts;
- binary->parts = calloc(sizeof(*binary->parts), num_parts);
+ memcpy(&binary->options, &i.options, sizeof(binary->options));
+ binary->wave_size = i.wave_size;
+ binary->num_parts = i.num_parts;
+ binary->parts = calloc(sizeof(*binary->parts), i.num_parts);
if (!binary->parts)
return false;
uint64_t pasted_text_size = 0;
uint64_t rx_align = 1;
uint64_t rx_size = 0;
+ uint64_t exec_size = 0;
#define report_if(cond) \
do { \
} \
} while (false)
- /* First pass over all parts: open ELFs and determine the placement of
- * sections in the memory image. */
- for (unsigned i = 0; i < num_parts; ++i) {
- struct ac_rtld_part *part = &binary->parts[i];
- part->elf = elf_memory((char *)elf_ptrs[i], elf_sizes[i]);
+ /* Copy and layout shared LDS symbols. */
+ if (i.num_shared_lds_symbols) {
+ if (!util_dynarray_resize(&binary->lds_symbols, struct ac_rtld_symbol,
+ i.num_shared_lds_symbols))
+ goto fail;
+
+ memcpy(binary->lds_symbols.data, i.shared_lds_symbols, binary->lds_symbols.size);
+ }
+
+ util_dynarray_foreach(&binary->lds_symbols, struct ac_rtld_symbol, symbol)
+ symbol->part_idx = ~0u;
+
+ unsigned max_lds_size = 64 * 1024;
+
+ if (i.info->chip_class == GFX6 ||
+ (i.shader_type != MESA_SHADER_COMPUTE &&
+ i.shader_type != MESA_SHADER_FRAGMENT))
+ max_lds_size = 32 * 1024;
+
+ uint64_t shared_lds_size = 0;
+ if (!layout_symbols(binary->lds_symbols.data, i.num_shared_lds_symbols, &shared_lds_size))
+ goto fail;
+
+ if (shared_lds_size > max_lds_size) {
+ fprintf(stderr, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n",
+ (unsigned)shared_lds_size, max_lds_size);
+ goto fail;
+ }
+ binary->lds_size = shared_lds_size;
+
+ /* First pass over all parts: open ELFs, pre-determine the placement of
+ * sections in the memory image, and collect and layout private LDS symbols. */
+ uint32_t lds_end_align = 0;
+
+ if (binary->options.halt_at_entry)
+ pasted_text_size += 4;
+
+ for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
+ struct ac_rtld_part *part = &binary->parts[part_idx];
+ unsigned part_lds_symbols_begin =
+ util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol);
+
+ part->elf = elf_memory((char *)i.elf_ptrs[part_idx], i.elf_sizes[part_idx]);
report_elf_if(!part->elf);
const Elf64_Ehdr *ehdr = elf64_getehdr(part->elf);
if (!strcmp(s->name, ".text"))
s->is_pasted_text = true;
+
+ exec_size += shdr->sh_size;
}
if (s->is_pasted_text) {
s->offset = rx_size;
rx_size += shdr->sh_size;
}
+ } else if (shdr->sh_type == SHT_SYMTAB) {
+ if (!read_private_lds_symbols(binary, part_idx, section, &lds_end_align))
+ goto fail;
}
}
+
+ uint64_t part_lds_size = shared_lds_size;
+ if (!layout_symbols(
+ util_dynarray_element(&binary->lds_symbols, struct ac_rtld_symbol, part_lds_symbols_begin),
+ util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol) - part_lds_symbols_begin,
+ &part_lds_size))
+ goto fail;
+ binary->lds_size = MAX2(binary->lds_size, part_lds_size);
}
binary->rx_end_markers = pasted_text_size;
pasted_text_size += 4 * DEBUGGER_NUM_MARKERS;
+ /* __lds_end is a special symbol that points at the end of the memory
+ * occupied by other LDS symbols. Its alignment is taken as the
+ * maximum of its alignment over all shader parts where it occurs.
+ */
+ if (lds_end_align) {
+ binary->lds_size = align(binary->lds_size, lds_end_align);
+
+ struct ac_rtld_symbol *lds_end =
+ util_dynarray_grow(&binary->lds_symbols, struct ac_rtld_symbol, 1);
+ lds_end->name = "__lds_end";
+ lds_end->size = 0;
+ lds_end->align = lds_end_align;
+ lds_end->offset = binary->lds_size;
+ lds_end->part_idx = ~0u;
+ }
+
+ if (binary->lds_size > max_lds_size) {
+ fprintf(stderr, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n",
+ (unsigned)binary->lds_size, max_lds_size);
+ goto fail;
+ }
+
/* Second pass: Adjust offsets of non-pasted text sections. */
binary->rx_size = pasted_text_size;
binary->rx_size = align(binary->rx_size, rx_align);
- for (unsigned i = 0; i < num_parts; ++i) {
- struct ac_rtld_part *part = &binary->parts[i];
+ for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
+ struct ac_rtld_part *part = &binary->parts[part_idx];
size_t num_shdrs;
elf_getshdrnum(part->elf, &num_shdrs);
}
binary->rx_size += rx_size;
+ binary->exec_size = exec_size;
+
+ if (i.info->chip_class >= GFX10) {
+ /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords
+ * ahead of the PC, configurable by SH_MEM_CONFIG and
+ * S_INST_PREFETCH. This can cause two issues:
+ *
+ * (1) Crossing a page boundary to an unmapped page. The logic
+ * does not distinguish between a required fetch and a "mere"
+ * prefetch and will fault.
+ *
+ * (2) Prefetching instructions that will be changed for a
+ * different shader.
+ *
+ * (2) is not currently an issue because we flush the I$ at IB
+ * boundaries, but (1) needs to be addressed. Due to buffer
+ * suballocation, we just play it safe.
+ */
+ binary->rx_size = align(binary->rx_size + 3 * 64, 64);
+ }
return true;
elf_end(part->elf);
}
+ util_dynarray_fini(&binary->lds_symbols);
free(binary->parts);
binary->parts = NULL;
binary->num_parts = 0;
return get_section_by_name(&binary->parts[0], name, data, nbytes);
}
-bool ac_rtld_read_config(struct ac_rtld_binary *binary,
+bool ac_rtld_read_config(const struct radeon_info *info,
+ struct ac_rtld_binary *binary,
struct ac_shader_config *config)
{
for (unsigned i = 0; i < binary->num_parts; ++i) {
/* TODO: be precise about scratch use? */
struct ac_shader_config c = {};
- ac_parse_shader_binary_config(config_data, config_nbytes, true, &c);
+ ac_parse_shader_binary_config(config_data, config_nbytes,
+ binary->wave_size, true, info, &c);
config->num_sgprs = MAX2(config->num_sgprs, c.num_sgprs);
config->num_vgprs = MAX2(config->num_vgprs, c.num_vgprs);
unsigned part_idx, const Elf64_Sym *sym,
const char *name, uint64_t *value)
{
- if (sym->st_shndx == SHN_UNDEF) {
+ /* TODO: properly disentangle the undef and the LDS cases once
+ * STT_AMDGPU_LDS is retired. */
+ if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_AMDGPU_LDS) {
+ const struct ac_rtld_symbol *lds_sym =
+ find_symbol(&u->binary->lds_symbols, name, part_idx);
+
+ if (lds_sym) {
+ *value = lds_sym->offset;
+ return true;
+ }
+
/* TODO: resolve from other parts */
if (u->get_external_symbol(u->cb_data, name, value))
} \
} while (false)
- /* First pass: upload raw section data. */
+ if (u->binary->options.halt_at_entry) {
+ /* s_sethalt 1 */
+ *(uint32_t *)u->rx_ptr = util_cpu_to_le32(0xbf8d0001);
+ }
+
+ /* First pass: upload raw section data and lay out private LDS symbols. */
for (unsigned i = 0; i < u->binary->num_parts; ++i) {
struct ac_rtld_part *part = &u->binary->parts[i];
+
Elf_Scn *section = NULL;
while ((section = elf_nextscn(part->elf, section))) {
Elf64_Shdr *shdr = elf64_getshdr(section);