nir,amd: remove trinary_minmax opcodes
[mesa.git] / src / amd / common / ac_rtld.c
index 996ff045b16d881c5bca2ab237fc8a36d9fc92ed..e512b8f73275686b228297602b94c6b876667443 100644 (file)
 #include <string.h>
 
 #include "ac_binary.h"
+#include "ac_gpu_info.h"
+#include "util/u_dynarray.h"
 #include "util/u_math.h"
 
 // Old distributions may not have this enum constant
 #define MY_EM_AMDGPU 224
 
+#ifndef STT_AMDGPU_LDS
+#define STT_AMDGPU_LDS 13 // this is deprecated -- remove
+#endif
+
+#ifndef SHN_AMDGPU_LDS
+#define SHN_AMDGPU_LDS 0xff00
+#endif
+
 #ifndef R_AMDGPU_NONE
 #define R_AMDGPU_NONE 0
 #define R_AMDGPU_ABS32_LO 1
@@ -72,9 +82,9 @@ struct ac_rtld_part {
 static void report_erroraf(const char *fmt, va_list va)
 {
        char *msg;
-       int ret = asprintf(&msg, fmt, va);
+       int ret = vasprintf(&msg, fmt, va);
        if (ret < 0)
-               msg = "(asprintf failed)";
+               msg = "(vasprintf failed)";
 
        fprintf(stderr, "ac_rtld error: %s\n", msg);
 
@@ -104,17 +114,145 @@ static void report_elf_errorf(const char *fmt, ...)
        fprintf(stderr, "ELF error: %s\n", elf_errmsg(elf_errno()));
 }
 
+/**
+ * Look up an LDS symbol by \p name in a dynarray of struct ac_rtld_symbol.
+ *
+ * An entry matches when its name equals \p name and its part_idx is either
+ * ~0u or equal to \p part_idx. The first matching entry in array order is
+ * returned.
+ *
+ * \param symbols dynarray whose elements are struct ac_rtld_symbol
+ * \param name NUL-terminated symbol name to search for
+ * \param part_idx index of the shader part on whose behalf the lookup is done
+ * \return the matching entry, or NULL if no entry matches
+ */
+static const struct ac_rtld_symbol *find_symbol(const struct util_dynarray *symbols,
+                                               const char *name, unsigned part_idx)
+{
+       util_dynarray_foreach(symbols, struct ac_rtld_symbol, symbol) {
+               bool visible = symbol->part_idx == ~0u || symbol->part_idx == part_idx;
+
+               if (visible && !strcmp(name, symbol->name))
+                       return symbol;
+       }
+
+       /* Pointer-returning function: spell "not found" as NULL, not a bare 0. */
+       return NULL;
+}
+
+/**
+ * qsort() comparator that orders struct ac_rtld_symbol by descending align.
+ *
+ * \return 1 if \p rhsp has the larger align, -1 if it has the smaller one,
+ *         0 if both are equal
+ */
+static int compare_symbol_by_align(const void *lhsp, const void *rhsp)
+{
+       const struct ac_rtld_symbol *a = lhsp;
+       const struct ac_rtld_symbol *b = rhsp;
+
+       /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
+       return (b->align > a->align) - (b->align < a->align);
+}
+
+/**
+ * Sort the given symbol list by decreasing alignment and assign offsets.
+ *
+ * Symbols are placed one after another, starting at the incoming value of
+ * \p ptotal_size; each offset is rounded up to the symbol's alignment.
+ *
+ * \param symbols array of \p num_symbols entries; it is reordered in place
+ *                and every entry gets its offset field written
+ * \param num_symbols number of entries; 0 is allowed, in which case
+ *                    \p symbols is not accessed at all
+ * \param ptotal_size in: offset at which placement starts; out: end of the
+ *                    last placed symbol. Only written on success.
+ * \return true on success; false (after reporting an error) if the running
+ *         size would overflow 64 bits
+ */
+static bool layout_symbols(struct ac_rtld_symbol *symbols, unsigned num_symbols,
+                          uint64_t *ptotal_size)
+{
+       /* Nothing to lay out. Return before calling qsort(): callers pass the
+        * data pointer of a dynarray that may be empty (and therefore NULL),
+        * and qsort() requires a valid base pointer even for zero elements.
+        */
+       if (!num_symbols)
+               return true;
+
+       qsort(symbols, num_symbols, sizeof(*symbols), compare_symbol_by_align);
+
+       uint64_t total_size = *ptotal_size;
+
+       for (unsigned i = 0; i < num_symbols; ++i) {
+               struct ac_rtld_symbol *s = &symbols[i];
+               assert(util_is_power_of_two_nonzero(s->align));
+
+               uint64_t offset = align64(total_size, s->align);
+
+               /* Catch wrap-around from rounding up to the alignment as well
+                * as from adding the symbol size.
+                */
+               if (offset < total_size || offset + s->size < offset) {
+                       report_errorf("%s: size overflow", __FUNCTION__);
+                       return false;
+               }
+
+               s->offset = offset;
+               total_size = offset + s->size;
+       }
+
+       *ptotal_size = total_size;
+       return true;
+}
+
+/**
+ * Read LDS symbols from the given \p section of the ELF of \p part_idx and
+ * append them to the LDS symbols list of \p binary.
+ *
+ * A symbol is treated as an LDS symbol if its type is STT_AMDGPU_LDS or its
+ * section index is SHN_AMDGPU_LDS; all other symbols are ignored.
+ *
+ * Symbols that find_symbol() already resolves for this part (e.g. shared LDS
+ * symbols) are not appended again; they are only checked to not exceed the
+ * alignment and size of the existing entry. "__lds_end" is never appended
+ * either: only its alignment is folded into \p lds_end_align.
+ *
+ * \param binary the binary whose lds_symbols list is extended
+ * \param part_idx index of the shader part that owns \p section
+ * \param section the symbol table section to read
+ * \param lds_end_align in/out: raised to the alignment of "__lds_end" if this
+ *                      part declares it with a larger alignment
+ * \return true on success; false after reporting an error
+ */
+static bool read_private_lds_symbols(struct ac_rtld_binary *binary,
+                                    unsigned part_idx,
+                                    Elf_Scn *section,
+                                    uint32_t *lds_end_align)
+{
+#define report_if(cond) \
+       do { \
+               if ((cond)) { \
+                       report_errorf(#cond); \
+                       return false; \
+               } \
+       } while (false)
+#define report_elf_if(cond) \
+       do { \
+               if ((cond)) { \
+                       report_elf_errorf(#cond); \
+                       return false; \
+               } \
+       } while (false)
+
+       struct ac_rtld_part *part = &binary->parts[part_idx];
+
+       /* elf64_getshdr() returns NULL on failure; check before reading sh_link. */
+       Elf64_Shdr *shdr = elf64_getshdr(section);
+       report_elf_if(!shdr);
+
+       uint32_t strtabidx = shdr->sh_link;
+       Elf_Data *symbols_data = elf_getdata(section, NULL);
+       report_elf_if(!symbols_data);
+
+       const Elf64_Sym *symbol = symbols_data->d_buf;
+       size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
+
+       for (size_t j = 0; j < num_symbols; ++j, ++symbol) {
+               struct ac_rtld_symbol s = {};
+
+               if (ELF64_ST_TYPE(symbol->st_info) == STT_AMDGPU_LDS) {
+                       /* old-style LDS symbols from initial prototype -- remove eventually */
+                       s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16);
+               } else if (symbol->st_shndx == SHN_AMDGPU_LDS) {
+                       /* New-style LDS symbols carry their alignment in st_value. */
+                       s.align = MIN2(symbol->st_value, 1u << 16);
+                       report_if(!util_is_power_of_two_nonzero(s.align));
+               } else
+                       continue;
+
+               report_if(symbol->st_size > 1u << 29);
+
+               /* elf_strptr() returns NULL for a bad string table index or
+                * offset; the name is passed to strcmp() below, so reject it
+                * here.
+                */
+               s.name = elf_strptr(part->elf, strtabidx, symbol->st_name);
+               report_elf_if(!s.name);
+               s.size = symbol->st_size;
+               s.part_idx = part_idx;
+
+               /* The checks below validate symbol contents rather than
+                * libelf calls, so they use report_if: report_elf_if would
+                * append an unrelated elf_errmsg().
+                */
+               if (!strcmp(s.name, "__lds_end")) {
+                       report_if(s.size != 0);
+                       *lds_end_align = MAX2(*lds_end_align, s.align);
+                       continue;
+               }
+
+               const struct ac_rtld_symbol *shared =
+                       find_symbol(&binary->lds_symbols, s.name, part_idx);
+               if (shared) {
+                       report_if(s.align > shared->align);
+                       report_if(s.size > shared->size);
+                       continue;
+               }
+
+               util_dynarray_append(&binary->lds_symbols, struct ac_rtld_symbol, s);
+       }
+
+       return true;
+
+#undef report_if
+#undef report_elf_if
+}
+
 /**
  * Open a binary consisting of one or more shader parts.
  *
  * \param binary the uninitialized struct
- * \param num_parts number of shader parts
- * \param elf_ptrs pointers to the in-memory ELF objects for each shader part
- * \param elf_sizes sizes (in bytes) of the in-memory ELF objects
+ * \param i binary opening parameters
  */
-bool ac_rtld_open(struct ac_rtld_binary *binary, unsigned num_parts,
-                 const char * const *elf_ptrs,
-                 const size_t *elf_sizes)
+bool ac_rtld_open(struct ac_rtld_binary *binary,
+                 struct ac_rtld_open_info i)
 {
        /* One of the libelf implementations
         * (http://www.mr511.de/software/english.htm) requires calling
@@ -123,14 +261,17 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, unsigned num_parts,
        elf_version(EV_CURRENT);
 
        memset(binary, 0, sizeof(*binary));
-       binary->num_parts = num_parts;
-       binary->parts = calloc(sizeof(*binary->parts), num_parts);
+       memcpy(&binary->options, &i.options, sizeof(binary->options));
+       binary->wave_size = i.wave_size;
+       binary->num_parts = i.num_parts;
+       binary->parts = calloc(sizeof(*binary->parts), i.num_parts);
        if (!binary->parts)
                return false;
 
        uint64_t pasted_text_size = 0;
        uint64_t rx_align = 1;
        uint64_t rx_size = 0;
+       uint64_t exec_size = 0;
 
 #define report_if(cond) \
        do { \
@@ -147,11 +288,49 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, unsigned num_parts,
                } \
        } while (false)
 
-       /* First pass over all parts: open ELFs and determine the placement of
-        * sections in the memory image. */
-       for (unsigned i = 0; i < num_parts; ++i) {
-               struct ac_rtld_part *part = &binary->parts[i];
-               part->elf = elf_memory((char *)elf_ptrs[i], elf_sizes[i]);
+       /* Copy and layout shared LDS symbols. */
+       if (i.num_shared_lds_symbols) {
+               if (!util_dynarray_resize(&binary->lds_symbols, struct ac_rtld_symbol,
+                                         i.num_shared_lds_symbols))
+                       goto fail;
+
+               memcpy(binary->lds_symbols.data, i.shared_lds_symbols, binary->lds_symbols.size);
+       }
+
+       util_dynarray_foreach(&binary->lds_symbols, struct ac_rtld_symbol, symbol)
+               symbol->part_idx = ~0u;
+
+       unsigned max_lds_size = 64 * 1024;
+
+       if (i.info->chip_class == GFX6 ||
+           (i.shader_type != MESA_SHADER_COMPUTE &&
+            i.shader_type != MESA_SHADER_FRAGMENT))
+               max_lds_size = 32 * 1024;
+
+       uint64_t shared_lds_size = 0;
+       if (!layout_symbols(binary->lds_symbols.data, i.num_shared_lds_symbols, &shared_lds_size))
+               goto fail;
+
+       if (shared_lds_size > max_lds_size) {
+               fprintf(stderr, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n",
+                       (unsigned)shared_lds_size, max_lds_size);
+               goto fail;
+       }
+       binary->lds_size = shared_lds_size;
+
+       /* First pass over all parts: open ELFs, pre-determine the placement of
+        * sections in the memory image, and collect and layout private LDS symbols. */
+       uint32_t lds_end_align = 0;
+
+       if (binary->options.halt_at_entry)
+               pasted_text_size += 4;
+
+       for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
+               struct ac_rtld_part *part = &binary->parts[part_idx];
+               unsigned part_lds_symbols_begin =
+                       util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol);
+
+               part->elf = elf_memory((char *)i.elf_ptrs[part_idx], i.elf_sizes[part_idx]);
                report_elf_if(!part->elf);
 
                const Elf64_Ehdr *ehdr = elf64_getehdr(part->elf);
@@ -192,6 +371,8 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, unsigned num_parts,
 
                                        if (!strcmp(s->name, ".text"))
                                                s->is_pasted_text = true;
+
+                                       exec_size += shdr->sh_size;
                                }
 
                                if (s->is_pasted_text) {
@@ -203,19 +384,52 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, unsigned num_parts,
                                        s->offset = rx_size;
                                        rx_size += shdr->sh_size;
                                }
+                       } else if (shdr->sh_type == SHT_SYMTAB) {
+                               if (!read_private_lds_symbols(binary, part_idx, section, &lds_end_align))
+                                       goto fail;
                        }
                }
+
+               uint64_t part_lds_size = shared_lds_size;
+               if (!layout_symbols(
+                       util_dynarray_element(&binary->lds_symbols, struct ac_rtld_symbol, part_lds_symbols_begin),
+                       util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol) - part_lds_symbols_begin,
+                       &part_lds_size))
+                       goto fail;
+               binary->lds_size = MAX2(binary->lds_size, part_lds_size);
        }
 
        binary->rx_end_markers = pasted_text_size;
        pasted_text_size += 4 * DEBUGGER_NUM_MARKERS;
 
+       /* __lds_end is a special symbol that points at the end of the memory
+        * occupied by other LDS symbols. Its alignment is taken as the
+        * maximum of its alignment over all shader parts where it occurs.
+        */
+       if (lds_end_align) {
+               binary->lds_size = align(binary->lds_size, lds_end_align);
+
+               struct ac_rtld_symbol *lds_end =
+                       util_dynarray_grow(&binary->lds_symbols, struct ac_rtld_symbol, 1);
+               lds_end->name = "__lds_end";
+               lds_end->size = 0;
+               lds_end->align = lds_end_align;
+               lds_end->offset = binary->lds_size;
+               lds_end->part_idx = ~0u;
+       }
+
+       if (binary->lds_size > max_lds_size) {
+               fprintf(stderr, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n",
+                       (unsigned)binary->lds_size, max_lds_size);
+               goto fail;
+       }
+
        /* Second pass: Adjust offsets of non-pasted text sections. */
        binary->rx_size = pasted_text_size;
        binary->rx_size = align(binary->rx_size, rx_align);
 
-       for (unsigned i = 0; i < num_parts; ++i) {
-               struct ac_rtld_part *part = &binary->parts[i];
+       for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
+               struct ac_rtld_part *part = &binary->parts[part_idx];
                size_t num_shdrs;
                elf_getshdrnum(part->elf, &num_shdrs);
 
@@ -227,6 +441,26 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, unsigned num_parts,
        }
 
        binary->rx_size += rx_size;
+       binary->exec_size = exec_size;
+
+       if (i.info->chip_class >= GFX10) {
+               /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords
+                * ahead of the PC, configurable by SH_MEM_CONFIG and
+                * S_INST_PREFETCH. This can cause two issues:
+                *
+                * (1) Crossing a page boundary to an unmapped page. The logic
+                *     does not distinguish between a required fetch and a "mere"
+                *     prefetch and will fault.
+                *
+                * (2) Prefetching instructions that will be changed for a
+                *     different shader.
+                *
+                * (2) is not currently an issue because we flush the I$ at IB
+                * boundaries, but (1) needs to be addressed. Due to buffer
+                * suballocation, we just play it safe.
+                */
+               binary->rx_size = align(binary->rx_size + 3 * 64, 64);
+       }
 
        return true;
 
@@ -246,6 +480,7 @@ void ac_rtld_close(struct ac_rtld_binary *binary)
                elf_end(part->elf);
        }
 
+       util_dynarray_fini(&binary->lds_symbols);
        free(binary->parts);
        binary->parts = NULL;
        binary->num_parts = 0;
@@ -279,7 +514,8 @@ bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name
        return get_section_by_name(&binary->parts[0], name, data, nbytes);
 }
 
-bool ac_rtld_read_config(struct ac_rtld_binary *binary,
+bool ac_rtld_read_config(const struct radeon_info *info,
+                        struct ac_rtld_binary *binary,
                         struct ac_shader_config *config)
 {
        for (unsigned i = 0; i < binary->num_parts; ++i) {
@@ -293,7 +529,8 @@ bool ac_rtld_read_config(struct ac_rtld_binary *binary,
 
                /* TODO: be precise about scratch use? */
                struct ac_shader_config c = {};
-               ac_parse_shader_binary_config(config_data, config_nbytes, true, &c);
+               ac_parse_shader_binary_config(config_data, config_nbytes,
+                                             binary->wave_size, true, info, &c);
 
                config->num_sgprs = MAX2(config->num_sgprs, c.num_sgprs);
                config->num_vgprs = MAX2(config->num_vgprs, c.num_vgprs);
@@ -329,7 +566,17 @@ static bool resolve_symbol(const struct ac_rtld_upload_info *u,
                           unsigned part_idx, const Elf64_Sym *sym,
                           const char *name, uint64_t *value)
 {
-       if (sym->st_shndx == SHN_UNDEF) {
+       /* TODO: properly disentangle the undef and the LDS cases once
+        * STT_AMDGPU_LDS is retired. */
+       if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_AMDGPU_LDS) {
+               const struct ac_rtld_symbol *lds_sym =
+                       find_symbol(&u->binary->lds_symbols, name, part_idx);
+
+               if (lds_sym) {
+                       *value = lds_sym->offset;
+                       return true;
+               }
+
                /* TODO: resolve from other parts */
 
                if (u->get_external_symbol(u->cb_data, name, value))
@@ -510,9 +757,15 @@ bool ac_rtld_upload(struct ac_rtld_upload_info *u)
                } \
        } while (false)
 
-       /* First pass: upload raw section data. */
+       if (u->binary->options.halt_at_entry) {
+               /* s_sethalt 1 */
+               *(uint32_t *)u->rx_ptr = util_cpu_to_le32(0xbf8d0001);
+       }
+
+       /* First pass: upload raw section data and lay out private LDS symbols. */
        for (unsigned i = 0; i < u->binary->num_parts; ++i) {
                struct ac_rtld_part *part = &u->binary->parts[i];
+
                Elf_Scn *section = NULL;
                while ((section = elf_nextscn(part->elf, section))) {
                        Elf64_Shdr *shdr = elf64_getshdr(section);