ac: add more fields to ac_gpu_info
[mesa.git] / src / amd / common / ac_rtld.c
index 57d6b0151b42ab3235ac366cb394a60d20d38c42..7c35e72543d2bec6fc407c098efeef9fb7cabf01 100644 (file)
 #define MY_EM_AMDGPU 224
 
 #ifndef STT_AMDGPU_LDS
-#define STT_AMDGPU_LDS 13
+#define STT_AMDGPU_LDS 13 // this is deprecated -- remove
+#endif
+
+#ifndef SHN_AMDGPU_LDS
+#define SHN_AMDGPU_LDS 0xff00
 #endif
 
 #ifndef R_AMDGPU_NONE
@@ -176,13 +180,20 @@ static bool read_private_lds_symbols(struct ac_rtld_binary *binary,
                                     Elf_Scn *section,
                                     uint32_t *lds_end_align)
 {
-#define report_elf_if(cond) \
+#define report_if(cond) \
        do { \
                if ((cond)) { \
                        report_errorf(#cond); \
                        return false; \
                } \
        } while (false)
+#define report_elf_if(cond) \
+       do { \
+               if ((cond)) { \
+                       report_elf_errorf(#cond); \
+                       return false; \
+               } \
+       } while (false)
 
        struct ac_rtld_part *part = &binary->parts[part_idx];
        Elf64_Shdr *shdr = elf64_getshdr(section);
@@ -194,15 +205,21 @@ static bool read_private_lds_symbols(struct ac_rtld_binary *binary,
        size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
 
        for (size_t j = 0; j < num_symbols; ++j, ++symbol) {
-               if (ELF64_ST_TYPE(symbol->st_info) != STT_AMDGPU_LDS)
+               struct ac_rtld_symbol s = {};
+
+               if (ELF64_ST_TYPE(symbol->st_info) == STT_AMDGPU_LDS) {
+                       /* old-style LDS symbols from initial prototype -- remove eventually */
+                       s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16);
+               } else if (symbol->st_shndx == SHN_AMDGPU_LDS) {
+                       s.align = MIN2(symbol->st_value, 1u << 16);
+                       report_if(!util_is_power_of_two_nonzero(s.align));
+               } else
                        continue;
 
-               report_elf_if(symbol->st_size > 1u << 29);
+               report_if(symbol->st_size > 1u << 29);
 
-               struct ac_rtld_symbol s = {};
                s.name = elf_strptr(part->elf, strtabidx, symbol->st_name);
                s.size = symbol->st_size;
-               s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16);
                s.part_idx = part_idx;
 
                if (!strcmp(s.name, "__lds_end")) {
@@ -224,6 +241,7 @@ static bool read_private_lds_symbols(struct ac_rtld_binary *binary,
 
        return true;
 
+#undef report_if
 #undef report_elf_if
 }
 
@@ -244,6 +262,7 @@ bool ac_rtld_open(struct ac_rtld_binary *binary,
 
        memset(binary, 0, sizeof(*binary));
        memcpy(&binary->options, &i.options, sizeof(binary->options));
+       binary->wave_size = i.wave_size;
        binary->num_parts = i.num_parts;
        binary->parts = calloc(sizeof(*binary->parts), i.num_parts);
        if (!binary->parts)
@@ -252,6 +271,7 @@ bool ac_rtld_open(struct ac_rtld_binary *binary,
        uint64_t pasted_text_size = 0;
        uint64_t rx_align = 1;
        uint64_t rx_size = 0;
+       uint64_t exec_size = 0;
 
 #define report_if(cond) \
        do { \
@@ -351,6 +371,8 @@ bool ac_rtld_open(struct ac_rtld_binary *binary,
 
                                        if (!strcmp(s->name, ".text"))
                                                s->is_pasted_text = true;
+
+                                       exec_size += shdr->sh_size;
                                }
 
                                if (s->is_pasted_text) {
@@ -419,6 +441,26 @@ bool ac_rtld_open(struct ac_rtld_binary *binary,
        }
 
        binary->rx_size += rx_size;
+       binary->exec_size = exec_size;
+
+       if (i.info->chip_class >= GFX10) {
+               /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords
+                * ahead of the PC, configurable by SH_MEM_CONFIG and
+                * S_INST_PREFETCH. This can cause two issues:
+                *
+                * (1) Crossing a page boundary to an unmapped page. The logic
+                *     does not distinguish between a required fetch and a "mere"
+                *     prefetch and will fault.
+                *
+                * (2) Prefetching instructions that will be changed for a
+                *     different shader.
+                *
+                * (2) is not currently an issue because we flush the I$ at IB
+                * boundaries, but (1) needs to be addressed. Due to buffer
+                * suballocation, we just play it safe.
+                */
+               binary->rx_size = align(binary->rx_size + 3 * 64, 64);
+       }
 
        return true;
 
@@ -486,7 +528,8 @@ bool ac_rtld_read_config(struct ac_rtld_binary *binary,
 
                /* TODO: be precise about scratch use? */
                struct ac_shader_config c = {};
-               ac_parse_shader_binary_config(config_data, config_nbytes, true, &c);
+               ac_parse_shader_binary_config(config_data, config_nbytes,
+                                             binary->wave_size, true, &c);
 
                config->num_sgprs = MAX2(config->num_sgprs, c.num_sgprs);
                config->num_vgprs = MAX2(config->num_vgprs, c.num_vgprs);
@@ -522,7 +565,9 @@ static bool resolve_symbol(const struct ac_rtld_upload_info *u,
                           unsigned part_idx, const Elf64_Sym *sym,
                           const char *name, uint64_t *value)
 {
-       if (sym->st_shndx == SHN_UNDEF) {
+       /* TODO: properly disentangle the undef and the LDS cases once
+        * STT_AMDGPU_LDS is retired. */
+       if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_AMDGPU_LDS) {
                const struct ac_rtld_symbol *lds_sym =
                        find_symbol(&u->binary->lds_symbols, name, part_idx);