nir: Handle all array stride cases in nir_deref_instr_array_stride
[mesa.git] / src / amd / common / ac_binary.c
index 1bf52c783281aff6d62732f834826542d1f70018..4651c064abd5731f807f8af08ecf243226790da6 100644 (file)
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * Authors: Tom Stellard <thomas.stellard@amd.com>
- *
- * Based on radeon_elf_util.c.
  */
 
+#include "ac_gpu_info.h"
 #include "ac_binary.h"
 
 #include "util/u_math.h"
 #define SPILLED_SGPRS                                     0x4
 #define SPILLED_VGPRS                                     0x8
 
-static void parse_symbol_table(Elf_Data *symbol_table_data,
-                               const GElf_Shdr *symbol_table_header,
-                               struct ac_shader_binary *binary)
+/* Parse configuration data in .AMDGPU.config section format. */
+void ac_parse_shader_binary_config(const char *data, size_t nbytes,
+                                  unsigned wave_size,
+                                  bool really_needs_scratch,
+                                  const struct radeon_info *info,
+                                  struct ac_shader_config *conf)
 {
-       GElf_Sym symbol;
-       unsigned i = 0;
-       unsigned symbol_count =
-               symbol_table_header->sh_size / symbol_table_header->sh_entsize;
-
-       /* We are over allocating this list, because symbol_count gives the
-        * total number of symbols, and we will only be filling the list
-        * with offsets of global symbols.  The memory savings from
-        * allocating the correct size of this list will be small, and
-        * I don't think it is worth the cost of pre-computing the number
-        * of global symbols.
-        */
-       binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
-
-       while (gelf_getsym(symbol_table_data, i++, &symbol)) {
-               unsigned i;
-               if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
-                   symbol.st_shndx == 0 /* Undefined symbol */) {
-                       continue;
-               }
-
-               binary->global_symbol_offsets[binary->global_symbol_count] =
-                                       symbol.st_value;
-
-               /* Sort the list using bubble sort.  This list will usually
-                * be small. */
-               for (i = binary->global_symbol_count; i > 0; --i) {
-                       uint64_t lhs = binary->global_symbol_offsets[i - 1];
-                       uint64_t rhs = binary->global_symbol_offsets[i];
-                       if (lhs < rhs) {
-                               break;
-                       }
-                       binary->global_symbol_offsets[i] = lhs;
-                       binary->global_symbol_offsets[i - 1] = rhs;
-               }
-               ++binary->global_symbol_count;
-       }
-}
-
-static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
-                       unsigned symbol_sh_link,
-                       struct ac_shader_binary *binary)
-{
-       unsigned i;
-
-       if (!relocs || !symbols || !binary->reloc_count) {
-               return;
-       }
-       binary->relocs = CALLOC(binary->reloc_count,
-                       sizeof(struct ac_shader_reloc));
-       for (i = 0; i < binary->reloc_count; i++) {
-               GElf_Sym symbol;
-               GElf_Rel rel;
-               char *symbol_name;
-               struct ac_shader_reloc *reloc = &binary->relocs[i];
-
-               gelf_getrel(relocs, i, &rel);
-               gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
-               symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
-
-               reloc->offset = rel.r_offset;
-               strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
-               reloc->name[sizeof(reloc->name)-1] = 0;
-       }
-}
-
-bool ac_elf_read(const char *elf_data, unsigned elf_size,
-                struct ac_shader_binary *binary)
-{
-       char *elf_buffer;
-       Elf *elf;
-       Elf_Scn *section = NULL;
-       Elf_Data *symbols = NULL, *relocs = NULL;
-       size_t section_str_index;
-       unsigned symbol_sh_link = 0;
-       bool success = true;
-
-       /* One of the libelf implementations
-        * (http://www.mr511.de/software/english.htm) requires calling
-        * elf_version() before elf_memory().
-        */
-       elf_version(EV_CURRENT);
-       elf_buffer = MALLOC(elf_size);
-       memcpy(elf_buffer, elf_data, elf_size);
-
-       elf = elf_memory(elf_buffer, elf_size);
-
-       elf_getshdrstrndx(elf, &section_str_index);
-
-       while ((section = elf_nextscn(elf, section))) {
-               const char *name;
-               Elf_Data *section_data = NULL;
-               GElf_Shdr section_header;
-               if (gelf_getshdr(section, &section_header) != &section_header) {
-                       fprintf(stderr, "Failed to read ELF section header\n");
-                       success = false;
-                       break;
-               }
-               name = elf_strptr(elf, section_str_index, section_header.sh_name);
-               if (!strcmp(name, ".text")) {
-                       section_data = elf_getdata(section, section_data);
-                       binary->code_size = section_data->d_size;
-                       binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
-                       memcpy(binary->code, section_data->d_buf, binary->code_size);
-               } else if (!strcmp(name, ".AMDGPU.config")) {
-                       section_data = elf_getdata(section, section_data);
-                       binary->config_size = section_data->d_size;
-                       if (!binary->config_size) {
-                               fprintf(stderr, ".AMDGPU.config is empty!\n");
-                               success = false;
-                               break;
-                       }
-                       binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
-                       memcpy(binary->config, section_data->d_buf, binary->config_size);
-               } else if (!strcmp(name, ".AMDGPU.disasm")) {
-                       /* Always read disassembly if it's available. */
-                       section_data = elf_getdata(section, section_data);
-                       binary->disasm_string = strndup(section_data->d_buf,
-                                                       section_data->d_size);
-               } else if (!strncmp(name, ".rodata", 7)) {
-                       section_data = elf_getdata(section, section_data);
-                       binary->rodata_size = section_data->d_size;
-                       binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
-                       memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
-               } else if (!strncmp(name, ".symtab", 7)) {
-                       symbols = elf_getdata(section, section_data);
-                       symbol_sh_link = section_header.sh_link;
-                       parse_symbol_table(symbols, &section_header, binary);
-               } else if (!strcmp(name, ".rel.text")) {
-                       relocs = elf_getdata(section, section_data);
-                       binary->reloc_count = section_header.sh_size /
-                                       section_header.sh_entsize;
-               }
-       }
-
-       parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
+       uint32_t scratch_size = 0;
 
-       if (elf){
-               elf_end(elf);
-       }
-       FREE(elf_buffer);
-
-       /* Cache the config size per symbol */
-       if (binary->global_symbol_count) {
-               binary->config_size_per_symbol =
-                       binary->config_size / binary->global_symbol_count;
-       } else {
-               binary->global_symbol_count = 1;
-               binary->config_size_per_symbol = binary->config_size;
-       }
-       return success;
-}
-
-const unsigned char *ac_shader_binary_config_start(
-       const struct ac_shader_binary *binary,
-       uint64_t symbol_offset)
-{
-       unsigned i;
-       for (i = 0; i < binary->global_symbol_count; ++i) {
-               if (binary->global_symbol_offsets[i] == symbol_offset) {
-                       unsigned offset = i * binary->config_size_per_symbol;
-                       return binary->config + offset;
-               }
-       }
-       return binary->config;
-}
-
-
-static const char *scratch_rsrc_dword0_symbol =
-       "SCRATCH_RSRC_DWORD0";
-
-static const char *scratch_rsrc_dword1_symbol =
-       "SCRATCH_RSRC_DWORD1";
-
-void ac_shader_binary_read_config(struct ac_shader_binary *binary,
-                                 struct ac_shader_config *conf,
-                                 unsigned symbol_offset,
-                                 bool supports_spill)
-{
-       unsigned i;
-       const unsigned char *config =
-               ac_shader_binary_config_start(binary, symbol_offset);
-       bool really_needs_scratch = false;
-       uint32_t wavesize = 0;
-       /* LLVM adds SGPR spills to the scratch size.
-        * Find out if we really need the scratch buffer.
-        */
-       if (supports_spill) {
-               really_needs_scratch = true;
-       } else {
-               for (i = 0; i < binary->reloc_count; i++) {
-                       const struct ac_shader_reloc *reloc = &binary->relocs[i];
-
-                       if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
-                           !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
-                               really_needs_scratch = true;
-                               break;
-                       }
-               }
-       }
-
-       for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
-               unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
-               unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
+       for (size_t i = 0; i < nbytes; i += 8) {
+               unsigned reg = util_le32_to_cpu(*(uint32_t*)(data + i));
+               unsigned value = util_le32_to_cpu(*(uint32_t*)(data + i + 4));
                switch (reg) {
                case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
                case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
                case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
                case R_00B848_COMPUTE_PGM_RSRC1:
+               case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
+                       if (wave_size == 32)
+                               conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 8);
+                       else
+                               conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
+
                        conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
-                       conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
+                       /* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */
                        conf->float_mode =  G_00B028_FLOAT_MODE(value);
+                       conf->rsrc1 = value;
                        break;
                case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
                        conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
+                       /* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */
+                       conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value);
+                       conf->rsrc2 = value;
+                       break;
+               case R_00B12C_SPI_SHADER_PGM_RSRC2_VS:
+                       conf->num_shared_vgprs = G_00B12C_SHARED_VGPR_CNT(value);
+                       conf->rsrc2 = value;
+                       break;
+               case R_00B22C_SPI_SHADER_PGM_RSRC2_GS:
+                       conf->num_shared_vgprs = G_00B22C_SHARED_VGPR_CNT(value);
+                       conf->rsrc2 = value;
+                       break;
+               case R_00B42C_SPI_SHADER_PGM_RSRC2_HS:
+                       conf->num_shared_vgprs = G_00B42C_SHARED_VGPR_CNT(value);
+                       conf->rsrc2 = value;
                        break;
                case R_00B84C_COMPUTE_PGM_RSRC2:
                        conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
+                       conf->rsrc2 = value;
+                       break;
+               case R_00B8A0_COMPUTE_PGM_RSRC3:
+                       conf->num_shared_vgprs = G_00B8A0_SHARED_VGPR_CNT(value);
+                       conf->rsrc3 = value;
                        break;
                case R_0286CC_SPI_PS_INPUT_ENA:
                        conf->spi_ps_input_ena = value;
@@ -271,7 +99,7 @@ void ac_shader_binary_read_config(struct ac_shader_binary *binary,
                case R_0286E8_SPI_TMPRING_SIZE:
                case R_00B860_COMPUTE_TMPRING_SIZE:
                        /* WAVESIZE is in units of 256 dwords. */
-                       wavesize = value;
+                       scratch_size = value;
                        break;
                case SPILLED_SGPRS:
                        conf->spilled_sgprs = value;
@@ -291,13 +119,34 @@ void ac_shader_binary_read_config(struct ac_shader_binary *binary,
                        }
                        break;
                }
-
-               if (!conf->spi_ps_input_addr)
-                       conf->spi_ps_input_addr = conf->spi_ps_input_ena;
        }
 
+       if (!conf->spi_ps_input_addr)
+               conf->spi_ps_input_addr = conf->spi_ps_input_ena;
+
        if (really_needs_scratch) {
                /* sgprs spills aren't spilling */
-               conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(wavesize) * 256 * 4;
+               conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4;
+       }
+
+       /* GFX 10.3 internally:
+        * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
+        * - aligns LDS to 1024
+        *
+        * For shader-db stats, set num_vgprs that the hw actually uses.
+        */
+       if (info->chip_class >= GFX10_3) {
+               conf->num_vgprs = align(conf->num_vgprs, wave_size == 32 ? 16 : 8);
        }
+
+       /* Enable 64-bit and 16-bit denormals, because there is no performance
+        * cost.
+        *
+        * Don't enable denormals for 32-bit floats, because:
+        * - denormals disable output modifiers
+        * - denormals break v_mad_f32
+        * - GFX6 & GFX7 would be very slow
+        */
+       conf->float_mode &= ~V_00B028_FP_ALL_DENORMS;
+       conf->float_mode |= V_00B028_FP_64_DENORMS;
 }