amd/common: switch to 3-spaces style
author: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Mon, 7 Sep 2020 07:58:36 +0000 (09:58 +0200)
committer: Vivek Pandya <vivekvpandya@gmail.com>
Mon, 7 Sep 2020 15:55:16 +0000 (21:25 +0530)
Follow-up of !4319 using the same clang-format config.

Acked-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5310>

22 files changed:
src/amd/common/.editorconfig [deleted file]
src/amd/common/ac_binary.c
src/amd/common/ac_binary.h
src/amd/common/ac_debug.c
src/amd/common/ac_debug.h
src/amd/common/ac_exp_param.h
src/amd/common/ac_gpu_info.c
src/amd/common/ac_gpu_info.h
src/amd/common/ac_rtld.c
src/amd/common/ac_rtld.h
src/amd/common/ac_shader_args.c
src/amd/common/ac_shader_args.h
src/amd/common/ac_shader_util.c
src/amd/common/ac_shader_util.h
src/amd/common/ac_shadowed_regs.c
src/amd/common/ac_shadowed_regs.h
src/amd/common/ac_surface.c
src/amd/common/ac_surface.h
src/amd/common/amd_family.h
src/amd/common/amd_kernel_code_t.h
src/amd/common/gfx10_format_table.h
src/amd/common/sid.h

diff --git a/src/amd/common/.editorconfig b/src/amd/common/.editorconfig
deleted file mode 100644 (file)
index 21a3c7d..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-[*.{c,h}]
-indent_style = tab
-indent_size = tab
index 4651c064abd5731f807f8af08ecf243226790da6..93068696c3272f5156c0525d813bdb96cd9d3695 100644 (file)
  * SOFTWARE.
  */
 
-#include "ac_gpu_info.h"
 #include "ac_binary.h"
 
+#include "ac_gpu_info.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 
 #include <gelf.h>
 #include <libelf.h>
-#include <stdio.h>
-
 #include <sid.h>
+#include <stdio.h>
 
-#define SPILLED_SGPRS                                     0x4
-#define SPILLED_VGPRS                                     0x8
+#define SPILLED_SGPRS 0x4
+#define SPILLED_VGPRS 0x8
 
 /* Parse configuration data in .AMDGPU.config section format. */
-void ac_parse_shader_binary_config(const char *data, size_t nbytes,
-                                  unsigned wave_size,
-                                  bool really_needs_scratch,
-                                  const struct radeon_info *info,
-                                  struct ac_shader_config *conf)
+void ac_parse_shader_binary_config(const char *data, size_t nbytes, unsigned wave_size,
+                                   bool really_needs_scratch, const struct radeon_info *info,
+                                   struct ac_shader_config *conf)
 {
-       uint32_t scratch_size = 0;
+   uint32_t scratch_size = 0;
 
-       for (size_t i = 0; i < nbytes; i += 8) {
-               unsigned reg = util_le32_to_cpu(*(uint32_t*)(data + i));
-               unsigned value = util_le32_to_cpu(*(uint32_t*)(data + i + 4));
-               switch (reg) {
-               case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
-               case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
-               case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
-               case R_00B848_COMPUTE_PGM_RSRC1:
-               case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
-                       if (wave_size == 32)
-                               conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 8);
-                       else
-                               conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
+   for (size_t i = 0; i < nbytes; i += 8) {
+      unsigned reg = util_le32_to_cpu(*(uint32_t *)(data + i));
+      unsigned value = util_le32_to_cpu(*(uint32_t *)(data + i + 4));
+      switch (reg) {
+      case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
+      case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
+      case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
+      case R_00B848_COMPUTE_PGM_RSRC1:
+      case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
+         if (wave_size == 32)
+            conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 8);
+         else
+            conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
 
-                       conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
-                       /* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */
-                       conf->float_mode =  G_00B028_FLOAT_MODE(value);
-                       conf->rsrc1 = value;
-                       break;
-               case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
-                       conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
-                       /* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */
-                       conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value);
-                       conf->rsrc2 = value;
-                       break;
-               case R_00B12C_SPI_SHADER_PGM_RSRC2_VS:
-                       conf->num_shared_vgprs = G_00B12C_SHARED_VGPR_CNT(value);
-                       conf->rsrc2 = value;
-                       break;
-               case R_00B22C_SPI_SHADER_PGM_RSRC2_GS:
-                       conf->num_shared_vgprs = G_00B22C_SHARED_VGPR_CNT(value);
-                       conf->rsrc2 = value;
-                       break;
-               case R_00B42C_SPI_SHADER_PGM_RSRC2_HS:
-                       conf->num_shared_vgprs = G_00B42C_SHARED_VGPR_CNT(value);
-                       conf->rsrc2 = value;
-                       break;
-               case R_00B84C_COMPUTE_PGM_RSRC2:
-                       conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
-                       conf->rsrc2 = value;
-                       break;
-               case R_00B8A0_COMPUTE_PGM_RSRC3:
-                       conf->num_shared_vgprs = G_00B8A0_SHARED_VGPR_CNT(value);
-                       conf->rsrc3 = value;
-                       break;
-               case R_0286CC_SPI_PS_INPUT_ENA:
-                       conf->spi_ps_input_ena = value;
-                       break;
-               case R_0286D0_SPI_PS_INPUT_ADDR:
-                       conf->spi_ps_input_addr = value;
-                       break;
-               case R_0286E8_SPI_TMPRING_SIZE:
-               case R_00B860_COMPUTE_TMPRING_SIZE:
-                       /* WAVESIZE is in units of 256 dwords. */
-                       scratch_size = value;
-                       break;
-               case SPILLED_SGPRS:
-                       conf->spilled_sgprs = value;
-                       break;
-               case SPILLED_VGPRS:
-                       conf->spilled_vgprs = value;
-                       break;
-               default:
-                       {
-                               static bool printed;
+         conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
+         /* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */
+         conf->float_mode = G_00B028_FLOAT_MODE(value);
+         conf->rsrc1 = value;
+         break;
+      case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
+         conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
+         /* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */
+         conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value);
+         conf->rsrc2 = value;
+         break;
+      case R_00B12C_SPI_SHADER_PGM_RSRC2_VS:
+         conf->num_shared_vgprs = G_00B12C_SHARED_VGPR_CNT(value);
+         conf->rsrc2 = value;
+         break;
+      case R_00B22C_SPI_SHADER_PGM_RSRC2_GS:
+         conf->num_shared_vgprs = G_00B22C_SHARED_VGPR_CNT(value);
+         conf->rsrc2 = value;
+         break;
+      case R_00B42C_SPI_SHADER_PGM_RSRC2_HS:
+         conf->num_shared_vgprs = G_00B42C_SHARED_VGPR_CNT(value);
+         conf->rsrc2 = value;
+         break;
+      case R_00B84C_COMPUTE_PGM_RSRC2:
+         conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
+         conf->rsrc2 = value;
+         break;
+      case R_00B8A0_COMPUTE_PGM_RSRC3:
+         conf->num_shared_vgprs = G_00B8A0_SHARED_VGPR_CNT(value);
+         conf->rsrc3 = value;
+         break;
+      case R_0286CC_SPI_PS_INPUT_ENA:
+         conf->spi_ps_input_ena = value;
+         break;
+      case R_0286D0_SPI_PS_INPUT_ADDR:
+         conf->spi_ps_input_addr = value;
+         break;
+      case R_0286E8_SPI_TMPRING_SIZE:
+      case R_00B860_COMPUTE_TMPRING_SIZE:
+         /* WAVESIZE is in units of 256 dwords. */
+         scratch_size = value;
+         break;
+      case SPILLED_SGPRS:
+         conf->spilled_sgprs = value;
+         break;
+      case SPILLED_VGPRS:
+         conf->spilled_vgprs = value;
+         break;
+      default: {
+         static bool printed;
 
-                               if (!printed) {
-                                       fprintf(stderr, "Warning: LLVM emitted unknown "
-                                               "config register: 0x%x\n", reg);
-                                       printed = true;
-                               }
-                       }
-                       break;
-               }
-       }
+         if (!printed) {
+            fprintf(stderr,
+                    "Warning: LLVM emitted unknown "
+                    "config register: 0x%x\n",
+                    reg);
+            printed = true;
+         }
+      } break;
+      }
+   }
 
-       if (!conf->spi_ps_input_addr)
-               conf->spi_ps_input_addr = conf->spi_ps_input_ena;
+   if (!conf->spi_ps_input_addr)
+      conf->spi_ps_input_addr = conf->spi_ps_input_ena;
 
-       if (really_needs_scratch) {
-               /* sgprs spills aren't spilling */
-               conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4;
-       }
+   if (really_needs_scratch) {
+      /* sgprs spills aren't spilling */
+      conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4;
+   }
 
-       /* GFX 10.3 internally:
-        * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
-        * - aligns LDS to 1024
-        *
-        * For shader-db stats, set num_vgprs that the hw actually uses.
-        */
-       if (info->chip_class >= GFX10_3) {
-               conf->num_vgprs = align(conf->num_vgprs, wave_size == 32 ? 16 : 8);
-       }
+   /* GFX 10.3 internally:
+    * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
+    * - aligns LDS to 1024
+    *
+    * For shader-db stats, set num_vgprs that the hw actually uses.
+    */
+   if (info->chip_class >= GFX10_3) {
+      conf->num_vgprs = align(conf->num_vgprs, wave_size == 32 ? 16 : 8);
+   }
 
-       /* Enable 64-bit and 16-bit denormals, because there is no performance
-        * cost.
-        *
-        * Don't enable denormals for 32-bit floats, because:
-        * - denormals disable output modifiers
-        * - denormals break v_mad_f32
-        * - GFX6 & GFX7 would be very slow
-        */
-       conf->float_mode &= ~V_00B028_FP_ALL_DENORMS;
-       conf->float_mode |= V_00B028_FP_64_DENORMS;
+   /* Enable 64-bit and 16-bit denormals, because there is no performance
+    * cost.
+    *
+    * Don't enable denormals for 32-bit floats, because:
+    * - denormals disable output modifiers
+    * - denormals break v_mad_f32
+    * - GFX6 & GFX7 would be very slow
+    */
+   conf->float_mode &= ~V_00B028_FP_ALL_DENORMS;
+   conf->float_mode |= V_00B028_FP_64_DENORMS;
 }
index 0d981423696a061e37935e816d0e4f378e4cf94f..5eae2d50baa2300218a5e2fe7ed447909800d293 100644 (file)
@@ -24,9 +24,9 @@
 #ifndef AC_BINARY_H
 #define AC_BINARY_H
 
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
-#include <stdbool.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -35,26 +35,24 @@ extern "C" {
 struct radeon_info;
 
 struct ac_shader_config {
-       unsigned num_sgprs;
-       unsigned num_vgprs;
-       unsigned num_shared_vgprs; /* GFX10: number of VGPRs shared between half-waves */
-       unsigned spilled_sgprs;
-       unsigned spilled_vgprs;
-       unsigned lds_size; /* in HW allocation units; i.e 256 bytes on SI, 512 bytes on CI+ */
-       unsigned spi_ps_input_ena;
-       unsigned spi_ps_input_addr;
-       unsigned float_mode;
-       unsigned scratch_bytes_per_wave;
-       unsigned rsrc1;
-       unsigned rsrc2;
-       unsigned rsrc3;
+   unsigned num_sgprs;
+   unsigned num_vgprs;
+   unsigned num_shared_vgprs; /* GFX10: number of VGPRs shared between half-waves */
+   unsigned spilled_sgprs;
+   unsigned spilled_vgprs;
+   unsigned lds_size; /* in HW allocation units; i.e 256 bytes on SI, 512 bytes on CI+ */
+   unsigned spi_ps_input_ena;
+   unsigned spi_ps_input_addr;
+   unsigned float_mode;
+   unsigned scratch_bytes_per_wave;
+   unsigned rsrc1;
+   unsigned rsrc2;
+   unsigned rsrc3;
 };
 
-void ac_parse_shader_binary_config(const char *data, size_t nbytes,
-                                  unsigned wave_size,
-                                  bool really_needs_scratch,
-                                  const struct radeon_info *info,
-                                  struct ac_shader_config *conf);
+void ac_parse_shader_binary_config(const char *data, size_t nbytes, unsigned wave_size,
+                                   bool really_needs_scratch, const struct radeon_info *info,
+                                   struct ac_shader_config *conf);
 
 #ifdef __cplusplus
 }
index f095b0b45dabeb40fd43aab6695d56bc1926ce8b..bbaed82c4928610e1dee63749f78170f2ea3f9aa 100644 (file)
 #include "ac_debug.h"
 
 #ifdef HAVE_VALGRIND
-#include <valgrind.h>
 #include <memcheck.h>
+#include <valgrind.h>
 #define VG(x) x
 #else
 #define VG(x) ((void)0)
 #endif
 
-#include <inttypes.h>
-
 #include "sid.h"
 #include "sid_tables.h"
 #include "util/u_math.h"
 #include "util/u_string.h"
 
 #include <assert.h>
+#include <inttypes.h>
 
 /* Parsed IBs are difficult to read without colors. Use "less -R file" to
  * read them, or use "aha -b -f file" to convert them to html.
  */
-#define COLOR_RESET    "\033[0m"
-#define COLOR_RED      "\033[31m"
-#define COLOR_GREEN    "\033[1;32m"
-#define COLOR_YELLOW   "\033[1;33m"
-#define COLOR_CYAN     "\033[1;36m"
+#define COLOR_RESET  "\033[0m"
+#define COLOR_RED    "\033[31m"
+#define COLOR_GREEN  "\033[1;32m"
+#define COLOR_YELLOW "\033[1;33m"
+#define COLOR_CYAN   "\033[1;36m"
 
 #define INDENT_PKT 8
 
 struct ac_ib_parser {
-       FILE *f;
-       uint32_t *ib;
-       unsigned num_dw;
-       const int *trace_ids;
-       unsigned trace_id_count;
-       enum chip_class chip_class;
-       ac_debug_addr_callback addr_callback;
-       void *addr_callback_data;
-
-       unsigned cur_dw;
+   FILE *f;
+   uint32_t *ib;
+   unsigned num_dw;
+   const int *trace_ids;
+   unsigned trace_id_count;
+   enum chip_class chip_class;
+   ac_debug_addr_callback addr_callback;
+   void *addr_callback_data;
+
+   unsigned cur_dw;
 };
 
 static void ac_do_parse_ib(FILE *f, struct ac_ib_parser *ib);
 
 static void print_spaces(FILE *f, unsigned num)
 {
-       fprintf(f, "%*s", num, "");
+   fprintf(f, "%*s", num, "");
 }
 
 static void print_value(FILE *file, uint32_t value, int bits)
 {
-       /* Guess if it's int or float */
-       if (value <= (1 << 15)) {
-               if (value <= 9)
-                       fprintf(file, "%u\n", value);
-               else
-                       fprintf(file, "%u (0x%0*x)\n", value, bits / 4, value);
-       } else {
-               float f = uif(value);
-
-               if (fabs(f) < 100000 && f*10 == floor(f*10))
-                       fprintf(file, "%.1ff (0x%0*x)\n", f, bits / 4, value);
-               else
-                       /* Don't print more leading zeros than there are bits. */
-                       fprintf(file, "0x%0*x\n", bits / 4, value);
-       }
+   /* Guess if it's int or float */
+   if (value <= (1 << 15)) {
+      if (value <= 9)
+         fprintf(file, "%u\n", value);
+      else
+         fprintf(file, "%u (0x%0*x)\n", value, bits / 4, value);
+   } else {
+      float f = uif(value);
+
+      if (fabs(f) < 100000 && f * 10 == floor(f * 10))
+         fprintf(file, "%.1ff (0x%0*x)\n", f, bits / 4, value);
+      else
+         /* Don't print more leading zeros than there are bits. */
+         fprintf(file, "0x%0*x\n", bits / 4, value);
+   }
 }
 
-static void print_named_value(FILE *file, const char *name, uint32_t value,
-                             int bits)
+static void print_named_value(FILE *file, const char *name, uint32_t value, int bits)
 {
-       print_spaces(file, INDENT_PKT);
-       fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", name);
-       print_value(file, value, bits);
+   print_spaces(file, INDENT_PKT);
+   fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", name);
+   print_value(file, value, bits);
 }
 
 static const struct si_reg *find_register(enum chip_class chip_class, unsigned offset)
 {
-       const struct si_reg *table;
-       unsigned table_size;
-
-       switch (chip_class) {
-       case GFX10_3:
-       case GFX10:
-               table = gfx10_reg_table;
-               table_size = ARRAY_SIZE(gfx10_reg_table);
-               break;
-       case GFX9:
-               table = gfx9_reg_table;
-               table_size = ARRAY_SIZE(gfx9_reg_table);
-               break;
-       case GFX8:
-               table = gfx8_reg_table;
-               table_size = ARRAY_SIZE(gfx8_reg_table);
-               break;
-       case GFX7:
-               table = gfx7_reg_table;
-               table_size = ARRAY_SIZE(gfx7_reg_table);
-               break;
-       case GFX6:
-               table = gfx6_reg_table;
-               table_size = ARRAY_SIZE(gfx6_reg_table);
-               break;
-       default:
-               return NULL;
-       }
-
-       for (unsigned i = 0; i < table_size; i++) {
-               const struct si_reg *reg = &table[i];
-
-               if (reg->offset == offset)
-                       return reg;
-       }
-
-       return NULL;
+   const struct si_reg *table;
+   unsigned table_size;
+
+   switch (chip_class) {
+   case GFX10_3:
+   case GFX10:
+      table = gfx10_reg_table;
+      table_size = ARRAY_SIZE(gfx10_reg_table);
+      break;
+   case GFX9:
+      table = gfx9_reg_table;
+      table_size = ARRAY_SIZE(gfx9_reg_table);
+      break;
+   case GFX8:
+      table = gfx8_reg_table;
+      table_size = ARRAY_SIZE(gfx8_reg_table);
+      break;
+   case GFX7:
+      table = gfx7_reg_table;
+      table_size = ARRAY_SIZE(gfx7_reg_table);
+      break;
+   case GFX6:
+      table = gfx6_reg_table;
+      table_size = ARRAY_SIZE(gfx6_reg_table);
+      break;
+   default:
+      return NULL;
+   }
+
+   for (unsigned i = 0; i < table_size; i++) {
+      const struct si_reg *reg = &table[i];
+
+      if (reg->offset == offset)
+         return reg;
+   }
+
+   return NULL;
 }
 
 const char *ac_get_register_name(enum chip_class chip_class, unsigned offset)
 {
-       const struct si_reg *reg = find_register(chip_class, offset);
+   const struct si_reg *reg = find_register(chip_class, offset);
 
-       return reg ? sid_strings + reg->name_offset : "(no name)";
+   return reg ? sid_strings + reg->name_offset : "(no name)";
 }
 
-void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset,
-                uint32_t value, uint32_t field_mask)
+void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset, uint32_t value,
+                 uint32_t field_mask)
 {
-       const struct si_reg *reg = find_register(chip_class, offset);
-
-       if (reg) {
-               const char *reg_name = sid_strings + reg->name_offset;
-               bool first_field = true;
-
-               print_spaces(file, INDENT_PKT);
-               fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ",
-                       reg_name);
-
-               if (!reg->num_fields) {
-                       print_value(file, value, 32);
-                       return;
-               }
-
-               for (unsigned f = 0; f < reg->num_fields; f++) {
-                       const struct si_field *field = sid_fields_table + reg->fields_offset + f;
-                       const int *values_offsets = sid_strings_offsets + field->values_offset;
-                       uint32_t val = (value & field->mask) >>
-                                      (ffs(field->mask) - 1);
-
-                       if (!(field->mask & field_mask))
-                               continue;
-
-                       /* Indent the field. */
-                       if (!first_field)
-                               print_spaces(file,
-                                            INDENT_PKT + strlen(reg_name) + 4);
-
-                       /* Print the field. */
-                       fprintf(file, "%s = ", sid_strings + field->name_offset);
-
-                       if (val < field->num_values && values_offsets[val] >= 0)
-                               fprintf(file, "%s\n", sid_strings + values_offsets[val]);
-                       else
-                               print_value(file, val,
-                                           util_bitcount(field->mask));
-
-                       first_field = false;
-               }
-               return;
-       }
-
-       print_spaces(file, INDENT_PKT);
-       fprintf(file, COLOR_YELLOW "0x%05x" COLOR_RESET " <- 0x%08x\n", offset, value);
+   const struct si_reg *reg = find_register(chip_class, offset);
+
+   if (reg) {
+      const char *reg_name = sid_strings + reg->name_offset;
+      bool first_field = true;
+
+      print_spaces(file, INDENT_PKT);
+      fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", reg_name);
+
+      if (!reg->num_fields) {
+         print_value(file, value, 32);
+         return;
+      }
+
+      for (unsigned f = 0; f < reg->num_fields; f++) {
+         const struct si_field *field = sid_fields_table + reg->fields_offset + f;
+         const int *values_offsets = sid_strings_offsets + field->values_offset;
+         uint32_t val = (value & field->mask) >> (ffs(field->mask) - 1);
+
+         if (!(field->mask & field_mask))
+            continue;
+
+         /* Indent the field. */
+         if (!first_field)
+            print_spaces(file, INDENT_PKT + strlen(reg_name) + 4);
+
+         /* Print the field. */
+         fprintf(file, "%s = ", sid_strings + field->name_offset);
+
+         if (val < field->num_values && values_offsets[val] >= 0)
+            fprintf(file, "%s\n", sid_strings + values_offsets[val]);
+         else
+            print_value(file, val, util_bitcount(field->mask));
+
+         first_field = false;
+      }
+      return;
+   }
+
+   print_spaces(file, INDENT_PKT);
+   fprintf(file, COLOR_YELLOW "0x%05x" COLOR_RESET " <- 0x%08x\n", offset, value);
 }
 
 static uint32_t ac_ib_get(struct ac_ib_parser *ib)
 {
-       uint32_t v = 0;
+   uint32_t v = 0;
 
-       if (ib->cur_dw < ib->num_dw) {
-               v = ib->ib[ib->cur_dw];
+   if (ib->cur_dw < ib->num_dw) {
+      v = ib->ib[ib->cur_dw];
 #ifdef HAVE_VALGRIND
-               /* Help figure out where garbage data is written to IBs.
-                *
-                * Arguably we should do this already when the IBs are written,
-                * see RADEON_VALGRIND. The problem is that client-requests to
-                * Valgrind have an overhead even when Valgrind isn't running,
-                * and radeon_emit is performance sensitive...
-                */
-               if (VALGRIND_CHECK_VALUE_IS_DEFINED(v))
-                       fprintf(ib->f, COLOR_RED "Valgrind: The next DWORD is garbage"
-                               COLOR_RESET "\n");
+      /* Help figure out where garbage data is written to IBs.
+       *
+       * Arguably we should do this already when the IBs are written,
+       * see RADEON_VALGRIND. The problem is that client-requests to
+       * Valgrind have an overhead even when Valgrind isn't running,
+       * and radeon_emit is performance sensitive...
+       */
+      if (VALGRIND_CHECK_VALUE_IS_DEFINED(v))
+         fprintf(ib->f, COLOR_RED "Valgrind: The next DWORD is garbage" COLOR_RESET "\n");
 #endif
-               fprintf(ib->f, "\n\035#%08x ", v);
-       } else {
-               fprintf(ib->f, "\n\035#???????? ");
-       }
+      fprintf(ib->f, "\n\035#%08x ", v);
+   } else {
+      fprintf(ib->f, "\n\035#???????? ");
+   }
 
-       ib->cur_dw++;
-       return v;
+   ib->cur_dw++;
+   return v;
 }
 
 static void ac_parse_set_reg_packet(FILE *f, unsigned count, unsigned reg_offset,
-                                   struct ac_ib_parser *ib)
+                                    struct ac_ib_parser *ib)
 {
-       unsigned reg_dw = ac_ib_get(ib);
-       unsigned reg = ((reg_dw & 0xFFFF) << 2) + reg_offset;
-       unsigned index = reg_dw >> 28;
-       int i;
-
-       if (index != 0) {
-               print_spaces(f, INDENT_PKT);
-               fprintf(f, "INDEX = %u\n", index);
-       }
-
-       for (i = 0; i < count; i++)
-               ac_dump_reg(f, ib->chip_class, reg + i*4, ac_ib_get(ib), ~0);
+   unsigned reg_dw = ac_ib_get(ib);
+   unsigned reg = ((reg_dw & 0xFFFF) << 2) + reg_offset;
+   unsigned index = reg_dw >> 28;
+   int i;
+
+   if (index != 0) {
+      print_spaces(f, INDENT_PKT);
+      fprintf(f, "INDEX = %u\n", index);
+   }
+
+   for (i = 0; i < count; i++)
+      ac_dump_reg(f, ib->chip_class, reg + i * 4, ac_ib_get(ib), ~0);
 }
 
 static void ac_parse_packet3(FILE *f, uint32_t header, struct ac_ib_parser *ib,
                              int *current_trace_id)
 {
-       unsigned first_dw = ib->cur_dw;
-       int count = PKT_COUNT_G(header);
-       unsigned op = PKT3_IT_OPCODE_G(header);
-       const char *predicate = PKT3_PREDICATE(header) ? "(predicate)" : "";
-       int i;
-
-       /* Print the name first. */
-       for (i = 0; i < ARRAY_SIZE(packet3_table); i++)
-               if (packet3_table[i].op == op)
-                       break;
-
-       if (i < ARRAY_SIZE(packet3_table)) {
-               const char *name = sid_strings + packet3_table[i].name_offset;
-
-               if (op == PKT3_SET_CONTEXT_REG ||
-                   op == PKT3_SET_CONFIG_REG ||
-                   op == PKT3_SET_UCONFIG_REG ||
-                   op == PKT3_SET_UCONFIG_REG_INDEX ||
-                   op == PKT3_SET_SH_REG)
-                       fprintf(f, COLOR_CYAN "%s%s" COLOR_CYAN ":\n",
-                               name, predicate);
-               else
-                       fprintf(f, COLOR_GREEN "%s%s" COLOR_RESET ":\n",
-                               name, predicate);
-       } else
-               fprintf(f, COLOR_RED "PKT3_UNKNOWN 0x%x%s" COLOR_RESET ":\n",
-                       op, predicate);
-
-       /* Print the contents. */
-       switch (op) {
-       case PKT3_SET_CONTEXT_REG:
-               ac_parse_set_reg_packet(f, count, SI_CONTEXT_REG_OFFSET, ib);
-               break;
-       case PKT3_SET_CONFIG_REG:
-               ac_parse_set_reg_packet(f, count, SI_CONFIG_REG_OFFSET, ib);
-               break;
-       case PKT3_SET_UCONFIG_REG:
-       case PKT3_SET_UCONFIG_REG_INDEX:
-               ac_parse_set_reg_packet(f, count, CIK_UCONFIG_REG_OFFSET, ib);
-               break;
-       case PKT3_SET_SH_REG:
-               ac_parse_set_reg_packet(f, count, SI_SH_REG_OFFSET, ib);
-               break;
-       case PKT3_ACQUIRE_MEM:
-               ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_030230_CP_COHER_SIZE_HI, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0301E4_CP_COHER_BASE_HI, ac_ib_get(ib), ~0);
-               print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
-               if (ib->chip_class >= GFX10)
-                       ac_dump_reg(f, ib->chip_class, R_586_GCR_CNTL, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_SURFACE_SYNC:
-               if (ib->chip_class >= GFX7) {
-                       ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
-                       ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
-                       ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
-               } else {
-                       ac_dump_reg(f, ib->chip_class, R_0085F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
-                       ac_dump_reg(f, ib->chip_class, R_0085F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
-                       ac_dump_reg(f, ib->chip_class, R_0085F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
-               }
-               print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
-               break;
-       case PKT3_EVENT_WRITE: {
-               uint32_t event_dw = ac_ib_get(ib);
-               ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
-                           S_028A90_EVENT_TYPE(~0));
-               print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
-               print_named_value(f, "INV_L2", (event_dw >> 20) & 0x1, 1);
-               if (count > 0) {
-                       print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
-                       print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 16);
-               }
-               break;
-       }
-       case PKT3_EVENT_WRITE_EOP: {
-               uint32_t event_dw = ac_ib_get(ib);
-               ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
-                           S_028A90_EVENT_TYPE(~0));
-               print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
-               print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1);
-               print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1);
-               print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1);
-               print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1);
-               print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1);
-               print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
-               uint32_t addr_hi_dw = ac_ib_get(ib);
-               print_named_value(f, "ADDRESS_HI", addr_hi_dw, 16);
-               print_named_value(f, "DST_SEL", (addr_hi_dw >> 16) & 0x3, 2);
-               print_named_value(f, "INT_SEL", (addr_hi_dw >> 24) & 0x7, 3);
-               print_named_value(f, "DATA_SEL", addr_hi_dw >> 29, 3);
-               print_named_value(f, "DATA_LO", ac_ib_get(ib), 32);
-               print_named_value(f, "DATA_HI", ac_ib_get(ib), 32);
-               break;
-       }
-       case PKT3_RELEASE_MEM: {
-               uint32_t event_dw = ac_ib_get(ib);
-               if (ib->chip_class >= GFX10) {
-                       ac_dump_reg(f, ib->chip_class, R_490_RELEASE_MEM_OP, event_dw, ~0u);
-               } else {
-                       ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
-                                   S_028A90_EVENT_TYPE(~0));
-                       print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
-                       print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1);
-                       print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1);
-                       print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1);
-                       print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1);
-                       print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1);
-                       print_named_value(f, "TC_NC_ACTION_ENA", (event_dw >> 19) & 0x1, 1);
-                       print_named_value(f, "TC_WC_ACTION_ENA", (event_dw >> 20) & 0x1, 1);
-                       print_named_value(f, "TC_MD_ACTION_ENA", (event_dw >> 21) & 0x1, 1);
-               }
-               uint32_t sel_dw = ac_ib_get(ib);
-               print_named_value(f, "DST_SEL", (sel_dw >> 16) & 0x3, 2);
-               print_named_value(f, "INT_SEL", (sel_dw >> 24) & 0x7, 3);
-               print_named_value(f, "DATA_SEL", sel_dw >> 29, 3);
-               print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
-               print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32);
-               print_named_value(f, "DATA_LO", ac_ib_get(ib), 32);
-               print_named_value(f, "DATA_HI", ac_ib_get(ib), 32);
-               print_named_value(f, "CTXID", ac_ib_get(ib), 32);
-               break;
-       }
-       case PKT3_WAIT_REG_MEM:
-               print_named_value(f, "OP", ac_ib_get(ib), 32);
-               print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
-               print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32);
-               print_named_value(f, "REF", ac_ib_get(ib), 32);
-               print_named_value(f, "MASK", ac_ib_get(ib), 32);
-               print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
-               break;
-       case PKT3_DRAW_INDEX_AUTO:
-               ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_DRAW_INDEX_2:
-               ac_dump_reg(f, ib->chip_class, R_028A78_VGT_DMA_MAX_SIZE, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0287E8_VGT_DMA_BASE, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0287E4_VGT_DMA_BASE_HI, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_INDEX_TYPE:
-               ac_dump_reg(f, ib->chip_class, R_028A7C_VGT_DMA_INDEX_TYPE, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_NUM_INSTANCES:
-               ac_dump_reg(f, ib->chip_class, R_030934_VGT_NUM_INSTANCES, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_WRITE_DATA:
-               ac_dump_reg(f, ib->chip_class, R_370_CONTROL, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_371_DST_ADDR_LO, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_372_DST_ADDR_HI, ac_ib_get(ib), ~0);
-               /* The payload is written automatically */
-               break;
-       case PKT3_CP_DMA:
-               ac_dump_reg(f, ib->chip_class, R_410_CP_DMA_WORD0, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_411_CP_DMA_WORD1, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_412_CP_DMA_WORD2, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_413_CP_DMA_WORD3, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_DMA_DATA:
-               ac_dump_reg(f, ib->chip_class, R_500_DMA_DATA_WORD0, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_501_SRC_ADDR_LO, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_502_SRC_ADDR_HI, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_503_DST_ADDR_LO, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_504_DST_ADDR_HI, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_INDIRECT_BUFFER_SI:
-       case PKT3_INDIRECT_BUFFER_CONST:
-       case PKT3_INDIRECT_BUFFER_CIK: {
-               uint32_t base_lo_dw = ac_ib_get(ib);
-               ac_dump_reg(f, ib->chip_class, R_3F0_IB_BASE_LO, base_lo_dw, ~0);
-               uint32_t base_hi_dw = ac_ib_get(ib);
-               ac_dump_reg(f, ib->chip_class, R_3F1_IB_BASE_HI, base_hi_dw, ~0);
-               uint32_t control_dw = ac_ib_get(ib);
-               ac_dump_reg(f, ib->chip_class, R_3F2_IB_CONTROL, control_dw, ~0);
-
-               if (!ib->addr_callback)
-                       break;
-
-               uint64_t addr = ((uint64_t)base_hi_dw << 32) | base_lo_dw;
-               void *data = ib->addr_callback(ib->addr_callback_data, addr);
-               if (!data)
-                       break;
-
-               if (G_3F2_CHAIN(control_dw)) {
-                       ib->ib = data;
-                       ib->num_dw = G_3F2_IB_SIZE(control_dw);
-                       ib->cur_dw = 0;
-                       return;
-               }
-
-               struct ac_ib_parser ib_recurse;
-               memcpy(&ib_recurse, ib, sizeof(ib_recurse));
-               ib_recurse.ib = data;
-               ib_recurse.num_dw = G_3F2_IB_SIZE(control_dw);
-               ib_recurse.cur_dw = 0;
-               if(ib_recurse.trace_id_count) {
-                       if (*current_trace_id == *ib->trace_ids) {
-                               ++ib_recurse.trace_ids;
-                               --ib_recurse.trace_id_count;
-                       } else {
-                               ib_recurse.trace_id_count = 0;
-                       }
-               }
-
-               fprintf(f, "\n\035>------------------ nested begin ------------------\n");
-               ac_do_parse_ib(f, &ib_recurse);
-               fprintf(f, "\n\035<------------------- nested end -------------------\n");
-               break;
-       }
-       case PKT3_CLEAR_STATE:
-       case PKT3_INCREMENT_DE_COUNTER:
-       case PKT3_PFP_SYNC_ME:
-               break;
-       case PKT3_NOP:
-               if (header == PKT3_NOP_PAD) {
-                       count = -1; /* One dword NOP. */
-               } else if (count == 0 && ib->cur_dw < ib->num_dw &&
-                          AC_IS_TRACE_POINT(ib->ib[ib->cur_dw])) {
-                       unsigned packet_id = AC_GET_TRACE_POINT_ID(ib->ib[ib->cur_dw]);
-
-                       print_spaces(f, INDENT_PKT);
-                       fprintf(f, COLOR_RED "Trace point ID: %u\n", packet_id);
-
-                       if (!ib->trace_id_count)
-                               break; /* tracing was disabled */
-
-                       *current_trace_id = packet_id;
-
-                       print_spaces(f, INDENT_PKT);
-                       if (packet_id < *ib->trace_ids)
-                               fprintf(f, COLOR_RED
-                                       "This trace point was reached by the CP."
-                                       COLOR_RESET "\n");
-                       else if (packet_id == *ib->trace_ids)
-                               fprintf(f, COLOR_RED
-                                       "!!!!! This is the last trace point that "
-                                       "was reached by the CP !!!!!"
-                                       COLOR_RESET "\n");
-                       else if (packet_id+1 == *ib->trace_ids)
-                               fprintf(f, COLOR_RED
-                                       "!!!!! This is the first trace point that "
-                                       "was NOT been reached by the CP !!!!!"
-                                       COLOR_RESET "\n");
-                       else
-                               fprintf(f, COLOR_RED
-                                       "!!!!! This trace point was NOT reached "
-                                       "by the CP !!!!!"
-                                       COLOR_RESET "\n");
-                       break;
-               }
-               break;
-       }
-
-       /* print additional dwords */
-       while (ib->cur_dw <= first_dw + count)
-               ac_ib_get(ib);
-
-       if (ib->cur_dw > first_dw + count + 1)
-               fprintf(f, COLOR_RED "\n!!!!! count in header too low !!!!!"
-                       COLOR_RESET "\n");
+   unsigned first_dw = ib->cur_dw;
+   int count = PKT_COUNT_G(header);
+   unsigned op = PKT3_IT_OPCODE_G(header);
+   const char *predicate = PKT3_PREDICATE(header) ? "(predicate)" : "";
+   int i;
+
+   /* Print the name first. */
+   for (i = 0; i < ARRAY_SIZE(packet3_table); i++)
+      if (packet3_table[i].op == op)
+         break;
+
+   if (i < ARRAY_SIZE(packet3_table)) {
+      const char *name = sid_strings + packet3_table[i].name_offset;
+
+      if (op == PKT3_SET_CONTEXT_REG || op == PKT3_SET_CONFIG_REG || op == PKT3_SET_UCONFIG_REG ||
+          op == PKT3_SET_UCONFIG_REG_INDEX || op == PKT3_SET_SH_REG)
+         fprintf(f, COLOR_CYAN "%s%s" COLOR_CYAN ":\n", name, predicate);
+      else
+         fprintf(f, COLOR_GREEN "%s%s" COLOR_RESET ":\n", name, predicate);
+   } else
+      fprintf(f, COLOR_RED "PKT3_UNKNOWN 0x%x%s" COLOR_RESET ":\n", op, predicate);
+
+   /* Print the contents. */
+   switch (op) {
+   case PKT3_SET_CONTEXT_REG:
+      ac_parse_set_reg_packet(f, count, SI_CONTEXT_REG_OFFSET, ib);
+      break;
+   case PKT3_SET_CONFIG_REG:
+      ac_parse_set_reg_packet(f, count, SI_CONFIG_REG_OFFSET, ib);
+      break;
+   case PKT3_SET_UCONFIG_REG:
+   case PKT3_SET_UCONFIG_REG_INDEX:
+      ac_parse_set_reg_packet(f, count, CIK_UCONFIG_REG_OFFSET, ib);
+      break;
+   case PKT3_SET_SH_REG:
+      ac_parse_set_reg_packet(f, count, SI_SH_REG_OFFSET, ib);
+      break;
+   case PKT3_ACQUIRE_MEM:
+      ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_030230_CP_COHER_SIZE_HI, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0301E4_CP_COHER_BASE_HI, ac_ib_get(ib), ~0);
+      print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
+      if (ib->chip_class >= GFX10)
+         ac_dump_reg(f, ib->chip_class, R_586_GCR_CNTL, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_SURFACE_SYNC:
+      if (ib->chip_class >= GFX7) {
+         ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
+         ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
+         ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
+      } else {
+         ac_dump_reg(f, ib->chip_class, R_0085F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
+         ac_dump_reg(f, ib->chip_class, R_0085F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
+         ac_dump_reg(f, ib->chip_class, R_0085F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
+      }
+      print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
+      break;
+   case PKT3_EVENT_WRITE: {
+      uint32_t event_dw = ac_ib_get(ib);
+      ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
+                  S_028A90_EVENT_TYPE(~0));
+      print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
+      print_named_value(f, "INV_L2", (event_dw >> 20) & 0x1, 1);
+      if (count > 0) {
+         print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
+         print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 16);
+      }
+      break;
+   }
+   case PKT3_EVENT_WRITE_EOP: {
+      uint32_t event_dw = ac_ib_get(ib);
+      ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
+                  S_028A90_EVENT_TYPE(~0));
+      print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
+      print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1);
+      print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1);
+      print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1);
+      print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1);
+      print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1);
+      print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
+      uint32_t addr_hi_dw = ac_ib_get(ib);
+      print_named_value(f, "ADDRESS_HI", addr_hi_dw, 16);
+      print_named_value(f, "DST_SEL", (addr_hi_dw >> 16) & 0x3, 2);
+      print_named_value(f, "INT_SEL", (addr_hi_dw >> 24) & 0x7, 3);
+      print_named_value(f, "DATA_SEL", addr_hi_dw >> 29, 3);
+      print_named_value(f, "DATA_LO", ac_ib_get(ib), 32);
+      print_named_value(f, "DATA_HI", ac_ib_get(ib), 32);
+      break;
+   }
+   case PKT3_RELEASE_MEM: {
+      uint32_t event_dw = ac_ib_get(ib);
+      if (ib->chip_class >= GFX10) {
+         ac_dump_reg(f, ib->chip_class, R_490_RELEASE_MEM_OP, event_dw, ~0u);
+      } else {
+         ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
+                     S_028A90_EVENT_TYPE(~0));
+         print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
+         print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1);
+         print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1);
+         print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1);
+         print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1);
+         print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1);
+         print_named_value(f, "TC_NC_ACTION_ENA", (event_dw >> 19) & 0x1, 1);
+         print_named_value(f, "TC_WC_ACTION_ENA", (event_dw >> 20) & 0x1, 1);
+         print_named_value(f, "TC_MD_ACTION_ENA", (event_dw >> 21) & 0x1, 1);
+      }
+      uint32_t sel_dw = ac_ib_get(ib);
+      print_named_value(f, "DST_SEL", (sel_dw >> 16) & 0x3, 2);
+      print_named_value(f, "INT_SEL", (sel_dw >> 24) & 0x7, 3);
+      print_named_value(f, "DATA_SEL", sel_dw >> 29, 3);
+      print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
+      print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32);
+      print_named_value(f, "DATA_LO", ac_ib_get(ib), 32);
+      print_named_value(f, "DATA_HI", ac_ib_get(ib), 32);
+      print_named_value(f, "CTXID", ac_ib_get(ib), 32);
+      break;
+   }
+   case PKT3_WAIT_REG_MEM:
+      print_named_value(f, "OP", ac_ib_get(ib), 32);
+      print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
+      print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32);
+      print_named_value(f, "REF", ac_ib_get(ib), 32);
+      print_named_value(f, "MASK", ac_ib_get(ib), 32);
+      print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
+      break;
+   case PKT3_DRAW_INDEX_AUTO:
+      ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_DRAW_INDEX_2:
+      ac_dump_reg(f, ib->chip_class, R_028A78_VGT_DMA_MAX_SIZE, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0287E8_VGT_DMA_BASE, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0287E4_VGT_DMA_BASE_HI, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_INDEX_TYPE:
+      ac_dump_reg(f, ib->chip_class, R_028A7C_VGT_DMA_INDEX_TYPE, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_NUM_INSTANCES:
+      ac_dump_reg(f, ib->chip_class, R_030934_VGT_NUM_INSTANCES, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_WRITE_DATA:
+      ac_dump_reg(f, ib->chip_class, R_370_CONTROL, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_371_DST_ADDR_LO, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_372_DST_ADDR_HI, ac_ib_get(ib), ~0);
+      /* The payload is written automatically */
+      break;
+   case PKT3_CP_DMA:
+      ac_dump_reg(f, ib->chip_class, R_410_CP_DMA_WORD0, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_411_CP_DMA_WORD1, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_412_CP_DMA_WORD2, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_413_CP_DMA_WORD3, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_DMA_DATA:
+      ac_dump_reg(f, ib->chip_class, R_500_DMA_DATA_WORD0, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_501_SRC_ADDR_LO, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_502_SRC_ADDR_HI, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_503_DST_ADDR_LO, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_504_DST_ADDR_HI, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_INDIRECT_BUFFER_SI:
+   case PKT3_INDIRECT_BUFFER_CONST:
+   case PKT3_INDIRECT_BUFFER_CIK: {
+      uint32_t base_lo_dw = ac_ib_get(ib);
+      ac_dump_reg(f, ib->chip_class, R_3F0_IB_BASE_LO, base_lo_dw, ~0);
+      uint32_t base_hi_dw = ac_ib_get(ib);
+      ac_dump_reg(f, ib->chip_class, R_3F1_IB_BASE_HI, base_hi_dw, ~0);
+      uint32_t control_dw = ac_ib_get(ib);
+      ac_dump_reg(f, ib->chip_class, R_3F2_IB_CONTROL, control_dw, ~0);
+
+      if (!ib->addr_callback)
+         break;
+
+      uint64_t addr = ((uint64_t)base_hi_dw << 32) | base_lo_dw;
+      void *data = ib->addr_callback(ib->addr_callback_data, addr);
+      if (!data)
+         break;
+
+      if (G_3F2_CHAIN(control_dw)) {
+         ib->ib = data;
+         ib->num_dw = G_3F2_IB_SIZE(control_dw);
+         ib->cur_dw = 0;
+         return;
+      }
+
+      struct ac_ib_parser ib_recurse;
+      memcpy(&ib_recurse, ib, sizeof(ib_recurse));
+      ib_recurse.ib = data;
+      ib_recurse.num_dw = G_3F2_IB_SIZE(control_dw);
+      ib_recurse.cur_dw = 0;
+      if (ib_recurse.trace_id_count) {
+         if (*current_trace_id == *ib->trace_ids) {
+            ++ib_recurse.trace_ids;
+            --ib_recurse.trace_id_count;
+         } else {
+            ib_recurse.trace_id_count = 0;
+         }
+      }
+
+      fprintf(f, "\n\035>------------------ nested begin ------------------\n");
+      ac_do_parse_ib(f, &ib_recurse);
+      fprintf(f, "\n\035<------------------- nested end -------------------\n");
+      break;
+   }
+   case PKT3_CLEAR_STATE:
+   case PKT3_INCREMENT_DE_COUNTER:
+   case PKT3_PFP_SYNC_ME:
+      break;
+   case PKT3_NOP:
+      if (header == PKT3_NOP_PAD) {
+         count = -1; /* One dword NOP. */
+      } else if (count == 0 && ib->cur_dw < ib->num_dw && AC_IS_TRACE_POINT(ib->ib[ib->cur_dw])) {
+         unsigned packet_id = AC_GET_TRACE_POINT_ID(ib->ib[ib->cur_dw]);
+
+         print_spaces(f, INDENT_PKT);
+         fprintf(f, COLOR_RED "Trace point ID: %u\n", packet_id);
+
+         if (!ib->trace_id_count)
+            break; /* tracing was disabled */
+
+         *current_trace_id = packet_id;
+
+         print_spaces(f, INDENT_PKT);
+         if (packet_id < *ib->trace_ids)
+            fprintf(f, COLOR_RED "This trace point was reached by the CP." COLOR_RESET "\n");
+         else if (packet_id == *ib->trace_ids)
+            fprintf(f, COLOR_RED "!!!!! This is the last trace point that "
+                                 "was reached by the CP !!!!!" COLOR_RESET "\n");
+         else if (packet_id + 1 == *ib->trace_ids)
+            fprintf(f, COLOR_RED "!!!!! This is the first trace point that "
+                                 "was NOT been reached by the CP !!!!!" COLOR_RESET "\n");
+         else
+            fprintf(f, COLOR_RED "!!!!! This trace point was NOT reached "
+                                 "by the CP !!!!!" COLOR_RESET "\n");
+         break;
+      }
+      break;
+   }
+
+   /* print additional dwords */
+   while (ib->cur_dw <= first_dw + count)
+      ac_ib_get(ib);
+
+   if (ib->cur_dw > first_dw + count + 1)
+      fprintf(f, COLOR_RED "\n!!!!! count in header too low !!!!!" COLOR_RESET "\n");
 }
 
 /**
@@ -517,65 +494,65 @@ static void ac_parse_packet3(FILE *f, uint32_t header, struct ac_ib_parser *ib,
  */
 static void ac_do_parse_ib(FILE *f, struct ac_ib_parser *ib)
 {
-       int current_trace_id = -1;
-
-       while (ib->cur_dw < ib->num_dw) {
-               uint32_t header = ac_ib_get(ib);
-               unsigned type = PKT_TYPE_G(header);
-
-               switch (type) {
-               case 3:
-                       ac_parse_packet3(f, header, ib, &current_trace_id);
-                       break;
-               case 2:
-                       /* type-2 nop */
-                       if (header == 0x80000000) {
-                               fprintf(f, COLOR_GREEN "NOP (type 2)" COLOR_RESET "\n");
-                               break;
-                       }
-                       /* fall through */
-               default:
-                       fprintf(f, "Unknown packet type %i\n", type);
-                       break;
-               }
-       }
+   int current_trace_id = -1;
+
+   while (ib->cur_dw < ib->num_dw) {
+      uint32_t header = ac_ib_get(ib);
+      unsigned type = PKT_TYPE_G(header);
+
+      switch (type) {
+      case 3:
+         ac_parse_packet3(f, header, ib, &current_trace_id);
+         break;
+      case 2:
+         /* type-2 nop */
+         if (header == 0x80000000) {
+            fprintf(f, COLOR_GREEN "NOP (type 2)" COLOR_RESET "\n");
+            break;
+         }
+         /* fall through */
+      default:
+         fprintf(f, "Unknown packet type %i\n", type);
+         break;
+      }
+   }
 }
 
 static void format_ib_output(FILE *f, char *out)
 {
-       unsigned depth = 0;
+   unsigned depth = 0; /* nesting level: raised after a '>' line, lowered before a '<' line */
 
-       for (;;) {
-               char op = 0;
+   for (;;) { /* each iteration prints one '\n'-terminated line of "out" */
+      char op = 0; /* character following a \035 marker; stays 0 when the line has none */
 
-               if (out[0] == '\n' && out[1] == '\035')
-                       out++;
-               if (out[0] == '\035') {
-                       op = out[1];
-                       out += 2;
-               }
+      if (out[0] == '\n' && out[1] == '\035')
+         out++; /* drop the newline that sits right before a marker */
+      if (out[0] == '\035') {
+         op = out[1];
+         out += 2; /* the marker and its op character are not printed */
+      }
 
-               if (op == '<')
-                       depth--;
+      if (op == '<')
+         depth--; /* applied before printing, so the '<' line uses the outer indent */
 
-               unsigned indent = 4 * depth;
-               if (op != '#')
-                       indent += 9;
+      unsigned indent = 4 * depth;
+      if (op != '#')
+         indent += 9; /* every line except those marked '#' gets 9 more columns */
 
-               if (indent)
-                       print_spaces(f, indent);
+      if (indent)
+         print_spaces(f, indent);
 
-               char *end = strchrnul(out, '\n');
-               fwrite(out, end - out, 1, f);
-               fputc('\n', f); /* always end with a new line */
-               if (!*end)
-                       break;
+      char *end = strchrnul(out, '\n'); /* points at the '\n' or at the final NUL */
+      fwrite(out, end - out, 1, f);
+      fputc('\n', f); /* always end with a new line */
+      if (!*end)
+         break; /* reached the terminating NUL: all of "out" was printed */
 
-               out = end + 1;
+      out = end + 1;
 
-               if (op == '>')
-                       depth++;
-       }
+      if (op == '>')
+         depth++; /* applied after printing, so the '>' line uses the outer indent */
+   }
 }
 
 /**
@@ -593,34 +570,34 @@ static void format_ib_output(FILE *f, char *out)
  * \param addr_callback_data user data for addr_callback
  */
 void ac_parse_ib_chunk(FILE *f, uint32_t *ib_ptr, int num_dw, const int *trace_ids,
-                      unsigned trace_id_count, enum chip_class chip_class,
+                       unsigned trace_id_count, enum chip_class chip_class,
                        ac_debug_addr_callback addr_callback, void *addr_callback_data)
 {
-       struct ac_ib_parser ib = {};
-       ib.ib = ib_ptr;
-       ib.num_dw = num_dw;
-       ib.trace_ids = trace_ids;
-       ib.trace_id_count = trace_id_count;
-       ib.chip_class = chip_class;
-       ib.addr_callback = addr_callback;
-       ib.addr_callback_data = addr_callback_data;
-
-       char *out;
-       size_t outsize;
-       FILE *memf = open_memstream(&out, &outsize);
-       ib.f = memf;
-       ac_do_parse_ib(memf, &ib);
-       fclose(memf);
-
-       if (out) {
-               format_ib_output(f, out);
-               free(out);
-       }
-
-       if (ib.cur_dw > ib.num_dw) {
-               printf("\nPacket ends after the end of IB.\n");
-               exit(1);
-       }
+   struct ac_ib_parser ib = {};
+   ib.ib = ib_ptr;
+   ib.num_dw = num_dw;
+   ib.trace_ids = trace_ids;
+   ib.trace_id_count = trace_id_count;
+   ib.chip_class = chip_class;
+   ib.addr_callback = addr_callback;
+   ib.addr_callback_data = addr_callback_data;
+
+   char *out;
+   size_t outsize;
+   FILE *memf = open_memstream(&out, &outsize);
+   ib.f = memf;
+   ac_do_parse_ib(memf, &ib);
+   fclose(memf);
+
+   if (out) {
+      format_ib_output(f, out);
+      free(out);
+   }
+
+   if (ib.cur_dw > ib.num_dw) {
+      printf("\nPacket ends after the end of IB.\n");
+      exit(1);
+   }
 }
 
 /**
@@ -637,17 +614,16 @@ void ac_parse_ib_chunk(FILE *f, uint32_t *ib_ptr, int num_dw, const int *trace_i
  *                      be NULL.
  * \param addr_callback_data user data for addr_callback
  */
-void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids,
-                unsigned trace_id_count, const char *name,
-                enum chip_class chip_class, ac_debug_addr_callback addr_callback,
-                void *addr_callback_data)
+void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids, unsigned trace_id_count,
+                 const char *name, enum chip_class chip_class, ac_debug_addr_callback addr_callback,
+                 void *addr_callback_data)
 {
-       fprintf(f, "------------------ %s begin ------------------\n", name);
+   fprintf(f, "------------------ %s begin ------------------\n", name); /* opening banner */
 
-       ac_parse_ib_chunk(f, ib, num_dw, trace_ids, trace_id_count,
-                         chip_class, addr_callback,  addr_callback_data);
+   ac_parse_ib_chunk(f, ib, num_dw, trace_ids, trace_id_count, chip_class, addr_callback,
+                     addr_callback_data); /* the dump itself; "name" is only used here */
 
-       fprintf(f, "------------------- %s end -------------------\n\n", name);
+   fprintf(f, "------------------- %s end -------------------\n\n", name); /* closing banner */
 }
 
 /**
@@ -657,179 +633,176 @@ void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids,
  * \param old_dmesg_timestamp  previous dmesg timestamp parsed at init time
  * \param out_addr             detected VM fault addr
  */
-bool ac_vm_fault_occured(enum chip_class chip_class,
-                        uint64_t *old_dmesg_timestamp, uint64_t *out_addr)
+bool ac_vm_fault_occured(enum chip_class chip_class, uint64_t *old_dmesg_timestamp,
+                         uint64_t *out_addr)
 {
-       char line[2000];
-       unsigned sec, usec;
-       int progress = 0;
-       uint64_t dmesg_timestamp = 0;
-       bool fault = false;
-
-       FILE *p = popen("dmesg", "r");
-       if (!p)
-               return false;
-
-       while (fgets(line, sizeof(line), p)) {
-               char *msg, len;
-
-               if (!line[0] || line[0] == '\n')
-                       continue;
-
-               /* Get the timestamp. */
-               if (sscanf(line, "[%u.%u]", &sec, &usec) != 2) {
-                       static bool hit = false;
-                       if (!hit) {
-                               fprintf(stderr, "%s: failed to parse line '%s'\n",
-                                       __func__, line);
-                               hit = true;
-                       }
-                       continue;
-               }
-               dmesg_timestamp = sec * 1000000ull + usec;
-
-               /* If just updating the timestamp. */
-               if (!out_addr)
-                       continue;
-
-               /* Process messages only if the timestamp is newer. */
-               if (dmesg_timestamp <= *old_dmesg_timestamp)
-                       continue;
-
-               /* Only process the first VM fault. */
-               if (fault)
-                       continue;
-
-               /* Remove trailing \n */
-               len = strlen(line);
-               if (len && line[len-1] == '\n')
-                       line[len-1] = 0;
-
-               /* Get the message part. */
-               msg = strchr(line, ']');
-               if (!msg)
-                       continue;
-               msg++;
-
-               const char *header_line, *addr_line_prefix, *addr_line_format;
-
-               if (chip_class >= GFX9) {
-                       /* Match this:
-                        * ..: [gfxhub] VMC page fault (src_id:0 ring:158 vm_id:2 pas_id:0)
-                        * ..:   at page 0x0000000219f8f000 from 27
-                        * ..: VM_L2_PROTECTION_FAULT_STATUS:0x0020113C
-                        */
-                       header_line = "VMC page fault";
-                       addr_line_prefix = "   at page";
-                       addr_line_format = "%"PRIx64;
-               } else {
-                       header_line = "GPU fault detected:";
-                       addr_line_prefix = "VM_CONTEXT1_PROTECTION_FAULT_ADDR";
-                       addr_line_format = "%"PRIX64;
-               }
-
-               switch (progress) {
-               case 0:
-                       if (strstr(msg, header_line))
-                               progress = 1;
-                       break;
-               case 1:
-                       msg = strstr(msg, addr_line_prefix);
-                       if (msg) {
-                               msg = strstr(msg, "0x");
-                               if (msg) {
-                                       msg += 2;
-                                       if (sscanf(msg, addr_line_format, out_addr) == 1)
-                                               fault = true;
-                               }
-                       }
-                       progress = 0;
-                       break;
-               default:
-                       progress = 0;
-               }
-       }
-       pclose(p);
-
-       if (dmesg_timestamp > *old_dmesg_timestamp)
-               *old_dmesg_timestamp = dmesg_timestamp;
-
-       return fault;
+   char line[2000];
+   unsigned sec, usec;
+   int progress = 0; /* 0: searching for the fault header, 1: expecting the address line */
+   uint64_t dmesg_timestamp = 0;
+   bool fault = false;
+
+   FILE *p = popen("dmesg", "r");
+   if (!p)
+      return false;
+
+   while (fgets(line, sizeof(line), p)) {
+      char *msg;
+
+      if (!line[0] || line[0] == '\n')
+         continue;
+
+      /* Get the timestamp. */
+      if (sscanf(line, "[%u.%u]", &sec, &usec) != 2) {
+         static bool hit = false;
+         if (!hit) {
+            fprintf(stderr, "%s: failed to parse line '%s'\n", __func__, line);
+            hit = true;
+         }
+         continue;
+      }
+      dmesg_timestamp = sec * 1000000ull + usec;
+
+      /* If just updating the timestamp. */
+      if (!out_addr)
+         continue;
+
+      /* Process messages only if the timestamp is newer. */
+      if (dmesg_timestamp <= *old_dmesg_timestamp)
+         continue;
+
+      /* Only process the first VM fault. */
+      if (fault)
+         continue;
+
+      /* Remove trailing \n. len is a size_t: a char cannot hold lengths up to 1999. */
+      size_t len = strlen(line);
+      if (len && line[len - 1] == '\n')
+         line[len - 1] = 0;
+
+      /* Get the message part. */
+      msg = strchr(line, ']');
+      if (!msg)
+         continue;
+      msg++;
+
+      const char *header_line, *addr_line_prefix, *addr_line_format;
+
+      if (chip_class >= GFX9) {
+         /* Match this:
+          * ..: [gfxhub] VMC page fault (src_id:0 ring:158 vm_id:2 pas_id:0)
+          * ..:   at page 0x0000000219f8f000 from 27
+          * ..: VM_L2_PROTECTION_FAULT_STATUS:0x0020113C
+          */
+         header_line = "VMC page fault";
+         addr_line_prefix = "   at page";
+         addr_line_format = "%" PRIx64;
+      } else {
+         header_line = "GPU fault detected:";
+         addr_line_prefix = "VM_CONTEXT1_PROTECTION_FAULT_ADDR";
+         addr_line_format = "%" PRIX64;
+      }
+
+      switch (progress) {
+      case 0:
+         if (strstr(msg, header_line))
+            progress = 1;
+         break;
+      case 1:
+         msg = strstr(msg, addr_line_prefix);
+         if (msg) {
+            msg = strstr(msg, "0x");
+            if (msg) {
+               msg += 2;
+               if (sscanf(msg, addr_line_format, out_addr) == 1)
+                  fault = true;
+            }
+         }
+         progress = 0; /* the address is only looked for on the line after the header */
+         break;
+      default:
+         progress = 0;
+      }
+   }
+   pclose(p);
+
+   if (dmesg_timestamp > *old_dmesg_timestamp)
+      *old_dmesg_timestamp = dmesg_timestamp;
+
+   return fault;
 }
 
 static int compare_wave(const void *p1, const void *p2)
 {
-       struct ac_wave_info *w1 = (struct ac_wave_info *)p1;
-       struct ac_wave_info *w2 = (struct ac_wave_info *)p2;
-
-       /* Sort waves according to PC and then SE, SH, CU, etc. */
-       if (w1->pc < w2->pc)
-               return -1;
-       if (w1->pc > w2->pc)
-               return 1;
-       if (w1->se < w2->se)
-               return -1;
-       if (w1->se > w2->se)
-               return 1;
-       if (w1->sh < w2->sh)
-               return -1;
-       if (w1->sh > w2->sh)
-               return 1;
-       if (w1->cu < w2->cu)
-               return -1;
-       if (w1->cu > w2->cu)
-               return 1;
-       if (w1->simd < w2->simd)
-               return -1;
-       if (w1->simd > w2->simd)
-               return 1;
-       if (w1->wave < w2->wave)
-               return -1;
-       if (w1->wave > w2->wave)
-               return 1;
-
-       return 0;
+   struct ac_wave_info *w1 = (struct ac_wave_info *)p1;
+   struct ac_wave_info *w2 = (struct ac_wave_info *)p2;
+
+   /* Sort waves according to PC and then SE, SH, CU, etc. */
+   if (w1->pc < w2->pc)
+      return -1;
+   if (w1->pc > w2->pc)
+      return 1;
+   if (w1->se < w2->se)
+      return -1;
+   if (w1->se > w2->se)
+      return 1;
+   if (w1->sh < w2->sh)
+      return -1;
+   if (w1->sh > w2->sh)
+      return 1;
+   if (w1->cu < w2->cu)
+      return -1;
+   if (w1->cu > w2->cu)
+      return 1;
+   if (w1->simd < w2->simd)
+      return -1;
+   if (w1->simd > w2->simd)
+      return 1;
+   if (w1->wave < w2->wave)
+      return -1;
+   if (w1->wave > w2->wave)
+      return 1;
+
+   return 0;
 }
 
 /* Return wave information. "waves" should be a large enough array. */
 unsigned ac_get_wave_info(enum chip_class chip_class,
-                         struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])
+                          struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])
 {
-       char line[2000], cmd[128];
-       unsigned num_waves = 0;
-
-       sprintf(cmd, "umr -O halt_waves -wa %s", chip_class >= GFX10 ? "gfx_0.0.0" : "gfx");
-
-       FILE *p = popen(cmd, "r");
-       if (!p)
-               return 0;
-
-       if (!fgets(line, sizeof(line), p) ||
-           strncmp(line, "SE", 2) != 0) {
-               pclose(p);
-               return 0;
-       }
-
-       while (fgets(line, sizeof(line), p)) {
-               struct ac_wave_info *w;
-               uint32_t pc_hi, pc_lo, exec_hi, exec_lo;
-
-               assert(num_waves < AC_MAX_WAVES_PER_CHIP);
-               w = &waves[num_waves];
-
-               if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x",
-                          &w->se, &w->sh, &w->cu, &w->simd, &w->wave,
-                          &w->status, &pc_hi, &pc_lo, &w->inst_dw0,
-                          &w->inst_dw1, &exec_hi, &exec_lo) == 12) {
-                       w->pc = ((uint64_t)pc_hi << 32) | pc_lo;
-                       w->exec = ((uint64_t)exec_hi << 32) | exec_lo;
-                       w->matched = false;
-                       num_waves++;
-               }
-       }
-
-       qsort(waves, num_waves, sizeof(struct ac_wave_info), compare_wave);
-
-       pclose(p);
-       return num_waves;
+   char line[2000], cmd[128];
+   unsigned num_waves = 0;
+
+   sprintf(cmd, "umr -O halt_waves -wa %s", chip_class >= GFX10 ? "gfx_0.0.0" : "gfx");
+
+   FILE *p = popen(cmd, "r");
+   if (!p)
+      return 0;
+
+   if (!fgets(line, sizeof(line), p) || strncmp(line, "SE", 2) != 0) {
+      pclose(p);
+      return 0;
+   }
+
+   while (fgets(line, sizeof(line), p)) {
+      struct ac_wave_info *w;
+      uint32_t pc_hi, pc_lo, exec_hi, exec_lo;
+
+      assert(num_waves < AC_MAX_WAVES_PER_CHIP);
+      w = &waves[num_waves];
+
+      if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x", &w->se, &w->sh, &w->cu, &w->simd,
+                 &w->wave, &w->status, &pc_hi, &pc_lo, &w->inst_dw0, &w->inst_dw1, &exec_hi,
+                 &exec_lo) == 12) {
+         w->pc = ((uint64_t)pc_hi << 32) | pc_lo;
+         w->exec = ((uint64_t)exec_hi << 32) | exec_lo;
+         w->matched = false;
+         num_waves++;
+      }
+   }
+
+   qsort(waves, num_waves, sizeof(struct ac_wave_info), compare_wave);
+
+   pclose(p);
+   return num_waves;
 }
index e66abb940c300f12a216eb67a73f7e87f3c9e3ce..72441f7d6ccb15b80d4da768705f766197cdd6ea 100644 (file)
 #ifndef AC_DEBUG_H
 #define AC_DEBUG_H
 
+#include "amd_family.h"
+
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
-#include <stdbool.h>
-
-#include "amd_family.h"
 
-#define AC_ENCODE_TRACE_POINT(id)       (0xcafe0000 | ((id) & 0xffff))
-#define AC_IS_TRACE_POINT(x)            (((x) & 0xcafe0000) == 0xcafe0000)
-#define AC_GET_TRACE_POINT_ID(x)        ((x) & 0xffff)
+#define AC_ENCODE_TRACE_POINT(id) (0xcafe0000 | ((id)&0xffff))
+#define AC_IS_TRACE_POINT(x)      (((x)&0xcafe0000) == 0xcafe0000)
+#define AC_GET_TRACE_POINT_ID(x)  ((x)&0xffff)
 
 #define AC_MAX_WAVES_PER_CHIP (64 * 40)
 
@@ -41,36 +41,36 @@ extern "C" {
 #endif
 
 struct ac_wave_info {
-       unsigned se; /* shader engine */
-       unsigned sh; /* shader array */
-       unsigned cu; /* compute unit */
-       unsigned simd;
-       unsigned wave;
-       uint32_t status;
-       uint64_t pc; /* program counter */
-       uint32_t inst_dw0;
-       uint32_t inst_dw1;
-       uint64_t exec;
-       bool matched; /* whether the wave is used by a currently-bound shader */
+   unsigned se; /* shader engine */
+   unsigned sh; /* shader array */
+   unsigned cu; /* compute unit */
+   unsigned simd; /* SIMD index; presumably within the CU -- TODO confirm */
+   unsigned wave; /* wave index; presumably within the SIMD -- TODO confirm */
+   uint32_t status; /* read as hex by ac_get_wave_info; presumably the wave status register */
+   uint64_t pc; /* program counter; ac_get_wave_info joins it from a high and a low dword */
+   uint32_t inst_dw0; /* presumably the first instruction dword at pc -- TODO confirm */
+   uint32_t inst_dw1; /* presumably the second instruction dword at pc -- TODO confirm */
+   uint64_t exec; /* joined from a high and a low dword; presumably the EXEC mask */
+   bool matched; /* whether the wave is used by a currently-bound shader; starts out false */
 };
 
 typedef void *(*ac_debug_addr_callback)(void *data, uint64_t addr);
 
 const char *ac_get_register_name(enum chip_class chip_class, unsigned offset);
-void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset,
-                uint32_t value, uint32_t field_mask);
+void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset, uint32_t value,
+                 uint32_t field_mask);
 void ac_parse_ib_chunk(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids,
-                      unsigned trace_id_count, enum chip_class chip_class,
-                      ac_debug_addr_callback addr_callback, void *addr_callback_data);
-void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids,
-                unsigned trace_id_count, const char *name, enum chip_class chip_class,
-                ac_debug_addr_callback addr_callback, void *addr_callback_data);
+                       unsigned trace_id_count, enum chip_class chip_class,
+                       ac_debug_addr_callback addr_callback, void *addr_callback_data);
+void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids, unsigned trace_id_count,
+                 const char *name, enum chip_class chip_class, ac_debug_addr_callback addr_callback,
+                 void *addr_callback_data);
 
-bool ac_vm_fault_occured(enum chip_class chip_class,
-                        uint64_t *old_dmesg_timestamp, uint64_t *out_addr);
+bool ac_vm_fault_occured(enum chip_class chip_class, uint64_t *old_dmesg_timestamp,
+                         uint64_t *out_addr);
 
 unsigned ac_get_wave_info(enum chip_class chip_class,
-                         struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]);
+                          struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]);
 
 #ifdef __cplusplus
 }
index b97ce8154e0203c6ddd9e44a6af63fb68e3919a2..ac8018c0b39ea69c42257f706535e4ad17933a04 100644 (file)
 #ifndef AC_EXP_PARAM_H
 #define AC_EXP_PARAM_H
 
-enum {
-       /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
-       AC_EXP_PARAM_OFFSET_0 = 0,
-       AC_EXP_PARAM_OFFSET_31 = 31,
-       /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */
-       AC_EXP_PARAM_DEFAULT_VAL_0000 = 64,
-       AC_EXP_PARAM_DEFAULT_VAL_0001,
-       AC_EXP_PARAM_DEFAULT_VAL_1110,
-       AC_EXP_PARAM_DEFAULT_VAL_1111,
-       AC_EXP_PARAM_UNDEFINED = 255,
+enum
+{
+   /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
+   AC_EXP_PARAM_OFFSET_0 = 0, /* lowest OFFSET value */
+   AC_EXP_PARAM_OFFSET_31 = 31, /* highest OFFSET value; 1..30 have no named constant here */
+   /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */
+   AC_EXP_PARAM_DEFAULT_VAL_0000 = 64, /* DEFAULT_VAL constants start above the OFFSET range */
+   AC_EXP_PARAM_DEFAULT_VAL_0001, /* implicitly 65 */
+   AC_EXP_PARAM_DEFAULT_VAL_1110, /* implicitly 66 */
+   AC_EXP_PARAM_DEFAULT_VAL_1111, /* implicitly 67 */
+   AC_EXP_PARAM_UNDEFINED = 255, /* every constant in this enum is at most 255 */
 };
 
 #endif
index e6ed816f74c4f58fb5a5427ce36f577bbb7f48fd..770737a7ed42ff74025f81b1331a7f81efdd730c 100644 (file)
  */
 
 #include "ac_gpu_info.h"
+
 #include "addrlib/src/amdgpu_asic_addr.h"
+#include "drm-uapi/amdgpu_drm.h"
 #include "sid.h"
-
 #include "util/macros.h"
 #include "util/u_math.h"
 
+#include <amdgpu.h>
 #include <stdio.h>
-
 #include <xf86drm.h>
-#include "drm-uapi/amdgpu_drm.h"
 
-#include <amdgpu.h>
-
-#define CIK_TILE_MODE_COLOR_2D                 14
-
-#define CIK__GB_TILE_MODE__PIPE_CONFIG(x)        (((x) >> 6) & 0x1f)
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P2               0
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16          4
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16         5
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32         6
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32         7
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16    8
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16    9
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16    10
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16   11
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16   12
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32   13
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32   14
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16   16
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16  17
+#define CIK_TILE_MODE_COLOR_2D 14
+
+#define CIK__GB_TILE_MODE__PIPE_CONFIG(x)           (((x) >> 6) & 0x1f)
+#define CIK__PIPE_CONFIG__ADDR_SURF_P2              0
+#define CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16         4
+#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16        5
+#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32        6
+#define CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32        7
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16   8
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16   9
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16   10
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16  11
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16  12
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32  13
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32  14
+#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16  16
+#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16 17
 
 static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
 {
@@ -61,12 +59,12 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
 
    switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) {
    case CIK__PIPE_CONFIG__ADDR_SURF_P2:
-       return 2;
+      return 2;
    case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32:
    case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32:
-       return 4;
+      return 4;
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16:
@@ -74,1239 +72,1162 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32:
-       return 8;
+      return 8;
    case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16:
-       return 16;
+      return 16;
    default:
-       fprintf(stderr, "Invalid GFX7 pipe configuration, assuming P2\n");
-       assert(!"this should never occur");
-       return 2;
+      fprintf(stderr, "Invalid GFX7 pipe configuration, assuming P2\n");
+      assert(!"this should never occur");
+      return 2;
    }
 }
 
 static bool has_syncobj(int fd)
 {
-       uint64_t value;
-       if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &value))
-               return false;
-       return value ? true : false;
+   uint64_t value;
+   if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &value))
+      return false;
+   return value ? true : false;
 }
 
 static bool has_timeline_syncobj(int fd)
 {
-       uint64_t value;
-       if (drmGetCap(fd, DRM_CAP_SYNCOBJ_TIMELINE, &value))
-               return false;
-       return value ? true : false;
+   uint64_t value;
+   if (drmGetCap(fd, DRM_CAP_SYNCOBJ_TIMELINE, &value))
+      return false;
+   return value ? true : false;
 }
 
 static uint64_t fix_vram_size(uint64_t size)
 {
-       /* The VRAM size is underreported, so we need to fix it, because
-        * it's used to compute the number of memory modules for harvesting.
-        */
-       return align64(size, 256*1024*1024);
+   /* The VRAM size is underreported, so we need to fix it, because
+    * it's used to compute the number of memory modules for harvesting.
+    */
+   return align64(size, 256 * 1024 * 1024);
 }
 
-static uint32_t
-get_l2_cache_size(enum radeon_family family)
+static uint32_t get_l2_cache_size(enum radeon_family family)
 {
-       switch (family) {
-       case CHIP_KABINI:
-       case CHIP_STONEY:
-               return 128 * 1024;
-       case CHIP_OLAND:
-       case CHIP_HAINAN:
-       case CHIP_ICELAND:
-               return 256 * 1024;
-       case CHIP_PITCAIRN:
-       case CHIP_VERDE:
-       case CHIP_BONAIRE:
-       case CHIP_KAVERI:
-       case CHIP_POLARIS12:
-       case CHIP_CARRIZO:
-               return 512 * 1024;
-       case CHIP_TAHITI:
-       case CHIP_TONGA:
-               return 768 * 1024;
-               break;
-       case CHIP_HAWAII:
-       case CHIP_POLARIS11:
-               return 1024 * 1024;
-       case CHIP_FIJI:
-       case CHIP_POLARIS10:
-               return 2048 * 1024;
-               break;
-       default:
-               return 4096 * 1024;
-       }
+   /* Map a chip family to its L2 cache size. The table below is kept in
+    * units of 1024 and scaled once on return (presumably bytes -- TODO
+    * confirm against callers). Families not listed get 4096 * 1024.
+    */
+   unsigned size_kib;
+
+   switch (family) {
+   case CHIP_KABINI:
+   case CHIP_STONEY:
+      size_kib = 128;
+      break;
+   case CHIP_OLAND:
+   case CHIP_HAINAN:
+   case CHIP_ICELAND:
+      size_kib = 256;
+      break;
+   case CHIP_PITCAIRN:
+   case CHIP_VERDE:
+   case CHIP_BONAIRE:
+   case CHIP_KAVERI:
+   case CHIP_POLARIS12:
+   case CHIP_CARRIZO:
+      size_kib = 512;
+      break;
+   case CHIP_TAHITI:
+   case CHIP_TONGA:
+      size_kib = 768;
+      break;
+   case CHIP_HAWAII:
+   case CHIP_POLARIS11:
+      size_kib = 1024;
+      break;
+   case CHIP_FIJI:
+   case CHIP_POLARIS10:
+      size_kib = 2048;
+      break;
+   default:
+      size_kib = 4096;
+      break;
+   }
+
+   return size_kib * 1024;
 }
 
-bool ac_query_gpu_info(int fd, void *dev_p,
-                      struct radeon_info *info,
-                      struct amdgpu_gpu_info *amdinfo)
+bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
+                       struct amdgpu_gpu_info *amdinfo)
 {
-       struct drm_amdgpu_info_device device_info = {};
-       struct amdgpu_buffer_size_alignments alignment_info = {};
-       struct drm_amdgpu_info_hw_ip dma = {}, compute = {}, uvd = {};
-       struct drm_amdgpu_info_hw_ip uvd_enc = {}, vce = {}, vcn_dec = {}, vcn_jpeg = {};
-       struct drm_amdgpu_info_hw_ip vcn_enc = {}, gfx = {};
-       struct amdgpu_gds_resource_info gds = {};
-       uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0;
-       int r, i, j;
-       amdgpu_device_handle dev = dev_p;
-       drmDevicePtr devinfo;
-
-       /* Get PCI info. */
-       r = drmGetDevice2(fd, 0, &devinfo);
-       if (r) {
-               fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n");
-               return false;
-       }
-       info->pci_domain = devinfo->businfo.pci->domain;
-       info->pci_bus = devinfo->businfo.pci->bus;
-       info->pci_dev = devinfo->businfo.pci->dev;
-       info->pci_func = devinfo->businfo.pci->func;
-       drmFreeDevice(&devinfo);
-
-       assert(info->drm_major == 3);
-       info->is_amdgpu = true;
-
-       /* Query hardware and driver information. */
-       r = amdgpu_query_gpu_info(dev, amdinfo);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info),
-                             &device_info);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_info(dev_info) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_buffer_size_alignment(dev, &alignment_info);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_DMA, 0, &dma);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_GFX, 0, &gfx);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(gfx) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_COMPUTE, 0, &compute);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(compute) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD, 0, &uvd);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n");
-               return false;
-       }
-
-       if (info->drm_minor >= 17) {
-               r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD_ENC, 0, &uvd_enc);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd_enc) failed.\n");
-                       return false;
-               }
-       }
-
-       if (info->drm_minor >= 17) {
-               r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_DEC, 0, &vcn_dec);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_dec) failed.\n");
-                       return false;
-               }
-       }
-
-       if (info->drm_minor >= 17) {
-               r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_ENC, 0, &vcn_enc);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_enc) failed.\n");
-                       return false;
-               }
-       }
-
-       if (info->drm_minor >= 27) {
-               r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_JPEG, 0, &vcn_jpeg);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_jpeg) failed.\n");
-                       return false;
-               }
-       }
-
-       r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0,
-                                       &info->me_fw_version,
-                                       &info->me_fw_feature);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0,
-                                       &info->pfp_fw_version,
-                                       &info->pfp_fw_feature);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_CE, 0, 0,
-                                       &info->ce_fw_version,
-                                       &info->ce_fw_feature);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(ce) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0,
-                                       &uvd_version, &uvd_feature);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCE, 0, &vce);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0,
-                                       &vce_version, &vce_feature);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_sw_info(dev, amdgpu_sw_info_address32_hi, &info->address32_hi);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_sw_info(address32_hi) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_gds_info(dev, &gds);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_gds_info failed.\n");
-               return false;
-       }
-
-       if (info->drm_minor >= 9) {
-               struct drm_amdgpu_memory_info meminfo = {};
-
-               r = amdgpu_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_info(memory) failed.\n");
-                       return false;
-               }
-
-               /* Note: usable_heap_size values can be random and can't be relied on. */
-               info->gart_size = meminfo.gtt.total_heap_size;
-               info->vram_size = fix_vram_size(meminfo.vram.total_heap_size);
-               info->vram_vis_size = meminfo.cpu_accessible_vram.total_heap_size;
-       } else {
-               /* This is a deprecated interface, which reports usable sizes
-                * (total minus pinned), but the pinned size computation is
-                * buggy, so the values returned from these functions can be
-                * random.
-                */
-               struct amdgpu_heap_info vram, vram_vis, gtt;
-
-               r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
-                       return false;
-               }
-
-               r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM,
-                                       AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED,
-                                       &vram_vis);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram_vis) failed.\n");
-                       return false;
-               }
-
-               r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
-                       return false;
-               }
-
-               info->gart_size = gtt.heap_size;
-               info->vram_size = fix_vram_size(vram.heap_size);
-               info->vram_vis_size = vram_vis.heap_size;
-       }
-
-       /* Set chip identification. */
-       info->pci_id = amdinfo->asic_id; /* TODO: is this correct? */
-       info->pci_rev_id = amdinfo->pci_rev_id;
-       info->vce_harvest_config = amdinfo->vce_harvest_config;
-
-#define identify_chip2(asic, chipname) \
-       if (ASICREV_IS(amdinfo->chip_external_rev, asic)) { \
-               info->family = CHIP_##chipname; \
-               info->name = #chipname; \
-       }
+   struct drm_amdgpu_info_device device_info = {};
+   struct amdgpu_buffer_size_alignments alignment_info = {};
+   struct drm_amdgpu_info_hw_ip dma = {}, compute = {}, uvd = {};
+   struct drm_amdgpu_info_hw_ip uvd_enc = {}, vce = {}, vcn_dec = {}, vcn_jpeg = {};
+   struct drm_amdgpu_info_hw_ip vcn_enc = {}, gfx = {};
+   struct amdgpu_gds_resource_info gds = {};
+   uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0;
+   int r, i, j;
+   amdgpu_device_handle dev = dev_p;
+   drmDevicePtr devinfo;
+
+   /* Get PCI info. */
+   r = drmGetDevice2(fd, 0, &devinfo);
+   if (r) {
+      fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n");
+      return false;
+   }
+   info->pci_domain = devinfo->businfo.pci->domain;
+   info->pci_bus = devinfo->businfo.pci->bus;
+   info->pci_dev = devinfo->businfo.pci->dev;
+   info->pci_func = devinfo->businfo.pci->func;
+   drmFreeDevice(&devinfo);
+
+   assert(info->drm_major == 3);
+   info->is_amdgpu = true;
+
+   /* Query hardware and driver information. */
+   r = amdgpu_query_gpu_info(dev, amdinfo);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info), &device_info);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_info(dev_info) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_buffer_size_alignment(dev, &alignment_info);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_DMA, 0, &dma);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_GFX, 0, &gfx);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(gfx) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_COMPUTE, 0, &compute);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(compute) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD, 0, &uvd);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n");
+      return false;
+   }
+
+   if (info->drm_minor >= 17) {
+      r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD_ENC, 0, &uvd_enc);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd_enc) failed.\n");
+         return false;
+      }
+   }
+
+   if (info->drm_minor >= 17) {
+      r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_DEC, 0, &vcn_dec);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_dec) failed.\n");
+         return false;
+      }
+   }
+
+   if (info->drm_minor >= 17) {
+      r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_ENC, 0, &vcn_enc);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_enc) failed.\n");
+         return false;
+      }
+   }
+
+   if (info->drm_minor >= 27) {
+      r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_JPEG, 0, &vcn_jpeg);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_jpeg) failed.\n");
+         return false;
+      }
+   }
+
+   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, &info->me_fw_version,
+                                     &info->me_fw_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0, &info->pfp_fw_version,
+                                     &info->pfp_fw_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_CE, 0, 0, &info->ce_fw_version,
+                                     &info->ce_fw_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(ce) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0, &uvd_version, &uvd_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCE, 0, &vce);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0, &vce_version, &vce_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_sw_info(dev, amdgpu_sw_info_address32_hi, &info->address32_hi);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_sw_info(address32_hi) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_gds_info(dev, &gds);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_gds_info failed.\n");
+      return false;
+   }
+
+   if (info->drm_minor >= 9) {
+      struct drm_amdgpu_memory_info meminfo = {};
+
+      r = amdgpu_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_info(memory) failed.\n");
+         return false;
+      }
+
+      /* Note: usable_heap_size values can be random and can't be relied on. */
+      info->gart_size = meminfo.gtt.total_heap_size;
+      info->vram_size = fix_vram_size(meminfo.vram.total_heap_size);
+      info->vram_vis_size = meminfo.cpu_accessible_vram.total_heap_size;
+   } else {
+      /* This is a deprecated interface, which reports usable sizes
+       * (total minus pinned), but the pinned size computation is
+       * buggy, so the values returned from these functions can be
+       * random.
+       */
+      struct amdgpu_heap_info vram, vram_vis, gtt;
+
+      r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
+         return false;
+      }
+
+      r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED,
+                                 &vram_vis);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram_vis) failed.\n");
+         return false;
+      }
+
+      r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
+         return false;
+      }
+
+      info->gart_size = gtt.heap_size;
+      info->vram_size = fix_vram_size(vram.heap_size);
+      info->vram_vis_size = vram_vis.heap_size;
+   }
+
+   /* Set chip identification. */
+   info->pci_id = amdinfo->asic_id; /* TODO: is this correct? */
+   info->pci_rev_id = amdinfo->pci_rev_id;
+   info->vce_harvest_config = amdinfo->vce_harvest_config;
+
+#define identify_chip2(asic, chipname)                                                             \
+   if (ASICREV_IS(amdinfo->chip_external_rev, asic)) {                                             \
+      info->family = CHIP_##chipname;                                                              \
+      info->name = #chipname;                                                                      \
+   }
 #define identify_chip(chipname) identify_chip2(chipname, chipname)
 
-       switch (amdinfo->family_id) {
-       case FAMILY_SI:
-               identify_chip(TAHITI);
-               identify_chip(PITCAIRN);
-               identify_chip2(CAPEVERDE, VERDE);
-               identify_chip(OLAND);
-               identify_chip(HAINAN);
-               break;
-       case FAMILY_CI:
-               identify_chip(BONAIRE);
-               identify_chip(HAWAII);
-               break;
-       case FAMILY_KV:
-               identify_chip2(SPECTRE, KAVERI);
-               identify_chip2(SPOOKY, KAVERI);
-               identify_chip2(KALINDI, KABINI);
-               identify_chip2(GODAVARI, KABINI);
-               break;
-       case FAMILY_VI:
-               identify_chip(ICELAND);
-               identify_chip(TONGA);
-               identify_chip(FIJI);
-               identify_chip(POLARIS10);
-               identify_chip(POLARIS11);
-               identify_chip(POLARIS12);
-               identify_chip(VEGAM);
-               break;
-       case FAMILY_CZ:
-               identify_chip(CARRIZO);
-               identify_chip(STONEY);
-               break;
-       case FAMILY_AI:
-               identify_chip(VEGA10);
-               identify_chip(VEGA12);
-               identify_chip(VEGA20);
-               identify_chip(ARCTURUS);
-               break;
-       case FAMILY_RV:
-               identify_chip(RAVEN);
-               identify_chip(RAVEN2);
-               identify_chip(RENOIR);
-               break;
-       case FAMILY_NV:
-               identify_chip(NAVI10);
-               identify_chip(NAVI12);
-               identify_chip(NAVI14);
-               identify_chip(SIENNA_CICHLID);
-               identify_chip(NAVY_FLOUNDER);
-               break;
-       }
-
-       if (!info->name) {
-               fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n",
-                       amdinfo->family_id, amdinfo->chip_external_rev);
-               return false;
-       }
-
-       if (info->family >= CHIP_SIENNA_CICHLID)
-               info->chip_class = GFX10_3;
-       else if (info->family >= CHIP_NAVI10)
-               info->chip_class = GFX10;
-       else if (info->family >= CHIP_VEGA10)
-               info->chip_class = GFX9;
-       else if (info->family >= CHIP_TONGA)
-               info->chip_class = GFX8;
-       else if (info->family >= CHIP_BONAIRE)
-               info->chip_class = GFX7;
-       else if (info->family >= CHIP_TAHITI)
-               info->chip_class = GFX6;
-       else {
-               fprintf(stderr, "amdgpu: Unknown family.\n");
-               return false;
-       }
-
-       info->family_id = amdinfo->family_id;
-       info->chip_external_rev = amdinfo->chip_external_rev;
-       info->marketing_name = amdgpu_get_marketing_name(dev);
-       info->is_pro_graphics = info->marketing_name &&
-                               (!strcmp(info->marketing_name, "Pro") ||
-                                !strcmp(info->marketing_name, "PRO") ||
-                                !strcmp(info->marketing_name, "Frontier"));
-
-       /* Set which chips have dedicated VRAM. */
-       info->has_dedicated_vram =
-               !(amdinfo->ids_flags & AMDGPU_IDS_FLAGS_FUSION);
-
-       /* The kernel can split large buffers in VRAM but not in GTT, so large
-        * allocations can fail or cause buffer movement failures in the kernel.
-        */
-       if (info->has_dedicated_vram)
-               info->max_alloc_size = info->vram_size * 0.8;
-       else
-               info->max_alloc_size = info->gart_size * 0.7;
-
-       info->vram_type = amdinfo->vram_type;
-       info->vram_bit_width = amdinfo->vram_bit_width;
-       info->ce_ram_size = amdinfo->ce_ram_size;
-
-       info->l2_cache_size = get_l2_cache_size(info->family);
-       info->l1_cache_size = 16384;
-
-       /* Set which chips have uncached device memory. */
-       info->has_l2_uncached = info->chip_class >= GFX9;
-
-       /* Set hardware information. */
-       info->gds_size = gds.gds_total_size;
-       info->gds_gfx_partition_size = gds.gds_gfx_partition_size;
-       /* convert the shader/memory clocks from KHz to MHz */
-       info->max_shader_clock = amdinfo->max_engine_clk / 1000;
-       info->max_memory_clock = amdinfo->max_memory_clk / 1000;
-       info->num_tcc_blocks = device_info.num_tcc_blocks;
-       info->max_se = amdinfo->num_shader_engines;
-       info->max_sh_per_se = amdinfo->num_shader_arrays_per_engine;
-       info->has_hw_decode =
-               (uvd.available_rings != 0) || (vcn_dec.available_rings != 0) ||
-               (vcn_jpeg.available_rings != 0);
-       info->uvd_fw_version =
-               uvd.available_rings ? uvd_version : 0;
-       info->vce_fw_version =
-               vce.available_rings ? vce_version : 0;
-       info->uvd_enc_supported =
-               uvd_enc.available_rings ? true : false;
-       info->has_userptr = true;
-       info->has_syncobj = has_syncobj(fd);
-       info->has_timeline_syncobj = has_timeline_syncobj(fd);
-       info->has_syncobj_wait_for_submit = info->has_syncobj && info->drm_minor >= 20;
-       info->has_fence_to_handle = info->has_syncobj && info->drm_minor >= 21;
-       info->has_ctx_priority = info->drm_minor >= 22;
-       info->has_local_buffers = info->drm_minor >= 20;
-       info->kernel_flushes_hdp_before_ib = true;
-       info->htile_cmask_support_1d_tiling = true;
-       info->si_TA_CS_BC_BASE_ADDR_allowed = true;
-       info->has_bo_metadata = true;
-       info->has_gpu_reset_status_query = true;
-       info->has_eqaa_surface_allocator = true;
-       info->has_format_bc1_through_bc7 = true;
-       /* DRM 3.1.0 doesn't flush TC for GFX8 correctly. */
-       info->kernel_flushes_tc_l2_after_ib = info->chip_class != GFX8 ||
-                                             info->drm_minor >= 2;
-       info->has_indirect_compute_dispatch = true;
-       /* GFX6 doesn't support unaligned loads. */
-       info->has_unaligned_shader_loads = info->chip_class != GFX6;
-       /* Disable sparse mappings on GFX6 due to VM faults in CP DMA. Enable them once
-        * these faults are mitigated in software.
-        */
-       info->has_sparse_vm_mappings = info->chip_class >= GFX7 && info->drm_minor >= 13;
-       info->has_2d_tiling = true;
-       info->has_read_registers_query = true;
-       info->has_scheduled_fence_dependency = info->drm_minor >= 28;
-       info->mid_command_buffer_preemption_enabled =
-               amdinfo->ids_flags & AMDGPU_IDS_FLAGS_PREEMPTION;
-
-       info->pa_sc_tile_steering_override = device_info.pa_sc_tile_steering_override;
-       info->num_render_backends = amdinfo->rb_pipes;
-       /* The value returned by the kernel driver was wrong. */
-       if (info->family == CHIP_KAVERI)
-               info->num_render_backends = 2;
-
-       info->clock_crystal_freq = amdinfo->gpu_counter_freq;
-       if (!info->clock_crystal_freq) {
-               fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n");
-               info->clock_crystal_freq = 1;
-       }
-       if (info->chip_class >= GFX10) {
-               info->tcc_cache_line_size = 128;
-
-               if (info->drm_minor >= 35) {
-                       info->tcc_harvested = device_info.tcc_disabled_mask != 0;
-               } else {
-                       /* This is a hack, but it's all we can do without a kernel upgrade. */
-                       info->tcc_harvested =
-                               (info->vram_size / info->num_tcc_blocks) != 512*1024*1024;
-               }
-       } else {
-               info->tcc_cache_line_size = 64;
-       }
-       info->gb_addr_config = amdinfo->gb_addr_cfg;
-       if (info->chip_class >= GFX9) {
-               info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(amdinfo->gb_addr_cfg);
-               info->pipe_interleave_bytes =
-                       256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(amdinfo->gb_addr_cfg);
-       } else {
-               info->num_tile_pipes = cik_get_num_tile_pipes(amdinfo);
-               info->pipe_interleave_bytes =
-                       256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(amdinfo->gb_addr_cfg);
-       }
-       info->r600_has_virtual_memory = true;
-
-       /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
-        * 16KB makes some SIMDs unoccupied).
-        *
-        * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
-        */
-       info->lds_size_per_workgroup = info->chip_class >= GFX10 ? 128 * 1024 : 64 * 1024;
-       info->lds_granularity = info->chip_class >= GFX7 ? 128 * 4 : 64 * 4;
-
-       assert(util_is_power_of_two_or_zero(dma.available_rings + 1));
-       assert(util_is_power_of_two_or_zero(compute.available_rings + 1));
-
-       info->has_graphics = gfx.available_rings > 0;
-       info->num_rings[RING_GFX] = util_bitcount(gfx.available_rings);
-       info->num_rings[RING_COMPUTE] = util_bitcount(compute.available_rings);
-       info->num_rings[RING_DMA] = util_bitcount(dma.available_rings);
-       info->num_rings[RING_UVD] = util_bitcount(uvd.available_rings);
-       info->num_rings[RING_VCE] = util_bitcount(vce.available_rings);
-       info->num_rings[RING_UVD_ENC] = util_bitcount(uvd_enc.available_rings);
-       info->num_rings[RING_VCN_DEC] = util_bitcount(vcn_dec.available_rings);
-       info->num_rings[RING_VCN_ENC] = util_bitcount(vcn_enc.available_rings);
-       info->num_rings[RING_VCN_JPEG] = util_bitcount(vcn_jpeg.available_rings);
-
-       /* This is "align_mask" copied from the kernel, maximums of all IP versions. */
-       info->ib_pad_dw_mask[RING_GFX] = 0xff;
-       info->ib_pad_dw_mask[RING_COMPUTE] = 0xff;
-       info->ib_pad_dw_mask[RING_DMA] = 0xf;
-       info->ib_pad_dw_mask[RING_UVD] = 0xf;
-       info->ib_pad_dw_mask[RING_VCE] = 0x3f;
-       info->ib_pad_dw_mask[RING_UVD_ENC] = 0x3f;
-       info->ib_pad_dw_mask[RING_VCN_DEC] = 0xf;
-       info->ib_pad_dw_mask[RING_VCN_ENC] = 0x3f;
-       info->ib_pad_dw_mask[RING_VCN_JPEG] = 0xf;
-
-       /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs
-        * on GFX6. Some CLEAR_STATE cause asic hang on radeon kernel, etc.
-        * SPI_VS_OUT_CONFIG. So only enable GFX7 CLEAR_STATE on amdgpu kernel.
-        */
-       info->has_clear_state = info->chip_class >= GFX7;
-
-       info->has_distributed_tess = info->chip_class >= GFX10 ||
-                                    (info->chip_class >= GFX8 && info->max_se >= 2);
-
-       info->has_dcc_constant_encode = info->family == CHIP_RAVEN2 ||
-                                       info->family == CHIP_RENOIR ||
-                                       info->chip_class >= GFX10;
-
-       info->has_rbplus = info->family == CHIP_STONEY ||
-                          info->chip_class >= GFX9;
-
-       /* Some chips have RB+ registers, but don't support RB+. Those must
-        * always disable it.
-        */
-       info->rbplus_allowed = info->has_rbplus &&
-                              (info->family == CHIP_STONEY ||
-                               info->family == CHIP_VEGA12 ||
-                               info->family == CHIP_RAVEN ||
-                               info->family == CHIP_RAVEN2 ||
-                               info->family == CHIP_RENOIR ||
-                               info->chip_class >= GFX10_3);
-
-       info->has_out_of_order_rast = info->chip_class >= GFX8 &&
-                                     info->chip_class <= GFX9 &&
-                                     info->max_se >= 2;
-
-       /* Whether chips support double rate packed math instructions. */
-       info->has_packed_math_16bit = info->chip_class >= GFX9;
-
-       /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */
-       info->has_load_ctx_reg_pkt = info->chip_class >= GFX9 ||
-                                    (info->chip_class >= GFX8 &&
-                                     info->me_fw_feature >= 41);
-
-       info->cpdma_prefetch_writes_memory = info->chip_class <= GFX8;
-
-       info->has_gfx9_scissor_bug = info->family == CHIP_VEGA10 ||
-                                    info->family == CHIP_RAVEN;
-
-       info->has_tc_compat_zrange_bug = info->chip_class >= GFX8 &&
-                                        info->chip_class <= GFX9;
-
-       info->has_msaa_sample_loc_bug = (info->family >= CHIP_POLARIS10 &&
-                                        info->family <= CHIP_POLARIS12) ||
-                                       info->family == CHIP_VEGA10 ||
-                                       info->family == CHIP_RAVEN;
-
-       info->has_ls_vgpr_init_bug = info->family == CHIP_VEGA10 ||
-                                    info->family == CHIP_RAVEN;
-
-       /* Get the number of good compute units. */
-       info->num_good_compute_units = 0;
-       for (i = 0; i < info->max_se; i++) {
-               for (j = 0; j < info->max_sh_per_se; j++) {
-                       /*
-                        * The cu bitmap in amd gpu info structure is
-                        * 4x4 size array, and it's usually suitable for Vega
-                        * ASICs which has 4*2 SE/SH layout.
-                        * But for Arcturus, SE/SH layout is changed to 8*1.
-                        * To mostly reduce the impact, we make it compatible
-                        * with current bitmap array as below:
-                        *    SE4,SH0 --> cu_bitmap[0][1]
-                        *    SE5,SH0 --> cu_bitmap[1][1]
-                        *    SE6,SH0 --> cu_bitmap[2][1]
-                        *    SE7,SH0 --> cu_bitmap[3][1]
-                        */
-                       info->cu_mask[i%4][j+i/4] = amdinfo->cu_bitmap[i%4][j+i/4];
-                       info->num_good_compute_units +=
-                               util_bitcount(info->cu_mask[i][j]);
-               }
-       }
-
-       /* On GFX10, only whole WGPs (in units of 2 CUs) can be disabled,
-        * and max - min <= 2.
-        */
-       unsigned cu_group = info->chip_class >= GFX10 ? 2 : 1;
-       info->max_good_cu_per_sa = DIV_ROUND_UP(info->num_good_compute_units,
-                                               (info->max_se * info->max_sh_per_se * cu_group)) * cu_group;
-       info->min_good_cu_per_sa = (info->num_good_compute_units /
-                                   (info->max_se * info->max_sh_per_se * cu_group)) * cu_group;
-
-       memcpy(info->si_tile_mode_array, amdinfo->gb_tile_mode,
-               sizeof(amdinfo->gb_tile_mode));
-       info->enabled_rb_mask = amdinfo->enabled_rb_pipes_mask;
-
-       memcpy(info->cik_macrotile_mode_array, amdinfo->gb_macro_tile_mode,
-               sizeof(amdinfo->gb_macro_tile_mode));
-
-       info->pte_fragment_size = alignment_info.size_local;
-       info->gart_page_size = alignment_info.size_remote;
-
-       if (info->chip_class == GFX6)
-               info->gfx_ib_pad_with_type2 = true;
-
-       unsigned ib_align = 0;
-       ib_align = MAX2(ib_align, gfx.ib_start_alignment);
-       ib_align = MAX2(ib_align, gfx.ib_size_alignment);
-       ib_align = MAX2(ib_align, compute.ib_start_alignment);
-       ib_align = MAX2(ib_align, compute.ib_size_alignment);
-       ib_align = MAX2(ib_align, dma.ib_start_alignment);
-       ib_align = MAX2(ib_align, dma.ib_size_alignment);
-       ib_align = MAX2(ib_align, uvd.ib_start_alignment);
-       ib_align = MAX2(ib_align, uvd.ib_size_alignment);
-       ib_align = MAX2(ib_align, uvd_enc.ib_start_alignment);
-       ib_align = MAX2(ib_align, uvd_enc.ib_size_alignment);
-       ib_align = MAX2(ib_align, vce.ib_start_alignment);
-       ib_align = MAX2(ib_align, vce.ib_size_alignment);
-       ib_align = MAX2(ib_align, vcn_dec.ib_start_alignment);
-       ib_align = MAX2(ib_align, vcn_dec.ib_size_alignment);
-       ib_align = MAX2(ib_align, vcn_enc.ib_start_alignment);
-       ib_align = MAX2(ib_align, vcn_enc.ib_size_alignment);
-       ib_align = MAX2(ib_align, vcn_jpeg.ib_start_alignment);
-       ib_align = MAX2(ib_align, vcn_jpeg.ib_size_alignment);
-       /* GFX10 and maybe GFX9 need this alignment for cache coherency. */
-       if (info->chip_class >= GFX9)
-               ib_align = MAX2(ib_align, info->tcc_cache_line_size);
-       /* The kernel pads gfx and compute IBs to 256 dwords since:
-        *   66f3b2d527154bd258a57c8815004b5964aa1cf5
-        * Do the same.
-        */
-       ib_align = MAX2(ib_align, 1024);
-       info->ib_alignment = ib_align;
-
-        if ((info->drm_minor >= 31 &&
-             (info->family == CHIP_RAVEN ||
-              info->family == CHIP_RAVEN2 ||
-              info->family == CHIP_RENOIR)) ||
-            (info->drm_minor >= 34 &&
-             (info->family == CHIP_NAVI12 ||
-              info->family == CHIP_NAVI14)) ||
-            info->chip_class >= GFX10_3) {
-               if (info->num_render_backends == 1)
-                       info->use_display_dcc_unaligned = true;
-               else
-                       info->use_display_dcc_with_retile_blit = true;
-       }
-
-       info->has_gds_ordered_append = info->chip_class >= GFX7 &&
-                                      info->drm_minor >= 29;
-
-       if (info->chip_class >= GFX9) {
-               unsigned pc_lines = 0;
-
-               switch (info->family) {
-               case CHIP_VEGA10:
-               case CHIP_VEGA12:
-               case CHIP_VEGA20:
-                       pc_lines = 2048;
-                       break;
-               case CHIP_RAVEN:
-               case CHIP_RAVEN2:
-               case CHIP_RENOIR:
-               case CHIP_NAVI10:
-               case CHIP_NAVI12:
-               case CHIP_SIENNA_CICHLID:
-               case CHIP_NAVY_FLOUNDER:
-                       pc_lines = 1024;
-                       break;
-               case CHIP_NAVI14:
-                       pc_lines = 512;
-                       break;
-               case CHIP_ARCTURUS:
-                       break;
-               default:
-                       assert(0);
-               }
-
-               info->pc_lines = pc_lines;
-
-               if (info->chip_class >= GFX10) {
-                       info->pbb_max_alloc_count = pc_lines / 3;
-               } else {
-                       info->pbb_max_alloc_count =
-                               MIN2(128, pc_lines / (4 * info->max_se));
-               }
-       }
-
-       /* The number of SDPs is the same as the number of TCCs for now. */
-       if (info->chip_class >= GFX10)
-               info->num_sdp_interfaces = device_info.num_tcc_blocks;
-
-       if (info->chip_class >= GFX10_3)
-               info->max_wave64_per_simd = 16;
-       else if (info->chip_class == GFX10)
-               info->max_wave64_per_simd = 20;
-       else if (info->family >= CHIP_POLARIS10 && info->family <= CHIP_VEGAM)
-               info->max_wave64_per_simd = 8;
-       else
-               info->max_wave64_per_simd = 10;
-
-       if (info->chip_class >= GFX10) {
-               info->num_physical_sgprs_per_simd = 128 * info->max_wave64_per_simd;
-               info->min_sgpr_alloc = 128;
-               info->sgpr_alloc_granularity = 128;
-               /* Don't use late alloc on small chips. */
-               info->use_late_alloc = info->num_render_backends > 4;
-       } else if (info->chip_class >= GFX8) {
-               info->num_physical_sgprs_per_simd = 800;
-               info->min_sgpr_alloc = 16;
-               info->sgpr_alloc_granularity = 16;
-               info->use_late_alloc = true;
-       } else {
-               info->num_physical_sgprs_per_simd = 512;
-               info->min_sgpr_alloc = 8;
-               info->sgpr_alloc_granularity = 8;
-               /* Potential hang on Kabini: */
-               info->use_late_alloc = info->family != CHIP_KABINI;
-       }
-
-       info->max_sgpr_alloc = info->family == CHIP_TONGA ||
-                              info->family == CHIP_ICELAND ? 96 : 104;
-
-       info->min_wave64_vgpr_alloc = 4;
-       info->max_vgpr_alloc = 256;
-       info->wave64_vgpr_alloc_granularity = 4;
-
-       info->num_physical_wave64_vgprs_per_simd = info->chip_class >= GFX10 ? 512 : 256;
-       info->num_simd_per_compute_unit = info->chip_class >= GFX10 ? 2 : 4;
-
-       return true;
+   switch (amdinfo->family_id) {
+   case FAMILY_SI:
+      identify_chip(TAHITI);
+      identify_chip(PITCAIRN);
+      identify_chip2(CAPEVERDE, VERDE);
+      identify_chip(OLAND);
+      identify_chip(HAINAN);
+      break;
+   case FAMILY_CI:
+      identify_chip(BONAIRE);
+      identify_chip(HAWAII);
+      break;
+   case FAMILY_KV:
+      identify_chip2(SPECTRE, KAVERI);
+      identify_chip2(SPOOKY, KAVERI);
+      identify_chip2(KALINDI, KABINI);
+      identify_chip2(GODAVARI, KABINI);
+      break;
+   case FAMILY_VI:
+      identify_chip(ICELAND);
+      identify_chip(TONGA);
+      identify_chip(FIJI);
+      identify_chip(POLARIS10);
+      identify_chip(POLARIS11);
+      identify_chip(POLARIS12);
+      identify_chip(VEGAM);
+      break;
+   case FAMILY_CZ:
+      identify_chip(CARRIZO);
+      identify_chip(STONEY);
+      break;
+   case FAMILY_AI:
+      identify_chip(VEGA10);
+      identify_chip(VEGA12);
+      identify_chip(VEGA20);
+      identify_chip(ARCTURUS);
+      break;
+   case FAMILY_RV:
+      identify_chip(RAVEN);
+      identify_chip(RAVEN2);
+      identify_chip(RENOIR);
+      break;
+   case FAMILY_NV:
+      identify_chip(NAVI10);
+      identify_chip(NAVI12);
+      identify_chip(NAVI14);
+      identify_chip(SIENNA_CICHLID);
+      identify_chip(NAVY_FLOUNDER);
+      break;
+   }
+
+   if (!info->name) {
+      fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n",
+              amdinfo->family_id, amdinfo->chip_external_rev);
+      return false;
+   }
+
+   if (info->family >= CHIP_SIENNA_CICHLID)
+      info->chip_class = GFX10_3;
+   else if (info->family >= CHIP_NAVI10)
+      info->chip_class = GFX10;
+   else if (info->family >= CHIP_VEGA10)
+      info->chip_class = GFX9;
+   else if (info->family >= CHIP_TONGA)
+      info->chip_class = GFX8;
+   else if (info->family >= CHIP_BONAIRE)
+      info->chip_class = GFX7;
+   else if (info->family >= CHIP_TAHITI)
+      info->chip_class = GFX6;
+   else {
+      fprintf(stderr, "amdgpu: Unknown family.\n");
+      return false;
+   }
+
+   info->family_id = amdinfo->family_id;
+   info->chip_external_rev = amdinfo->chip_external_rev;
+   info->marketing_name = amdgpu_get_marketing_name(dev);
+   info->is_pro_graphics = info->marketing_name && (!strcmp(info->marketing_name, "Pro") ||
+                                                    !strcmp(info->marketing_name, "PRO") ||
+                                                    !strcmp(info->marketing_name, "Frontier"));
+
+   /* Set which chips have dedicated VRAM. */
+   info->has_dedicated_vram = !(amdinfo->ids_flags & AMDGPU_IDS_FLAGS_FUSION);
+
+   /* The kernel can split large buffers in VRAM but not in GTT, so large
+    * allocations can fail or cause buffer movement failures in the kernel.
+    */
+   if (info->has_dedicated_vram)
+      info->max_alloc_size = info->vram_size * 0.8;
+   else
+      info->max_alloc_size = info->gart_size * 0.7;
+
+   info->vram_type = amdinfo->vram_type;
+   info->vram_bit_width = amdinfo->vram_bit_width;
+   info->ce_ram_size = amdinfo->ce_ram_size;
+
+   info->l2_cache_size = get_l2_cache_size(info->family);
+   info->l1_cache_size = 16384;
+
+   /* Set which chips have uncached device memory. */
+   info->has_l2_uncached = info->chip_class >= GFX9;
+
+   /* Set hardware information. */
+   info->gds_size = gds.gds_total_size;
+   info->gds_gfx_partition_size = gds.gds_gfx_partition_size;
+   /* convert the shader/memory clocks from KHz to MHz */
+   info->max_shader_clock = amdinfo->max_engine_clk / 1000;
+   info->max_memory_clock = amdinfo->max_memory_clk / 1000;
+   info->num_tcc_blocks = device_info.num_tcc_blocks;
+   info->max_se = amdinfo->num_shader_engines;
+   info->max_sh_per_se = amdinfo->num_shader_arrays_per_engine;
+   info->has_hw_decode = (uvd.available_rings != 0) || (vcn_dec.available_rings != 0) ||
+                         (vcn_jpeg.available_rings != 0);
+   info->uvd_fw_version = uvd.available_rings ? uvd_version : 0;
+   info->vce_fw_version = vce.available_rings ? vce_version : 0;
+   info->uvd_enc_supported = uvd_enc.available_rings ? true : false;
+   info->has_userptr = true;
+   info->has_syncobj = has_syncobj(fd);
+   info->has_timeline_syncobj = has_timeline_syncobj(fd);
+   info->has_syncobj_wait_for_submit = info->has_syncobj && info->drm_minor >= 20;
+   info->has_fence_to_handle = info->has_syncobj && info->drm_minor >= 21;
+   info->has_ctx_priority = info->drm_minor >= 22;
+   info->has_local_buffers = info->drm_minor >= 20;
+   info->kernel_flushes_hdp_before_ib = true;
+   info->htile_cmask_support_1d_tiling = true;
+   info->si_TA_CS_BC_BASE_ADDR_allowed = true;
+   info->has_bo_metadata = true;
+   info->has_gpu_reset_status_query = true;
+   info->has_eqaa_surface_allocator = true;
+   info->has_format_bc1_through_bc7 = true;
+   /* DRM 3.1.0 doesn't flush TC for GFX8 correctly. */
+   info->kernel_flushes_tc_l2_after_ib = info->chip_class != GFX8 || info->drm_minor >= 2;
+   info->has_indirect_compute_dispatch = true;
+   /* GFX6 doesn't support unaligned loads. */
+   info->has_unaligned_shader_loads = info->chip_class != GFX6;
+   /* Disable sparse mappings on GFX6 due to VM faults in CP DMA. Enable them once
+    * these faults are mitigated in software.
+    */
+   info->has_sparse_vm_mappings = info->chip_class >= GFX7 && info->drm_minor >= 13;
+   info->has_2d_tiling = true;
+   info->has_read_registers_query = true;
+   info->has_scheduled_fence_dependency = info->drm_minor >= 28;
+   info->mid_command_buffer_preemption_enabled = amdinfo->ids_flags & AMDGPU_IDS_FLAGS_PREEMPTION;
+
+   info->pa_sc_tile_steering_override = device_info.pa_sc_tile_steering_override;
+   info->num_render_backends = amdinfo->rb_pipes;
+   /* The value returned by the kernel driver was wrong. */
+   if (info->family == CHIP_KAVERI)
+      info->num_render_backends = 2;
+
+   info->clock_crystal_freq = amdinfo->gpu_counter_freq;
+   if (!info->clock_crystal_freq) {
+      fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n");
+      info->clock_crystal_freq = 1;
+   }
+   if (info->chip_class >= GFX10) {
+      info->tcc_cache_line_size = 128;
+
+      if (info->drm_minor >= 35) {
+         info->tcc_harvested = device_info.tcc_disabled_mask != 0;
+      } else {
+         /* This is a hack, but it's all we can do without a kernel upgrade. */
+         info->tcc_harvested = (info->vram_size / info->num_tcc_blocks) != 512 * 1024 * 1024;
+      }
+   } else {
+      info->tcc_cache_line_size = 64;
+   }
+   info->gb_addr_config = amdinfo->gb_addr_cfg;
+   if (info->chip_class >= GFX9) {
+      info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(amdinfo->gb_addr_cfg);
+      info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(amdinfo->gb_addr_cfg);
+   } else {
+      info->num_tile_pipes = cik_get_num_tile_pipes(amdinfo);
+      info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(amdinfo->gb_addr_cfg);
+   }
+   info->r600_has_virtual_memory = true;
+
+   /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
+    * 16KB makes some SIMDs unoccupied).
+    *
+    * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
+    */
+   info->lds_size_per_workgroup = info->chip_class >= GFX10 ? 128 * 1024 : 64 * 1024;
+   info->lds_granularity = info->chip_class >= GFX7 ? 128 * 4 : 64 * 4;
+
+   assert(util_is_power_of_two_or_zero(dma.available_rings + 1));
+   assert(util_is_power_of_two_or_zero(compute.available_rings + 1));
+
+   info->has_graphics = gfx.available_rings > 0;
+   info->num_rings[RING_GFX] = util_bitcount(gfx.available_rings);
+   info->num_rings[RING_COMPUTE] = util_bitcount(compute.available_rings);
+   info->num_rings[RING_DMA] = util_bitcount(dma.available_rings);
+   info->num_rings[RING_UVD] = util_bitcount(uvd.available_rings);
+   info->num_rings[RING_VCE] = util_bitcount(vce.available_rings);
+   info->num_rings[RING_UVD_ENC] = util_bitcount(uvd_enc.available_rings);
+   info->num_rings[RING_VCN_DEC] = util_bitcount(vcn_dec.available_rings);
+   info->num_rings[RING_VCN_ENC] = util_bitcount(vcn_enc.available_rings);
+   info->num_rings[RING_VCN_JPEG] = util_bitcount(vcn_jpeg.available_rings);
+
+   /* This is "align_mask" copied from the kernel, maximums of all IP versions. */
+   info->ib_pad_dw_mask[RING_GFX] = 0xff;
+   info->ib_pad_dw_mask[RING_COMPUTE] = 0xff;
+   info->ib_pad_dw_mask[RING_DMA] = 0xf;
+   info->ib_pad_dw_mask[RING_UVD] = 0xf;
+   info->ib_pad_dw_mask[RING_VCE] = 0x3f;
+   info->ib_pad_dw_mask[RING_UVD_ENC] = 0x3f;
+   info->ib_pad_dw_mask[RING_VCN_DEC] = 0xf;
+   info->ib_pad_dw_mask[RING_VCN_ENC] = 0x3f;
+   info->ib_pad_dw_mask[RING_VCN_JPEG] = 0xf;
+
+   /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs
+    * on GFX6. Some CLEAR_STATE cause asic hang on radeon kernel, etc.
+    * SPI_VS_OUT_CONFIG. So only enable GFX7 CLEAR_STATE on amdgpu kernel.
+    */
+   info->has_clear_state = info->chip_class >= GFX7;
+
+   info->has_distributed_tess =
+      info->chip_class >= GFX10 || (info->chip_class >= GFX8 && info->max_se >= 2);
+
+   info->has_dcc_constant_encode =
+      info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->chip_class >= GFX10;
+
+   info->has_rbplus = info->family == CHIP_STONEY || info->chip_class >= GFX9;
+
+   /* Some chips have RB+ registers, but don't support RB+. Those must
+    * always disable it.
+    */
+   info->rbplus_allowed =
+      info->has_rbplus &&
+      (info->family == CHIP_STONEY || info->family == CHIP_VEGA12 || info->family == CHIP_RAVEN ||
+       info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->chip_class >= GFX10_3);
+
+   info->has_out_of_order_rast =
+      info->chip_class >= GFX8 && info->chip_class <= GFX9 && info->max_se >= 2;
+
+   /* Whether chips support double rate packed math instructions. */
+   info->has_packed_math_16bit = info->chip_class >= GFX9;
+
+   /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */
+   info->has_load_ctx_reg_pkt =
+      info->chip_class >= GFX9 || (info->chip_class >= GFX8 && info->me_fw_feature >= 41);
+
+   info->cpdma_prefetch_writes_memory = info->chip_class <= GFX8;
+
+   info->has_gfx9_scissor_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;
+
+   info->has_tc_compat_zrange_bug = info->chip_class >= GFX8 && info->chip_class <= GFX9;
+
+   info->has_msaa_sample_loc_bug =
+      (info->family >= CHIP_POLARIS10 && info->family <= CHIP_POLARIS12) ||
+      info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;
+
+   info->has_ls_vgpr_init_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;
+
+   /* Get the number of good compute units. */
+   info->num_good_compute_units = 0;
+   for (i = 0; i < info->max_se; i++) {
+      for (j = 0; j < info->max_sh_per_se; j++) {
+         /*
+          * The cu bitmap in amd gpu info structure is
+          * 4x4 size array, and it's usually suitable for Vega
+          * ASICs which has 4*2 SE/SH layout.
+          * But for Arcturus, SE/SH layout is changed to 8*1.
+          * To mostly reduce the impact, we make it compatible
+          * with current bitmap array as below:
+          *    SE4,SH0 --> cu_bitmap[0][1]
+          *    SE5,SH0 --> cu_bitmap[1][1]
+          *    SE6,SH0 --> cu_bitmap[2][1]
+          *    SE7,SH0 --> cu_bitmap[3][1]
+          */
+         info->cu_mask[i % 4][j + i / 4] = amdinfo->cu_bitmap[i % 4][j + i / 4];
+         info->num_good_compute_units += util_bitcount(info->cu_mask[i][j]);
+      }
+   }
+
+   /* On GFX10, only whole WGPs (in units of 2 CUs) can be disabled,
+    * and max - min <= 2.
+    */
+   unsigned cu_group = info->chip_class >= GFX10 ? 2 : 1;
+   info->max_good_cu_per_sa =
+      DIV_ROUND_UP(info->num_good_compute_units, (info->max_se * info->max_sh_per_se * cu_group)) *
+      cu_group;
+   info->min_good_cu_per_sa =
+      (info->num_good_compute_units / (info->max_se * info->max_sh_per_se * cu_group)) * cu_group;
+
+   memcpy(info->si_tile_mode_array, amdinfo->gb_tile_mode, sizeof(amdinfo->gb_tile_mode));
+   info->enabled_rb_mask = amdinfo->enabled_rb_pipes_mask;
+
+   memcpy(info->cik_macrotile_mode_array, amdinfo->gb_macro_tile_mode,
+          sizeof(amdinfo->gb_macro_tile_mode));
+
+   info->pte_fragment_size = alignment_info.size_local;
+   info->gart_page_size = alignment_info.size_remote;
+
+   if (info->chip_class == GFX6)
+      info->gfx_ib_pad_with_type2 = true;
+
+   unsigned ib_align = 0;
+   ib_align = MAX2(ib_align, gfx.ib_start_alignment);
+   ib_align = MAX2(ib_align, gfx.ib_size_alignment);
+   ib_align = MAX2(ib_align, compute.ib_start_alignment);
+   ib_align = MAX2(ib_align, compute.ib_size_alignment);
+   ib_align = MAX2(ib_align, dma.ib_start_alignment);
+   ib_align = MAX2(ib_align, dma.ib_size_alignment);
+   ib_align = MAX2(ib_align, uvd.ib_start_alignment);
+   ib_align = MAX2(ib_align, uvd.ib_size_alignment);
+   ib_align = MAX2(ib_align, uvd_enc.ib_start_alignment);
+   ib_align = MAX2(ib_align, uvd_enc.ib_size_alignment);
+   ib_align = MAX2(ib_align, vce.ib_start_alignment);
+   ib_align = MAX2(ib_align, vce.ib_size_alignment);
+   ib_align = MAX2(ib_align, vcn_dec.ib_start_alignment);
+   ib_align = MAX2(ib_align, vcn_dec.ib_size_alignment);
+   ib_align = MAX2(ib_align, vcn_enc.ib_start_alignment);
+   ib_align = MAX2(ib_align, vcn_enc.ib_size_alignment);
+   ib_align = MAX2(ib_align, vcn_jpeg.ib_start_alignment);
+   ib_align = MAX2(ib_align, vcn_jpeg.ib_size_alignment);
+   /* GFX10 and maybe GFX9 need this alignment for cache coherency. */
+   if (info->chip_class >= GFX9)
+      ib_align = MAX2(ib_align, info->tcc_cache_line_size);
+   /* The kernel pads gfx and compute IBs to 256 dwords since:
+    *   66f3b2d527154bd258a57c8815004b5964aa1cf5
+    * Do the same.
+    */
+   ib_align = MAX2(ib_align, 1024);
+   info->ib_alignment = ib_align;
+
+   if ((info->drm_minor >= 31 && (info->family == CHIP_RAVEN || info->family == CHIP_RAVEN2 ||
+                                  info->family == CHIP_RENOIR)) ||
+       (info->drm_minor >= 34 && (info->family == CHIP_NAVI12 || info->family == CHIP_NAVI14)) ||
+       info->chip_class >= GFX10_3) {
+      if (info->num_render_backends == 1)
+         info->use_display_dcc_unaligned = true;
+      else
+         info->use_display_dcc_with_retile_blit = true;
+   }
+
+   info->has_gds_ordered_append = info->chip_class >= GFX7 && info->drm_minor >= 29;
+
+   if (info->chip_class >= GFX9) {
+      unsigned pc_lines = 0;
+
+      switch (info->family) {
+      case CHIP_VEGA10:
+      case CHIP_VEGA12:
+      case CHIP_VEGA20:
+         pc_lines = 2048;
+         break;
+      case CHIP_RAVEN:
+      case CHIP_RAVEN2:
+      case CHIP_RENOIR:
+      case CHIP_NAVI10:
+      case CHIP_NAVI12:
+      case CHIP_SIENNA_CICHLID:
+      case CHIP_NAVY_FLOUNDER:
+         pc_lines = 1024;
+         break;
+      case CHIP_NAVI14:
+         pc_lines = 512;
+         break;
+      case CHIP_ARCTURUS:
+         break;
+      default:
+         assert(0);
+      }
+
+      info->pc_lines = pc_lines;
+
+      if (info->chip_class >= GFX10) {
+         info->pbb_max_alloc_count = pc_lines / 3;
+      } else {
+         info->pbb_max_alloc_count = MIN2(128, pc_lines / (4 * info->max_se));
+      }
+   }
+
+   /* The number of SDPs is the same as the number of TCCs for now. */
+   if (info->chip_class >= GFX10)
+      info->num_sdp_interfaces = device_info.num_tcc_blocks;
+
+   if (info->chip_class >= GFX10_3)
+      info->max_wave64_per_simd = 16;
+   else if (info->chip_class == GFX10)
+      info->max_wave64_per_simd = 20;
+   else if (info->family >= CHIP_POLARIS10 && info->family <= CHIP_VEGAM)
+      info->max_wave64_per_simd = 8;
+   else
+      info->max_wave64_per_simd = 10;
+
+   if (info->chip_class >= GFX10) {
+      info->num_physical_sgprs_per_simd = 128 * info->max_wave64_per_simd;
+      info->min_sgpr_alloc = 128;
+      info->sgpr_alloc_granularity = 128;
+      /* Don't use late alloc on small chips. */
+      info->use_late_alloc = info->num_render_backends > 4;
+   } else if (info->chip_class >= GFX8) {
+      info->num_physical_sgprs_per_simd = 800;
+      info->min_sgpr_alloc = 16;
+      info->sgpr_alloc_granularity = 16;
+      info->use_late_alloc = true;
+   } else {
+      info->num_physical_sgprs_per_simd = 512;
+      info->min_sgpr_alloc = 8;
+      info->sgpr_alloc_granularity = 8;
+      /* Potential hang on Kabini: */
+      info->use_late_alloc = info->family != CHIP_KABINI;
+   }
+
+   info->max_sgpr_alloc = info->family == CHIP_TONGA || info->family == CHIP_ICELAND ? 96 : 104;
+
+   info->min_wave64_vgpr_alloc = 4;
+   info->max_vgpr_alloc = 256;
+   info->wave64_vgpr_alloc_granularity = 4;
+
+   info->num_physical_wave64_vgprs_per_simd = info->chip_class >= GFX10 ? 512 : 256;
+   info->num_simd_per_compute_unit = info->chip_class >= GFX10 ? 2 : 4;
+
+   return true;
 }
 
 void ac_compute_driver_uuid(char *uuid, size_t size)
 {
-       char amd_uuid[] = "AMD-MESA-DRV";
+   char amd_uuid[] = "AMD-MESA-DRV"; /* driver id string; sizeof() counts its NUL */
 
-       assert(size >= sizeof(amd_uuid));
+   assert(size >= sizeof(amd_uuid)); /* debug-only check that the id fits in uuid */
 
-       memset(uuid, 0, size);
-       strncpy(uuid, amd_uuid, size);
+   memset(uuid, 0, size); /* clear all size bytes first */
+   strncpy(uuid, amd_uuid, size); /* writes at most size bytes, zero-padding after the id */
 }
 
 void ac_compute_device_uuid(struct radeon_info *info, char *uuid, size_t size)
 {
-       uint32_t *uint_uuid = (uint32_t*)uuid;
-
-       assert(size >= sizeof(uint32_t)*4);
-
-       /**
-        * Use the device info directly instead of using a sha1. GL/VK UUIDs
-        * are 16 byte vs 20 byte for sha1, and the truncation that would be
-        * required would get rid of part of the little entropy we have.
-        * */
-       memset(uuid, 0, size);
-       uint_uuid[0] = info->pci_domain;
-       uint_uuid[1] = info->pci_bus;
-       uint_uuid[2] = info->pci_dev;
-       uint_uuid[3] = info->pci_func;
+   uint32_t *uint_uuid = (uint32_t *)uuid; /* NOTE(review): presumes uuid is 4-byte aligned; confirm */
+
+   assert(size >= sizeof(uint32_t) * 4); /* four 32-bit words are stored below */
+
+   /**
+    * Build the UUID from the PCI location (domain, bus, device, function)
+    * instead of a sha1: GL/VK UUIDs are 16 bytes vs 20 bytes for sha1, and
+    * truncating the hash would discard part of the little entropy we have.
+    */
+   memset(uuid, 0, size);
+   uint_uuid[0] = info->pci_domain;
+   uint_uuid[1] = info->pci_bus;
+   uint_uuid[2] = info->pci_dev;
+   uint_uuid[3] = info->pci_func;
 }
 
 void ac_print_gpu_info(struct radeon_info *info)
 {
-       printf("Device info:\n");
-       printf("    pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n",
-              info->pci_domain, info->pci_bus,
-              info->pci_dev, info->pci_func);
-
-       printf("    name = %s\n", info->name);
-       printf("    marketing_name = %s\n", info->marketing_name);
-       printf("    is_pro_graphics = %u\n", info->is_pro_graphics);
-       printf("    pci_id = 0x%x\n", info->pci_id);
-       printf("    pci_rev_id = 0x%x\n", info->pci_rev_id);
-       printf("    family = %i\n", info->family);
-       printf("    chip_class = %i\n", info->chip_class);
-       printf("    family_id = %i\n", info->family_id);
-       printf("    chip_external_rev = %i\n", info->chip_external_rev);
-       printf("    clock_crystal_freq = %i\n", info->clock_crystal_freq);
-
-       printf("Features:\n");
-       printf("    has_graphics = %i\n", info->has_graphics);
-       printf("    num_rings[RING_GFX] = %i\n", info->num_rings[RING_GFX]);
-       printf("    num_rings[RING_DMA] = %i\n", info->num_rings[RING_DMA]);
-       printf("    num_rings[RING_COMPUTE] = %u\n", info->num_rings[RING_COMPUTE]);
-       printf("    num_rings[RING_UVD] = %i\n", info->num_rings[RING_UVD]);
-       printf("    num_rings[RING_VCE] = %i\n", info->num_rings[RING_VCE]);
-       printf("    num_rings[RING_UVD_ENC] = %i\n", info->num_rings[RING_UVD_ENC]);
-       printf("    num_rings[RING_VCN_DEC] = %i\n", info->num_rings[RING_VCN_DEC]);
-       printf("    num_rings[RING_VCN_ENC] = %i\n", info->num_rings[RING_VCN_ENC]);
-       printf("    num_rings[RING_VCN_JPEG] = %i\n", info->num_rings[RING_VCN_JPEG]);
-       printf("    has_clear_state = %u\n", info->has_clear_state);
-       printf("    has_distributed_tess = %u\n", info->has_distributed_tess);
-       printf("    has_dcc_constant_encode = %u\n", info->has_dcc_constant_encode);
-       printf("    has_rbplus = %u\n", info->has_rbplus);
-       printf("    rbplus_allowed = %u\n", info->rbplus_allowed);
-       printf("    has_load_ctx_reg_pkt = %u\n", info->has_load_ctx_reg_pkt);
-       printf("    has_out_of_order_rast = %u\n", info->has_out_of_order_rast);
-       printf("    cpdma_prefetch_writes_memory = %u\n", info->cpdma_prefetch_writes_memory);
-       printf("    has_gfx9_scissor_bug = %i\n", info->has_gfx9_scissor_bug);
-       printf("    has_tc_compat_zrange_bug = %i\n", info->has_tc_compat_zrange_bug);
-       printf("    has_msaa_sample_loc_bug = %i\n", info->has_msaa_sample_loc_bug);
-       printf("    has_ls_vgpr_init_bug = %i\n", info->has_ls_vgpr_init_bug);
-
-       printf("Display features:\n");
-       printf("    use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned);
-       printf("    use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit);
-
-       printf("Memory info:\n");
-       printf("    pte_fragment_size = %u\n", info->pte_fragment_size);
-       printf("    gart_page_size = %u\n", info->gart_page_size);
-       printf("    gart_size = %i MB\n", (int)DIV_ROUND_UP(info->gart_size, 1024*1024));
-       printf("    vram_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_size, 1024*1024));
-       printf("    vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_vis_size, 1024*1024));
-       printf("    vram_type = %i\n", info->vram_type);
-       printf("    vram_bit_width = %i\n", info->vram_bit_width);
-       printf("    gds_size = %u kB\n", info->gds_size / 1024);
-       printf("    gds_gfx_partition_size = %u kB\n", info->gds_gfx_partition_size / 1024);
-       printf("    max_alloc_size = %i MB\n",
-              (int)DIV_ROUND_UP(info->max_alloc_size, 1024*1024));
-       printf("    min_alloc_size = %u\n", info->min_alloc_size);
-       printf("    address32_hi = %u\n", info->address32_hi);
-       printf("    has_dedicated_vram = %u\n", info->has_dedicated_vram);
-       printf("    num_sdp_interfaces = %u\n", info->num_sdp_interfaces);
-       printf("    num_tcc_blocks = %i\n", info->num_tcc_blocks);
-       printf("    tcc_cache_line_size = %u\n", info->tcc_cache_line_size);
-       printf("    tcc_harvested = %u\n", info->tcc_harvested);
-       printf("    pc_lines = %u\n", info->pc_lines);
-       printf("    lds_size_per_workgroup = %u\n", info->lds_size_per_workgroup);
-       printf("    lds_granularity = %i\n", info->lds_granularity);
-       printf("    max_memory_clock = %i\n", info->max_memory_clock);
-       printf("    ce_ram_size = %i\n", info->ce_ram_size);
-       printf("    l1_cache_size = %i\n", info->l1_cache_size);
-       printf("    l2_cache_size = %i\n", info->l2_cache_size);
-
-       printf("CP info:\n");
-       printf("    gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2);
-       printf("    ib_alignment = %u\n", info->ib_alignment);
-       printf("    me_fw_version = %i\n", info->me_fw_version);
-       printf("    me_fw_feature = %i\n", info->me_fw_feature);
-       printf("    pfp_fw_version = %i\n", info->pfp_fw_version);
-       printf("    pfp_fw_feature = %i\n", info->pfp_fw_feature);
-       printf("    ce_fw_version = %i\n", info->ce_fw_version);
-       printf("    ce_fw_feature = %i\n", info->ce_fw_feature);
-
-       printf("Multimedia info:\n");
-       printf("    has_hw_decode = %u\n", info->has_hw_decode);
-       printf("    uvd_enc_supported = %u\n", info->uvd_enc_supported);
-       printf("    uvd_fw_version = %u\n", info->uvd_fw_version);
-       printf("    vce_fw_version = %u\n", info->vce_fw_version);
-       printf("    vce_harvest_config = %i\n", info->vce_harvest_config);
-
-       printf("Kernel & winsys capabilities:\n");
-       printf("    drm = %i.%i.%i\n", info->drm_major,
-              info->drm_minor, info->drm_patchlevel);
-       printf("    has_userptr = %i\n", info->has_userptr);
-       printf("    has_syncobj = %u\n", info->has_syncobj);
-       printf("    has_syncobj_wait_for_submit = %u\n", info->has_syncobj_wait_for_submit);
-       printf("    has_timeline_syncobj = %u\n", info->has_timeline_syncobj);
-       printf("    has_fence_to_handle = %u\n", info->has_fence_to_handle);
-       printf("    has_ctx_priority = %u\n", info->has_ctx_priority);
-       printf("    has_local_buffers = %u\n", info->has_local_buffers);
-       printf("    kernel_flushes_hdp_before_ib = %u\n", info->kernel_flushes_hdp_before_ib);
-       printf("    htile_cmask_support_1d_tiling = %u\n", info->htile_cmask_support_1d_tiling);
-       printf("    si_TA_CS_BC_BASE_ADDR_allowed = %u\n", info->si_TA_CS_BC_BASE_ADDR_allowed);
-       printf("    has_bo_metadata = %u\n", info->has_bo_metadata);
-       printf("    has_gpu_reset_status_query = %u\n", info->has_gpu_reset_status_query);
-       printf("    has_eqaa_surface_allocator = %u\n", info->has_eqaa_surface_allocator);
-       printf("    has_format_bc1_through_bc7 = %u\n", info->has_format_bc1_through_bc7);
-       printf("    kernel_flushes_tc_l2_after_ib = %u\n", info->kernel_flushes_tc_l2_after_ib);
-       printf("    has_indirect_compute_dispatch = %u\n", info->has_indirect_compute_dispatch);
-       printf("    has_unaligned_shader_loads = %u\n", info->has_unaligned_shader_loads);
-       printf("    has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings);
-       printf("    has_2d_tiling = %u\n", info->has_2d_tiling);
-       printf("    has_read_registers_query = %u\n", info->has_read_registers_query);
-       printf("    has_gds_ordered_append = %u\n", info->has_gds_ordered_append);
-       printf("    has_scheduled_fence_dependency = %u\n", info->has_scheduled_fence_dependency);
-       printf("    mid_command_buffer_preemption_enabled = %u\n", info->mid_command_buffer_preemption_enabled);
-
-       printf("Shader core info:\n");
-       printf("    max_shader_clock = %i\n", info->max_shader_clock);
-       printf("    num_good_compute_units = %i\n", info->num_good_compute_units);
-       printf("    max_good_cu_per_sa = %i\n", info->max_good_cu_per_sa);
-       printf("    min_good_cu_per_sa = %i\n", info->min_good_cu_per_sa);
-       printf("    max_se = %i\n", info->max_se);
-       printf("    max_sh_per_se = %i\n", info->max_sh_per_se);
-       printf("    max_wave64_per_simd = %i\n", info->max_wave64_per_simd);
-       printf("    num_physical_sgprs_per_simd = %i\n", info->num_physical_sgprs_per_simd);
-       printf("    num_physical_wave64_vgprs_per_simd = %i\n", info->num_physical_wave64_vgprs_per_simd);
-       printf("    num_simd_per_compute_unit = %i\n", info->num_simd_per_compute_unit);
-       printf("    min_sgpr_alloc = %i\n", info->min_sgpr_alloc);
-       printf("    max_sgpr_alloc = %i\n", info->max_sgpr_alloc);
-       printf("    sgpr_alloc_granularity = %i\n", info->sgpr_alloc_granularity);
-       printf("    min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc);
-       printf("    max_vgpr_alloc = %i\n", info->max_vgpr_alloc);
-       printf("    wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity);
-
-       printf("Render backend info:\n");
-       printf("    pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override);
-       printf("    num_render_backends = %i\n", info->num_render_backends);
-       printf("    num_tile_pipes = %i\n", info->num_tile_pipes);
-       printf("    pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes);
-       printf("    enabled_rb_mask = 0x%x\n", info->enabled_rb_mask);
-       printf("    max_alignment = %u\n", (unsigned)info->max_alignment);
-       printf("    pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count);
-
-       printf("GB_ADDR_CONFIG: 0x%08x\n", info->gb_addr_config);
-       if (info->chip_class >= GFX10) {
-               printf("    num_pipes = %u\n",
-                      1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
-               printf("    pipe_interleave_size = %u\n",
-                      256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
-               printf("    max_compressed_frags = %u\n",
-                      1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
-       } else if (info->chip_class == GFX9) {
-               printf("    num_pipes = %u\n",
-                      1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
-               printf("    pipe_interleave_size = %u\n",
-                      256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
-               printf("    max_compressed_frags = %u\n",
-                      1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
-               printf("    bank_interleave_size = %u\n",
-                      1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
-               printf("    num_banks = %u\n",
-                      1 << G_0098F8_NUM_BANKS(info->gb_addr_config));
-               printf("    shader_engine_tile_size = %u\n",
-                      16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
-               printf("    num_shader_engines = %u\n",
-                      1 << G_0098F8_NUM_SHADER_ENGINES_GFX9(info->gb_addr_config));
-               printf("    num_gpus = %u (raw)\n",
-                      G_0098F8_NUM_GPUS_GFX9(info->gb_addr_config));
-               printf("    multi_gpu_tile_size = %u (raw)\n",
-                      G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
-               printf("    num_rb_per_se = %u\n",
-                      1 << G_0098F8_NUM_RB_PER_SE(info->gb_addr_config));
-               printf("    row_size = %u\n",
-                      1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
-               printf("    num_lower_pipes = %u (raw)\n",
-                      G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
-               printf("    se_enable = %u (raw)\n",
-                      G_0098F8_SE_ENABLE(info->gb_addr_config));
-       } else {
-               printf("    num_pipes = %u\n",
-                      1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
-               printf("    pipe_interleave_size = %u\n",
-                      256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config));
-               printf("    bank_interleave_size = %u\n",
-                      1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
-               printf("    num_shader_engines = %u\n",
-                      1 << G_0098F8_NUM_SHADER_ENGINES_GFX6(info->gb_addr_config));
-               printf("    shader_engine_tile_size = %u\n",
-                      16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
-               printf("    num_gpus = %u (raw)\n",
-                      G_0098F8_NUM_GPUS_GFX6(info->gb_addr_config));
-               printf("    multi_gpu_tile_size = %u (raw)\n",
-                      G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
-               printf("    row_size = %u\n",
-                      1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
-               printf("    num_lower_pipes = %u (raw)\n",
-                      G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
-       }
+   printf("Device info:\n"); /* the whole dump is written to stdout via printf */
+   printf("    pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n", info->pci_domain, info->pci_bus,
+          info->pci_dev, info->pci_func);
+
+   printf("    name = %s\n", info->name);
+   printf("    marketing_name = %s\n", info->marketing_name);
+   printf("    is_pro_graphics = %u\n", info->is_pro_graphics);
+   printf("    pci_id = 0x%x\n", info->pci_id);
+   printf("    pci_rev_id = 0x%x\n", info->pci_rev_id);
+   printf("    family = %i\n", info->family);
+   printf("    chip_class = %i\n", info->chip_class);
+   printf("    family_id = %i\n", info->family_id);
+   printf("    chip_external_rev = %i\n", info->chip_external_rev);
+   printf("    clock_crystal_freq = %i\n", info->clock_crystal_freq);
+
+   printf("Features:\n");
+   printf("    has_graphics = %i\n", info->has_graphics);
+   printf("    num_rings[RING_GFX] = %i\n", info->num_rings[RING_GFX]);
+   printf("    num_rings[RING_DMA] = %i\n", info->num_rings[RING_DMA]);
+   printf("    num_rings[RING_COMPUTE] = %u\n", info->num_rings[RING_COMPUTE]);
+   printf("    num_rings[RING_UVD] = %i\n", info->num_rings[RING_UVD]);
+   printf("    num_rings[RING_VCE] = %i\n", info->num_rings[RING_VCE]);
+   printf("    num_rings[RING_UVD_ENC] = %i\n", info->num_rings[RING_UVD_ENC]);
+   printf("    num_rings[RING_VCN_DEC] = %i\n", info->num_rings[RING_VCN_DEC]);
+   printf("    num_rings[RING_VCN_ENC] = %i\n", info->num_rings[RING_VCN_ENC]);
+   printf("    num_rings[RING_VCN_JPEG] = %i\n", info->num_rings[RING_VCN_JPEG]);
+   printf("    has_clear_state = %u\n", info->has_clear_state);
+   printf("    has_distributed_tess = %u\n", info->has_distributed_tess);
+   printf("    has_dcc_constant_encode = %u\n", info->has_dcc_constant_encode);
+   printf("    has_rbplus = %u\n", info->has_rbplus);
+   printf("    rbplus_allowed = %u\n", info->rbplus_allowed);
+   printf("    has_load_ctx_reg_pkt = %u\n", info->has_load_ctx_reg_pkt);
+   printf("    has_out_of_order_rast = %u\n", info->has_out_of_order_rast);
+   printf("    cpdma_prefetch_writes_memory = %u\n", info->cpdma_prefetch_writes_memory);
+   printf("    has_gfx9_scissor_bug = %i\n", info->has_gfx9_scissor_bug);
+   printf("    has_tc_compat_zrange_bug = %i\n", info->has_tc_compat_zrange_bug);
+   printf("    has_msaa_sample_loc_bug = %i\n", info->has_msaa_sample_loc_bug);
+   printf("    has_ls_vgpr_init_bug = %i\n", info->has_ls_vgpr_init_bug);
+
+   printf("Display features:\n");
+   printf("    use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned);
+   printf("    use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit);
+
+   printf("Memory info:\n"); /* values labelled MB are divided by 1024 * 1024, rounding up */
+   printf("    pte_fragment_size = %u\n", info->pte_fragment_size);
+   printf("    gart_page_size = %u\n", info->gart_page_size);
+   printf("    gart_size = %i MB\n", (int)DIV_ROUND_UP(info->gart_size, 1024 * 1024));
+   printf("    vram_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_size, 1024 * 1024));
+   printf("    vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_vis_size, 1024 * 1024));
+   printf("    vram_type = %i\n", info->vram_type);
+   printf("    vram_bit_width = %i\n", info->vram_bit_width);
+   printf("    gds_size = %u kB\n", info->gds_size / 1024); /* plain division: rounds down */
+   printf("    gds_gfx_partition_size = %u kB\n", info->gds_gfx_partition_size / 1024);
+   printf("    max_alloc_size = %i MB\n", (int)DIV_ROUND_UP(info->max_alloc_size, 1024 * 1024));
+   printf("    min_alloc_size = %u\n", info->min_alloc_size);
+   printf("    address32_hi = %u\n", info->address32_hi);
+   printf("    has_dedicated_vram = %u\n", info->has_dedicated_vram);
+   printf("    num_sdp_interfaces = %u\n", info->num_sdp_interfaces);
+   printf("    num_tcc_blocks = %i\n", info->num_tcc_blocks);
+   printf("    tcc_cache_line_size = %u\n", info->tcc_cache_line_size);
+   printf("    tcc_harvested = %u\n", info->tcc_harvested);
+   printf("    pc_lines = %u\n", info->pc_lines);
+   printf("    lds_size_per_workgroup = %u\n", info->lds_size_per_workgroup);
+   printf("    lds_granularity = %i\n", info->lds_granularity);
+   printf("    max_memory_clock = %i\n", info->max_memory_clock);
+   printf("    ce_ram_size = %i\n", info->ce_ram_size);
+   printf("    l1_cache_size = %i\n", info->l1_cache_size);
+   printf("    l2_cache_size = %i\n", info->l2_cache_size);
+
+   printf("CP info:\n");
+   printf("    gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2);
+   printf("    ib_alignment = %u\n", info->ib_alignment);
+   printf("    me_fw_version = %i\n", info->me_fw_version);
+   printf("    me_fw_feature = %i\n", info->me_fw_feature);
+   printf("    pfp_fw_version = %i\n", info->pfp_fw_version);
+   printf("    pfp_fw_feature = %i\n", info->pfp_fw_feature);
+   printf("    ce_fw_version = %i\n", info->ce_fw_version);
+   printf("    ce_fw_feature = %i\n", info->ce_fw_feature);
+
+   printf("Multimedia info:\n");
+   printf("    has_hw_decode = %u\n", info->has_hw_decode);
+   printf("    uvd_enc_supported = %u\n", info->uvd_enc_supported);
+   printf("    uvd_fw_version = %u\n", info->uvd_fw_version);
+   printf("    vce_fw_version = %u\n", info->vce_fw_version);
+   printf("    vce_harvest_config = %i\n", info->vce_harvest_config);
+
+   printf("Kernel & winsys capabilities:\n");
+   printf("    drm = %i.%i.%i\n", info->drm_major, info->drm_minor, info->drm_patchlevel);
+   printf("    has_userptr = %i\n", info->has_userptr);
+   printf("    has_syncobj = %u\n", info->has_syncobj);
+   printf("    has_syncobj_wait_for_submit = %u\n", info->has_syncobj_wait_for_submit);
+   printf("    has_timeline_syncobj = %u\n", info->has_timeline_syncobj);
+   printf("    has_fence_to_handle = %u\n", info->has_fence_to_handle);
+   printf("    has_ctx_priority = %u\n", info->has_ctx_priority);
+   printf("    has_local_buffers = %u\n", info->has_local_buffers);
+   printf("    kernel_flushes_hdp_before_ib = %u\n", info->kernel_flushes_hdp_before_ib);
+   printf("    htile_cmask_support_1d_tiling = %u\n", info->htile_cmask_support_1d_tiling);
+   printf("    si_TA_CS_BC_BASE_ADDR_allowed = %u\n", info->si_TA_CS_BC_BASE_ADDR_allowed);
+   printf("    has_bo_metadata = %u\n", info->has_bo_metadata);
+   printf("    has_gpu_reset_status_query = %u\n", info->has_gpu_reset_status_query);
+   printf("    has_eqaa_surface_allocator = %u\n", info->has_eqaa_surface_allocator);
+   printf("    has_format_bc1_through_bc7 = %u\n", info->has_format_bc1_through_bc7);
+   printf("    kernel_flushes_tc_l2_after_ib = %u\n", info->kernel_flushes_tc_l2_after_ib);
+   printf("    has_indirect_compute_dispatch = %u\n", info->has_indirect_compute_dispatch);
+   printf("    has_unaligned_shader_loads = %u\n", info->has_unaligned_shader_loads);
+   printf("    has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings);
+   printf("    has_2d_tiling = %u\n", info->has_2d_tiling);
+   printf("    has_read_registers_query = %u\n", info->has_read_registers_query);
+   printf("    has_gds_ordered_append = %u\n", info->has_gds_ordered_append);
+   printf("    has_scheduled_fence_dependency = %u\n", info->has_scheduled_fence_dependency);
+   printf("    mid_command_buffer_preemption_enabled = %u\n",
+          info->mid_command_buffer_preemption_enabled);
+
+   printf("Shader core info:\n");
+   printf("    max_shader_clock = %i\n", info->max_shader_clock);
+   printf("    num_good_compute_units = %i\n", info->num_good_compute_units);
+   printf("    max_good_cu_per_sa = %i\n", info->max_good_cu_per_sa);
+   printf("    min_good_cu_per_sa = %i\n", info->min_good_cu_per_sa);
+   printf("    max_se = %i\n", info->max_se);
+   printf("    max_sh_per_se = %i\n", info->max_sh_per_se);
+   printf("    max_wave64_per_simd = %i\n", info->max_wave64_per_simd);
+   printf("    num_physical_sgprs_per_simd = %i\n", info->num_physical_sgprs_per_simd);
+   printf("    num_physical_wave64_vgprs_per_simd = %i\n",
+          info->num_physical_wave64_vgprs_per_simd);
+   printf("    num_simd_per_compute_unit = %i\n", info->num_simd_per_compute_unit);
+   printf("    min_sgpr_alloc = %i\n", info->min_sgpr_alloc);
+   printf("    max_sgpr_alloc = %i\n", info->max_sgpr_alloc);
+   printf("    sgpr_alloc_granularity = %i\n", info->sgpr_alloc_granularity);
+   printf("    min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc);
+   printf("    max_vgpr_alloc = %i\n", info->max_vgpr_alloc);
+   printf("    wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity);
+
+   printf("Render backend info:\n");
+   printf("    pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override);
+   printf("    num_render_backends = %i\n", info->num_render_backends);
+   printf("    num_tile_pipes = %i\n", info->num_tile_pipes);
+   printf("    pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes);
+   printf("    enabled_rb_mask = 0x%x\n", info->enabled_rb_mask);
+   printf("    max_alignment = %u\n", (unsigned)info->max_alignment);
+   printf("    pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count);
+
+   printf("GB_ADDR_CONFIG: 0x%08x\n", info->gb_addr_config); /* fields decoded per chip_class below */
+   if (info->chip_class >= GFX10) { /* values not tagged "(raw)" are printed as constant << field */
+      printf("    num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
+      printf("    pipe_interleave_size = %u\n",
+             256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
+      printf("    max_compressed_frags = %u\n",
+             1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
+   } else if (info->chip_class == GFX9) { /* prints the same three fields as GFX10+, plus more */
+      printf("    num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
+      printf("    pipe_interleave_size = %u\n",
+             256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
+      printf("    max_compressed_frags = %u\n",
+             1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
+      printf("    bank_interleave_size = %u\n",
+             1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
+      printf("    num_banks = %u\n", 1 << G_0098F8_NUM_BANKS(info->gb_addr_config));
+      printf("    shader_engine_tile_size = %u\n",
+             16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
+      printf("    num_shader_engines = %u\n",
+             1 << G_0098F8_NUM_SHADER_ENGINES_GFX9(info->gb_addr_config));
+      printf("    num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX9(info->gb_addr_config));
+      printf("    multi_gpu_tile_size = %u (raw)\n",
+             G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
+      printf("    num_rb_per_se = %u\n", 1 << G_0098F8_NUM_RB_PER_SE(info->gb_addr_config));
+      printf("    row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
+      printf("    num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
+      printf("    se_enable = %u (raw)\n", G_0098F8_SE_ENABLE(info->gb_addr_config));
+   } else { /* chip_class < GFX9: uses the _GFX6 accessor variants */
+      printf("    num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
+      printf("    pipe_interleave_size = %u\n",
+             256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config));
+      printf("    bank_interleave_size = %u\n",
+             1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
+      printf("    num_shader_engines = %u\n",
+             1 << G_0098F8_NUM_SHADER_ENGINES_GFX6(info->gb_addr_config));
+      printf("    shader_engine_tile_size = %u\n",
+             16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
+      printf("    num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX6(info->gb_addr_config));
+      printf("    multi_gpu_tile_size = %u (raw)\n",
+             G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
+      printf("    row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
+      printf("    num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
+   }
 }
 
-int
-ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family)
+int ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family)
 {
-       if (chip_class >= GFX9)
-               return -1;
-
-       switch (family) {
-       case CHIP_OLAND:
-       case CHIP_HAINAN:
-       case CHIP_KAVERI:
-       case CHIP_KABINI:
-       case CHIP_ICELAND:
-       case CHIP_CARRIZO:
-       case CHIP_STONEY:
-               return 16;
-       case CHIP_TAHITI:
-       case CHIP_PITCAIRN:
-       case CHIP_VERDE:
-       case CHIP_BONAIRE:
-       case CHIP_HAWAII:
-       case CHIP_TONGA:
-       case CHIP_FIJI:
-       case CHIP_POLARIS10:
-       case CHIP_POLARIS11:
-       case CHIP_POLARIS12:
-       case CHIP_VEGAM:
-               return 32;
-       default:
-               unreachable("Unknown GPU");
-       }
+   if (chip_class >= GFX9) /* GFX9 and newer: family is never examined */
+      return -1;
+
+   switch (family) { /* only reached when chip_class < GFX9 */
+   case CHIP_OLAND:
+   case CHIP_HAINAN:
+   case CHIP_KAVERI:
+   case CHIP_KABINI:
+   case CHIP_ICELAND:
+   case CHIP_CARRIZO:
+   case CHIP_STONEY:
+      return 16; /* depth for the seven families listed above */
+   case CHIP_TAHITI:
+   case CHIP_PITCAIRN:
+   case CHIP_VERDE:
+   case CHIP_BONAIRE:
+   case CHIP_HAWAII:
+   case CHIP_TONGA:
+   case CHIP_FIJI:
+   case CHIP_POLARIS10:
+   case CHIP_POLARIS11:
+   case CHIP_POLARIS12:
+   case CHIP_VEGAM:
+      return 32; /* depth for the eleven families listed above */
+   default:
+      unreachable("Unknown GPU"); /* any family not listed in either group */
+   }
 }
 
-void
-ac_get_raster_config(struct radeon_info *info,
-                    uint32_t *raster_config_p,
-                    uint32_t *raster_config_1_p,
-                    uint32_t *se_tile_repeat_p)
+void ac_get_raster_config(struct radeon_info *info, uint32_t *raster_config_p,
+                          uint32_t *raster_config_1_p, uint32_t *se_tile_repeat_p) /*
+ * Selects raster_config and raster_config_1 from info->family, applies the
+ * Kaveri and Fiji overrides below, and stores both values through
+ * raster_config_p and raster_config_1_p (both are written unconditionally).
+ * se_tile_repeat_p may be NULL; when non-NULL it receives
+ * MAX2(SE width, SE height) * info->max_se.
+ */
 {
-       unsigned raster_config, raster_config_1, se_tile_repeat;
-
-       switch (info->family) {
-       /* 1 SE / 1 RB */
-       case CHIP_HAINAN:
-       case CHIP_KABINI:
-       case CHIP_STONEY:
-               raster_config = 0x00000000;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 1 SE / 4 RBs */
-       case CHIP_VERDE:
-               raster_config = 0x0000124a;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 1 SE / 2 RBs (Oland is special) */
-       case CHIP_OLAND:
-               raster_config = 0x00000082;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 1 SE / 2 RBs */
-       case CHIP_KAVERI:
-       case CHIP_ICELAND:
-       case CHIP_CARRIZO:
-               raster_config = 0x00000002;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 2 SEs / 4 RBs */
-       case CHIP_BONAIRE:
-       case CHIP_POLARIS11:
-       case CHIP_POLARIS12:
-               raster_config = 0x16000012;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 2 SEs / 8 RBs */
-       case CHIP_TAHITI:
-       case CHIP_PITCAIRN:
-               raster_config = 0x2a00126a;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 4 SEs / 8 RBs */
-       case CHIP_TONGA:
-       case CHIP_POLARIS10:
-               raster_config = 0x16000012;
-               raster_config_1 = 0x0000002a;
-               break;
-       /* 4 SEs / 16 RBs */
-       case CHIP_HAWAII:
-       case CHIP_FIJI:
-       case CHIP_VEGAM:
-               raster_config = 0x3a00161a;
-               raster_config_1 = 0x0000002e;
-               break;
-       default:
-               fprintf(stderr,
-                       "ac: Unknown GPU, using 0 for raster_config\n");
-               raster_config = 0x00000000;
-               raster_config_1 = 0x00000000;
-               break;
-       }
-
-       /* drm/radeon on Kaveri is buggy, so disable 1 RB to work around it.
-        * This decreases performance by up to 50% when the RB is the bottleneck.
-        */
-       if (info->family == CHIP_KAVERI && !info->is_amdgpu)
-               raster_config = 0x00000000;
-
-       /* Fiji: Old kernels have incorrect tiling config. This decreases
-        * RB performance by 25%. (it disables 1 RB in the second packer)
-        */
-       if (info->family == CHIP_FIJI &&
-           info->cik_macrotile_mode_array[0] == 0x000000e8) {
-               raster_config = 0x16000012;
-               raster_config_1 = 0x0000002a;
-       }
-
-       unsigned se_width = 8 << G_028350_SE_XSEL_GFX6(raster_config);
-       unsigned se_height = 8 << G_028350_SE_YSEL_GFX6(raster_config);
-
-       /* I don't know how to calculate this, though this is probably a good guess. */
-       se_tile_repeat = MAX2(se_width, se_height) * info->max_se;
-
-       *raster_config_p = raster_config;
-       *raster_config_1_p = raster_config_1;
-       if (se_tile_repeat_p)
-               *se_tile_repeat_p = se_tile_repeat;
+   unsigned raster_config, raster_config_1, se_tile_repeat;
+
+   switch (info->family) {
+   /* 1 SE / 1 RB */
+   case CHIP_HAINAN:
+   case CHIP_KABINI:
+   case CHIP_STONEY:
+      raster_config = 0x00000000;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 1 SE / 4 RBs */
+   case CHIP_VERDE:
+      raster_config = 0x0000124a;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 1 SE / 2 RBs (Oland is special) */
+   case CHIP_OLAND:
+      raster_config = 0x00000082;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 1 SE / 2 RBs */
+   case CHIP_KAVERI:
+   case CHIP_ICELAND:
+   case CHIP_CARRIZO:
+      raster_config = 0x00000002;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 2 SEs / 4 RBs */
+   case CHIP_BONAIRE:
+   case CHIP_POLARIS11:
+   case CHIP_POLARIS12:
+      raster_config = 0x16000012;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 2 SEs / 8 RBs */
+   case CHIP_TAHITI:
+   case CHIP_PITCAIRN:
+      raster_config = 0x2a00126a;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 4 SEs / 8 RBs */
+   case CHIP_TONGA:
+   case CHIP_POLARIS10:
+      raster_config = 0x16000012;
+      raster_config_1 = 0x0000002a;
+      break;
+   /* 4 SEs / 16 RBs */
+   case CHIP_HAWAII:
+   case CHIP_FIJI:
+   case CHIP_VEGAM:
+      raster_config = 0x3a00161a;
+      raster_config_1 = 0x0000002e;
+      break;
+   default: /* unlisted family: warn on stderr and fall back to 0 for both values */
+      fprintf(stderr, "ac: Unknown GPU, using 0 for raster_config\n");
+      raster_config = 0x00000000;
+      raster_config_1 = 0x00000000;
+      break;
+   }
+
+   /* drm/radeon on Kaveri is buggy, so disable 1 RB to work around it.
+    * This decreases performance by up to 50% when the RB is the bottleneck.
+    */
+   if (info->family == CHIP_KAVERI && !info->is_amdgpu)
+      raster_config = 0x00000000;
+
+   /* Fiji: Old kernels have incorrect tiling config. This decreases
+    * RB performance by 25%. (it disables 1 RB in the second packer)
+    */
+   if (info->family == CHIP_FIJI && info->cik_macrotile_mode_array[0] == 0x000000e8) {
+      raster_config = 0x16000012; /* same pair of values as the "4 SEs / 8 RBs" case */
+      raster_config_1 = 0x0000002a;
+   }
+
+   unsigned se_width = 8 << G_028350_SE_XSEL_GFX6(raster_config); /* read after the overrides */
+   unsigned se_height = 8 << G_028350_SE_YSEL_GFX6(raster_config);
+
+   /* I don't know how to calculate this, though this is probably a good guess. */
+   se_tile_repeat = MAX2(se_width, se_height) * info->max_se;
+
+   *raster_config_p = raster_config;
+   *raster_config_1_p = raster_config_1;
+   if (se_tile_repeat_p) /* optional output */
+      *se_tile_repeat_p = se_tile_repeat;
 }
 
-void
-ac_get_harvested_configs(struct radeon_info *info,
-                        unsigned raster_config,
-                        unsigned *cik_raster_config_1_p,
-                        unsigned *raster_config_se)
+void ac_get_harvested_configs(struct radeon_info *info, unsigned raster_config,
+                              unsigned *cik_raster_config_1_p, unsigned *raster_config_se) /*
+ * Derives one raster_config value per shader engine (SE) from the common
+ * raster_config and info->enabled_rb_mask, and stores them in
+ * raster_config_se[0 .. num_se - 1], where num_se = MAX2(info->max_se, 1);
+ * the caller's array must have room for that many entries.
+ * On GFX7+ with more than two SEs, the SE_PAIR_MAP field of
+ * *cik_raster_config_1_p is rewritten when one whole SE pair has no enabled RB;
+ * otherwise *cik_raster_config_1_p is left as passed in.
+ */
 {
-       unsigned sh_per_se = MAX2(info->max_sh_per_se, 1);
-       unsigned num_se = MAX2(info->max_se, 1);
-       unsigned rb_mask = info->enabled_rb_mask;
-       unsigned num_rb = MIN2(info->num_render_backends, 16);
-       unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
-       unsigned rb_per_se = num_rb / num_se;
-       unsigned se_mask[4];
-       unsigned se;
-
-       se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask;
-       se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask;
-       se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
-       se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;
-
-       assert(num_se == 1 || num_se == 2 || num_se == 4);
-       assert(sh_per_se == 1 || sh_per_se == 2);
-       assert(rb_per_pkr == 1 || rb_per_pkr == 2);
-
-
-       if (info->chip_class >= GFX7) {
-               unsigned raster_config_1 = *cik_raster_config_1_p;
-               if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) ||
-                                    (!se_mask[2] && !se_mask[3]))) {
-                       raster_config_1 &= C_028354_SE_PAIR_MAP;
-
-                       if (!se_mask[0] && !se_mask[1]) {
-                               raster_config_1 |=
-                                       S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
-                       } else {
-                               raster_config_1 |=
-                                       S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
-                       }
-                       *cik_raster_config_1_p = raster_config_1;
-               }
-       }
-
-       for (se = 0; se < num_se; se++) {
-               unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
-               unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
-               int idx = (se / 2) * 2;
-
-               raster_config_se[se] = raster_config;
-               if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
-                       raster_config_se[se] &= C_028350_SE_MAP;
-
-                       if (!se_mask[idx]) {
-                               raster_config_se[se] |=
-                                       S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
-                       } else {
-                               raster_config_se[se] |=
-                                       S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
-                       }
-               }
-
-               pkr0_mask &= rb_mask;
-               pkr1_mask &= rb_mask;
-               if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
-                       raster_config_se[se] &= C_028350_PKR_MAP;
-
-                       if (!pkr0_mask) {
-                               raster_config_se[se] |=
-                                       S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
-                       } else {
-                               raster_config_se[se] |=
-                                       S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
-                       }
-               }
-
-               if (rb_per_se >= 2) {
-                       unsigned rb0_mask = 1 << (se * rb_per_se);
-                       unsigned rb1_mask = rb0_mask << 1;
-
-                       rb0_mask &= rb_mask;
-                       rb1_mask &= rb_mask;
-                       if (!rb0_mask || !rb1_mask) {
-                               raster_config_se[se] &= C_028350_RB_MAP_PKR0;
-
-                               if (!rb0_mask) {
-                                       raster_config_se[se] |=
-                                               S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
-                               } else {
-                                       raster_config_se[se] |=
-                                               S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
-                               }
-                       }
-
-                       if (rb_per_se > 2) {
-                               rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
-                               rb1_mask = rb0_mask << 1;
-                               rb0_mask &= rb_mask;
-                               rb1_mask &= rb_mask;
-                               if (!rb0_mask || !rb1_mask) {
-                                       raster_config_se[se] &= C_028350_RB_MAP_PKR1;
-
-                                       if (!rb0_mask) {
-                                               raster_config_se[se] |=
-                                                       S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
-                                       } else {
-                                               raster_config_se[se] |=
-                                                       S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
-                                       }
-                               }
-                       }
-               }
-       }
+   unsigned sh_per_se = MAX2(info->max_sh_per_se, 1); /* clamped to >= 1: used as a divisor */
+   unsigned num_se = MAX2(info->max_se, 1);           /* clamped to >= 1: used as a divisor */
+   unsigned rb_mask = info->enabled_rb_mask;
+   unsigned num_rb = MIN2(info->num_render_backends, 16); /* at most 16 RBs are considered */
+   unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
+   unsigned rb_per_se = num_rb / num_se;
+   unsigned se_mask[4];
+   unsigned se;
+
+   se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask; /* enabled bits among the low rb_per_se bits */
+   se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask; /* NOTE(review): each entry is the
+    * previous, already-masked entry shifted up by rb_per_se, so a bit missing
+    * from se_mask[n] is also missing from se_mask[n + 1] - confirm this is the
+    * intended way to build the per-SE masks. */
+   se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
+   se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;
+
+   assert(num_se == 1 || num_se == 2 || num_se == 4);
+   assert(sh_per_se == 1 || sh_per_se == 2);
+   assert(rb_per_pkr == 1 || rb_per_pkr == 2);
+
+   if (info->chip_class >= GFX7) {
+      unsigned raster_config_1 = *cik_raster_config_1_p;
+      if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) || (!se_mask[2] && !se_mask[3]))) {
+         raster_config_1 &= C_028354_SE_PAIR_MAP; /* clear the field before setting it */
+
+         if (!se_mask[0] && !se_mask[1]) {
+            raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
+         } else {
+            raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
+         }
+         *cik_raster_config_1_p = raster_config_1;
+      }
+   }
+
+   for (se = 0; se < num_se; se++) {
+      unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
+      unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
+      int idx = (se / 2) * 2; /* index of the first SE in this SE's pair (0 or 2) */
+
+      raster_config_se[se] = raster_config; /* start from the common value */
+      if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
+         raster_config_se[se] &= C_028350_SE_MAP;
+
+         if (!se_mask[idx]) {
+            raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
+         } else {
+            raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
+         }
+      }
+
+      pkr0_mask &= rb_mask; /* keep only the enabled bits of each packer's range */
+      pkr1_mask &= rb_mask;
+      if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
+         raster_config_se[se] &= C_028350_PKR_MAP;
+
+         if (!pkr0_mask) {
+            raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
+         } else {
+            raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
+         }
+      }
+
+      if (rb_per_se >= 2) {
+         unsigned rb0_mask = 1 << (se * rb_per_se); /* bit of this SE's first RB */
+         unsigned rb1_mask = rb0_mask << 1;
+
+         rb0_mask &= rb_mask;
+         rb1_mask &= rb_mask;
+         if (!rb0_mask || !rb1_mask) {
+            raster_config_se[se] &= C_028350_RB_MAP_PKR0;
+
+            if (!rb0_mask) {
+               raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
+            } else {
+               raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
+            }
+         }
+
+         if (rb_per_se > 2) {
+            rb0_mask = 1 << (se * rb_per_se + rb_per_pkr); /* same pair test, rb_per_pkr bits up */
+            rb1_mask = rb0_mask << 1;
+            rb0_mask &= rb_mask;
+            rb1_mask &= rb_mask;
+            if (!rb0_mask || !rb1_mask) {
+               raster_config_se[se] &= C_028350_RB_MAP_PKR1;
+
+               if (!rb0_mask) {
+                  raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
+               } else {
+                  raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
+               }
+            }
+         }
+      }
+   }
 }
 
-unsigned ac_get_compute_resource_limits(struct radeon_info *info,
-                                       unsigned waves_per_threadgroup,
-                                       unsigned max_waves_per_sh,
-                                       unsigned threadgroups_per_cu)
+unsigned ac_get_compute_resource_limits(struct radeon_info *info, unsigned waves_per_threadgroup,
+                                        unsigned max_waves_per_sh, unsigned threadgroups_per_cu) /*
+ * Builds and returns a value assembled from S_00B854_* fields:
+ *  - SIMD_DEST_CNTL is always set, to (waves_per_threadgroup % 4 == 0);
+ *  - on GFX7+: FORCE_SIMD_DIST (conditionally), WAVES_PER_SH and
+ *    CU_GROUP_COUNT (threadgroups_per_cu - 1);
+ *  - on older chips: WAVES_PER_SH_GFX6, only when max_waves_per_sh is non-zero.
+ * threadgroups_per_cu is ignored on the pre-GFX7 path.
+ */
 {
-       unsigned compute_resource_limits =
-               S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
-
-       if (info->chip_class >= GFX7) {
-               unsigned num_cu_per_se = info->num_good_compute_units /
-                                        info->max_se;
-
-               /* Force even distribution on all SIMDs in CU if the workgroup
-                * size is 64. This has shown some good improvements if # of CUs
-                * per SE is not a multiple of 4.
-                */
-               if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
-                       compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
-
-               assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8);
-               compute_resource_limits |= S_00B854_WAVES_PER_SH(max_waves_per_sh) |
-                                          S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1);
-       } else {
-               /* GFX6 */
-               if (max_waves_per_sh) {
-                       unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16);
-                       compute_resource_limits |= S_00B854_WAVES_PER_SH_GFX6(limit_div16);
-               }
-       }
-       return compute_resource_limits;
+   unsigned compute_resource_limits = S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
+
+   if (info->chip_class >= GFX7) {
+      unsigned num_cu_per_se = info->num_good_compute_units / info->max_se; /* NOTE(review):
+       * info->max_se is used as a divisor without the MAX2(..., 1) clamp that
+       * ac_get_harvested_configs applies - confirm it is never 0 here. */
+
+      /* Force even distribution on all SIMDs in CU if the workgroup
+       * size is 64. This has shown some good improvements if # of CUs
+       * per SE is not a multiple of 4.
+       */
+      if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
+         compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
+
+      assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8); /* range is only asserted */
+      compute_resource_limits |=
+         S_00B854_WAVES_PER_SH(max_waves_per_sh) | S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1);
+   } else {
+      /* GFX6 */
+      if (max_waves_per_sh) { /* 0 leaves the WAVES_PER_SH_GFX6 field untouched */
+         unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16); /* limit in units of 16 waves */
+         compute_resource_limits |= S_00B854_WAVES_PER_SH_GFX6(limit_div16);
+      }
+   }
+   return compute_resource_limits;
 }
index 70e53f16cb403cda05b1fc76fe69802a592f9c12..f6d4e621b58f5d7d6a4471da6fb4e9ec5002c9ca 100644 (file)
 #ifndef AC_GPU_INFO_H
 #define AC_GPU_INFO_H
 
+#include "amd_family.h"
+
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
-#include <stdbool.h>
-#include "amd_family.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -38,186 +39,179 @@ extern "C" {
 struct amdgpu_gpu_info;
 
 struct radeon_info {
-       /* PCI info: domain:bus:dev:func */
-       uint32_t                    pci_domain;
-       uint32_t                    pci_bus;
-       uint32_t                    pci_dev;
-       uint32_t                    pci_func;
-
-       /* Device info. */
-       const char                  *name;
-       const char                  *marketing_name;
-       bool                        is_pro_graphics;
-       uint32_t                    pci_id;
-       uint32_t                    pci_rev_id;
-       enum radeon_family          family;
-       enum chip_class             chip_class;
-       uint32_t                    family_id;
-       uint32_t                    chip_external_rev;
-       uint32_t                    clock_crystal_freq;
-
-       /* Features. */
-       bool                        has_graphics; /* false if the chip is compute-only */
-       uint32_t                    num_rings[NUM_RING_TYPES];
-       uint32_t                    ib_pad_dw_mask[NUM_RING_TYPES];
-       bool                        has_clear_state;
-       bool                        has_distributed_tess;
-       bool                        has_dcc_constant_encode;
-       bool                        has_rbplus; /* if RB+ registers exist */
-       bool                        rbplus_allowed; /* if RB+ is allowed */
-       bool                        has_load_ctx_reg_pkt;
-       bool                        has_out_of_order_rast;
-       bool                        has_packed_math_16bit;
-       bool                        cpdma_prefetch_writes_memory;
-       bool                        has_gfx9_scissor_bug;
-       bool                        has_tc_compat_zrange_bug;
-       bool                        has_msaa_sample_loc_bug;
-       bool                        has_ls_vgpr_init_bug;
-
-       /* Display features. */
-       /* There are 2 display DCC codepaths, because display expects unaligned DCC. */
-       /* Disable RB and pipe alignment to skip the retile blit. (1 RB chips only) */
-       bool                        use_display_dcc_unaligned;
-       /* Allocate both aligned and unaligned DCC and use the retile blit. */
-       bool                        use_display_dcc_with_retile_blit;
-
-       /* Memory info. */
-       uint32_t                    pte_fragment_size;
-       uint32_t                    gart_page_size;
-       uint64_t                    gart_size;
-       uint64_t                    vram_size;
-       uint64_t                    vram_vis_size;
-       uint32_t                    vram_bit_width;
-       uint32_t                    vram_type;
-       unsigned                    gds_size;
-       unsigned                    gds_gfx_partition_size;
-       uint64_t                    max_alloc_size;
-       uint32_t                    min_alloc_size;
-       uint32_t                    address32_hi;
-       bool                        has_dedicated_vram;
-       bool                        has_l2_uncached;
-       bool                        r600_has_virtual_memory;
-       uint32_t                    num_sdp_interfaces;
-       uint32_t                    num_tcc_blocks;
-       uint32_t                    tcc_cache_line_size;
-       bool                        tcc_harvested;
-       unsigned                    pc_lines;
-       uint32_t                    lds_size_per_workgroup;
-       uint32_t                    lds_granularity;
-       uint32_t                    max_memory_clock;
-       uint32_t                    ce_ram_size;
-       uint32_t                    l1_cache_size;
-       uint32_t                    l2_cache_size;
-
-       /* CP info. */
-       bool                        gfx_ib_pad_with_type2;
-       unsigned                    ib_alignment; /* both start and size alignment */
-       uint32_t                    me_fw_version;
-       uint32_t                    me_fw_feature;
-       uint32_t                    pfp_fw_version;
-       uint32_t                    pfp_fw_feature;
-       uint32_t                    ce_fw_version;
-       uint32_t                    ce_fw_feature;
-
-       /* Multimedia info. */
-       bool                        has_hw_decode;
-       bool                        uvd_enc_supported;
-       uint32_t                    uvd_fw_version;
-       uint32_t                    vce_fw_version;
-       uint32_t                    vce_harvest_config;
-
-       /* Kernel & winsys capabilities. */
-       uint32_t                    drm_major; /* version */
-       uint32_t                    drm_minor;
-       uint32_t                    drm_patchlevel;
-       bool                        is_amdgpu;
-       bool                        has_userptr;
-       bool                        has_syncobj;
-       bool                        has_syncobj_wait_for_submit;
-       bool                        has_timeline_syncobj;
-       bool                        has_fence_to_handle;
-       bool                        has_ctx_priority;
-       bool                        has_local_buffers;
-       bool                        kernel_flushes_hdp_before_ib;
-       bool                        htile_cmask_support_1d_tiling;
-       bool                        si_TA_CS_BC_BASE_ADDR_allowed;
-       bool                        has_bo_metadata;
-       bool                        has_gpu_reset_status_query;
-       bool                        has_eqaa_surface_allocator;
-       bool                        has_format_bc1_through_bc7;
-       bool                        kernel_flushes_tc_l2_after_ib;
-       bool                        has_indirect_compute_dispatch;
-       bool                        has_unaligned_shader_loads;
-       bool                        has_sparse_vm_mappings;
-       bool                        has_2d_tiling;
-       bool                        has_read_registers_query;
-       bool                        has_gds_ordered_append;
-       bool                        has_scheduled_fence_dependency;
-       /* Whether SR-IOV is enabled or amdgpu.mcbp=1 was set on the kernel command line. */
-       bool                        mid_command_buffer_preemption_enabled;
-
-       /* Shader cores. */
-       uint32_t                    cu_mask[4][2];
-       uint32_t                    r600_max_quad_pipes; /* wave size / 16 */
-       uint32_t                    max_shader_clock;
-       uint32_t                    num_good_compute_units;
-       uint32_t                    max_good_cu_per_sa;
-       uint32_t                    min_good_cu_per_sa; /* min != max if SAs have different # of CUs */
-       uint32_t                    max_se; /* shader engines */
-       uint32_t                    max_sh_per_se; /* shader arrays per shader engine */
-       uint32_t                    max_wave64_per_simd;
-       uint32_t                    num_physical_sgprs_per_simd;
-       uint32_t                    num_physical_wave64_vgprs_per_simd;
-       uint32_t                    num_simd_per_compute_unit;
-       uint32_t                    min_sgpr_alloc;
-       uint32_t                    max_sgpr_alloc;
-       uint32_t                    sgpr_alloc_granularity;
-       uint32_t                    min_wave64_vgpr_alloc;
-       uint32_t                    max_vgpr_alloc;
-       uint32_t                    wave64_vgpr_alloc_granularity;
-       bool                        use_late_alloc; /* VS and GS: late pos/param allocation */
-
-       /* Render backends (color + depth blocks). */
-       uint32_t                    r300_num_gb_pipes;
-       uint32_t                    r300_num_z_pipes;
-       uint32_t                    r600_gb_backend_map; /* R600 harvest config */
-       bool                        r600_gb_backend_map_valid;
-       uint32_t                    r600_num_banks;
-       uint32_t                    gb_addr_config;
-       uint32_t                    pa_sc_tile_steering_override; /* CLEAR_STATE also sets this */
-       uint32_t                    num_render_backends;
-       uint32_t                    num_tile_pipes; /* pipe count from PIPE_CONFIG */
-       uint32_t                    pipe_interleave_bytes;
-       uint32_t                    enabled_rb_mask; /* GCN harvest config */
-       uint64_t                    max_alignment; /* from addrlib */
-       uint32_t                    pbb_max_alloc_count;
-
-       /* Tile modes. */
-       uint32_t                    si_tile_mode_array[32];
-       uint32_t                    cik_macrotile_mode_array[16];
+   /* PCI info: domain:bus:dev:func */
+   uint32_t pci_domain;
+   uint32_t pci_bus;
+   uint32_t pci_dev;
+   uint32_t pci_func;
+
+   /* Device info. */
+   const char *name;
+   const char *marketing_name;
+   bool is_pro_graphics;
+   uint32_t pci_id;
+   uint32_t pci_rev_id;
+   enum radeon_family family; /* selects the per-chip values in ac_get_raster_config */
+   enum chip_class chip_class;
+   uint32_t family_id;
+   uint32_t chip_external_rev;
+   uint32_t clock_crystal_freq;
+
+   /* Features. */
+   bool has_graphics; /* false if the chip is compute-only */
+   uint32_t num_rings[NUM_RING_TYPES];
+   uint32_t ib_pad_dw_mask[NUM_RING_TYPES];
+   bool has_clear_state;
+   bool has_distributed_tess;
+   bool has_dcc_constant_encode;
+   bool has_rbplus;     /* if RB+ registers exist */
+   bool rbplus_allowed; /* if RB+ is allowed */
+   bool has_load_ctx_reg_pkt;
+   bool has_out_of_order_rast;
+   bool has_packed_math_16bit;
+   bool cpdma_prefetch_writes_memory;
+   bool has_gfx9_scissor_bug;
+   bool has_tc_compat_zrange_bug;
+   bool has_msaa_sample_loc_bug;
+   bool has_ls_vgpr_init_bug;
+
+   /* Display features. */
+   /* There are 2 display DCC codepaths, because display expects unaligned DCC. */
+   /* Disable RB and pipe alignment to skip the retile blit. (1 RB chips only) */
+   bool use_display_dcc_unaligned;
+   /* Allocate both aligned and unaligned DCC and use the retile blit. */
+   bool use_display_dcc_with_retile_blit;
+
+   /* Memory info. */
+   uint32_t pte_fragment_size;
+   uint32_t gart_page_size;
+   uint64_t gart_size;
+   uint64_t vram_size;
+   uint64_t vram_vis_size;
+   uint32_t vram_bit_width;
+   uint32_t vram_type;
+   unsigned gds_size;
+   unsigned gds_gfx_partition_size;
+   uint64_t max_alloc_size;
+   uint32_t min_alloc_size;
+   uint32_t address32_hi;
+   bool has_dedicated_vram;
+   bool has_l2_uncached;
+   bool r600_has_virtual_memory;
+   uint32_t num_sdp_interfaces;
+   uint32_t num_tcc_blocks;
+   uint32_t tcc_cache_line_size;
+   bool tcc_harvested;
+   unsigned pc_lines;
+   uint32_t lds_size_per_workgroup;
+   uint32_t lds_granularity;
+   uint32_t max_memory_clock;
+   uint32_t ce_ram_size;
+   uint32_t l1_cache_size;
+   uint32_t l2_cache_size;
+
+   /* CP info. */
+   bool gfx_ib_pad_with_type2;
+   unsigned ib_alignment; /* both start and size alignment */
+   uint32_t me_fw_version;
+   uint32_t me_fw_feature;
+   uint32_t pfp_fw_version;
+   uint32_t pfp_fw_feature;
+   uint32_t ce_fw_version;
+   uint32_t ce_fw_feature;
+
+   /* Multimedia info. */
+   bool has_hw_decode;
+   bool uvd_enc_supported;
+   uint32_t uvd_fw_version;
+   uint32_t vce_fw_version;
+   uint32_t vce_harvest_config;
+
+   /* Kernel & winsys capabilities. */
+   uint32_t drm_major; /* version */
+   uint32_t drm_minor;
+   uint32_t drm_patchlevel;
+   bool is_amdgpu; /* when false on Kaveri, ac_get_raster_config applies its drm/radeon workaround */
+   bool has_userptr;
+   bool has_syncobj;
+   bool has_syncobj_wait_for_submit;
+   bool has_timeline_syncobj;
+   bool has_fence_to_handle;
+   bool has_ctx_priority;
+   bool has_local_buffers;
+   bool kernel_flushes_hdp_before_ib;
+   bool htile_cmask_support_1d_tiling;
+   bool si_TA_CS_BC_BASE_ADDR_allowed;
+   bool has_bo_metadata;
+   bool has_gpu_reset_status_query;
+   bool has_eqaa_surface_allocator;
+   bool has_format_bc1_through_bc7;
+   bool kernel_flushes_tc_l2_after_ib;
+   bool has_indirect_compute_dispatch;
+   bool has_unaligned_shader_loads;
+   bool has_sparse_vm_mappings;
+   bool has_2d_tiling;
+   bool has_read_registers_query;
+   bool has_gds_ordered_append;
+   bool has_scheduled_fence_dependency;
+   /* Whether SR-IOV is enabled or amdgpu.mcbp=1 was set on the kernel command line. */
+   bool mid_command_buffer_preemption_enabled;
+
+   /* Shader cores. */
+   uint32_t cu_mask[4][2];
+   uint32_t r600_max_quad_pipes; /* wave size / 16 */
+   uint32_t max_shader_clock;
+   uint32_t num_good_compute_units;
+   uint32_t max_good_cu_per_sa;
+   uint32_t min_good_cu_per_sa; /* min != max if SAs have different # of CUs */
+   uint32_t max_se;             /* shader engines */
+   uint32_t max_sh_per_se;      /* shader arrays per shader engine */
+   uint32_t max_wave64_per_simd;
+   uint32_t num_physical_sgprs_per_simd;
+   uint32_t num_physical_wave64_vgprs_per_simd;
+   uint32_t num_simd_per_compute_unit;
+   uint32_t min_sgpr_alloc;
+   uint32_t max_sgpr_alloc;
+   uint32_t sgpr_alloc_granularity;
+   uint32_t min_wave64_vgpr_alloc;
+   uint32_t max_vgpr_alloc;
+   uint32_t wave64_vgpr_alloc_granularity;
+   bool use_late_alloc; /* VS and GS: late pos/param allocation */
+
+   /* Render backends (color + depth blocks). */
+   uint32_t r300_num_gb_pipes;
+   uint32_t r300_num_z_pipes;
+   uint32_t r600_gb_backend_map; /* R600 harvest config */
+   bool r600_gb_backend_map_valid;
+   uint32_t r600_num_banks;
+   uint32_t gb_addr_config; /* packed value; fields are extracted with the G_0098F8_* macros */
+   uint32_t pa_sc_tile_steering_override; /* CLEAR_STATE also sets this */
+   uint32_t num_render_backends; /* ac_get_harvested_configs considers at most 16 */
+   uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */
+   uint32_t pipe_interleave_bytes;
+   uint32_t enabled_rb_mask; /* GCN harvest config */
+   uint64_t max_alignment;   /* from addrlib */
+   uint32_t pbb_max_alloc_count;
+
+   /* Tile modes. */
+   uint32_t si_tile_mode_array[32];
+   uint32_t cik_macrotile_mode_array[16]; /* entry 0 is tested by the Fiji workaround in ac_get_raster_config */
 };
 
-bool ac_query_gpu_info(int fd, void *dev_p,
-                      struct radeon_info *info,
-                      struct amdgpu_gpu_info *amdinfo);
+bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
+                       struct amdgpu_gpu_info *amdinfo);
 
 void ac_compute_driver_uuid(char *uuid, size_t size);
 
 void ac_compute_device_uuid(struct radeon_info *info, char *uuid, size_t size);
 void ac_print_gpu_info(struct radeon_info *info);
 int ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family);
-void ac_get_raster_config(struct radeon_info *info,
-                         uint32_t *raster_config_p,
-                         uint32_t *raster_config_1_p,
-                         uint32_t *se_tile_repeat_p);
-void ac_get_harvested_configs(struct radeon_info *info,
-                             unsigned raster_config,
-                             unsigned *cik_raster_config_1_p,
-                             unsigned *raster_config_se);
-unsigned ac_get_compute_resource_limits(struct radeon_info *info,
-                                       unsigned waves_per_threadgroup,
-                                       unsigned max_waves_per_sh,
-                                       unsigned threadgroups_per_cu);
+void ac_get_raster_config(struct radeon_info *info, uint32_t *raster_config_p,
+                          uint32_t *raster_config_1_p, uint32_t *se_tile_repeat_p);
+void ac_get_harvested_configs(struct radeon_info *info, unsigned raster_config,
+                              unsigned *cik_raster_config_1_p, unsigned *raster_config_se);
+unsigned ac_get_compute_resource_limits(struct radeon_info *info, unsigned waves_per_threadgroup,
+                                        unsigned max_waves_per_sh, unsigned threadgroups_per_cu);
 
 #ifdef __cplusplus
 }
index e512b8f73275686b228297602b94c6b876667443..8a9cd7c7a6e43454eccfcb1aa879c7fdb0dfc1e4 100644 (file)
 
 #include "ac_rtld.h"
 
+#include "ac_binary.h"
+#include "ac_gpu_info.h"
+#include "util/u_dynarray.h"
+#include "util/u_math.h"
+
 #include <gelf.h>
 #include <libelf.h>
 #include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "ac_binary.h"
-#include "ac_gpu_info.h"
-#include "util/u_dynarray.h"
-#include "util/u_math.h"
-
 // Old distributions may not have this enum constant
 #define MY_EM_AMDGPU 224
 
 #endif
 
 #ifndef R_AMDGPU_NONE
-#define R_AMDGPU_NONE 0
-#define R_AMDGPU_ABS32_LO 1
-#define R_AMDGPU_ABS32_HI 2
-#define R_AMDGPU_ABS64 3
-#define R_AMDGPU_REL32 4
-#define R_AMDGPU_REL64 5
-#define R_AMDGPU_ABS32 6
-#define R_AMDGPU_GOTPCREL 7
+#define R_AMDGPU_NONE          0
+#define R_AMDGPU_ABS32_LO      1
+#define R_AMDGPU_ABS32_HI      2
+#define R_AMDGPU_ABS64         3
+#define R_AMDGPU_REL32         4
+#define R_AMDGPU_REL64         5
+#define R_AMDGPU_ABS32         6
+#define R_AMDGPU_GOTPCREL      7
 #define R_AMDGPU_GOTPCREL32_LO 8
 #define R_AMDGPU_GOTPCREL32_HI 9
-#define R_AMDGPU_REL32_LO 10
-#define R_AMDGPU_REL32_HI 11
-#define R_AMDGPU_RELATIVE64 13
+#define R_AMDGPU_REL32_LO      10
+#define R_AMDGPU_REL32_HI      11
+#define R_AMDGPU_RELATIVE64    13
 #endif
 
 /* For the UMR disassembler. */
-#define DEBUGGER_END_OF_CODE_MARKER    0xbf9f0000 /* invalid instruction */
-#define DEBUGGER_NUM_MARKERS           5
+#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
+#define DEBUGGER_NUM_MARKERS        5
 
 struct ac_rtld_section {
-       bool is_rx : 1;
-       bool is_pasted_text : 1;
-       uint64_t offset;
-       const char *name;
+   bool is_rx : 1;
+   bool is_pasted_text : 1;
+   uint64_t offset;
+   const char *name;
 };
 
 struct ac_rtld_part {
-       Elf *elf;
-       struct ac_rtld_section *sections;
-       unsigned num_sections;
+   Elf *elf;
+   struct ac_rtld_section *sections;
+   unsigned num_sections;
 };
 
 static void report_erroraf(const char *fmt, va_list va)
 {
-       char *msg;
-       int ret = vasprintf(&msg, fmt, va);
-       if (ret < 0)
-               msg = "(vasprintf failed)";
+   char *msg;
+   int ret = vasprintf(&msg, fmt, va);
+   if (ret < 0)
+      msg = "(vasprintf failed)";
 
-       fprintf(stderr, "ac_rtld error: %s\n", msg);
+   fprintf(stderr, "ac_rtld error: %s\n", msg);
 
-       if (ret >= 0)
-               free(msg);
+   if (ret >= 0)
+      free(msg);
 }
 
 static void report_errorf(const char *fmt, ...) PRINTFLIKE(1, 2);
 
 static void report_errorf(const char *fmt, ...)
 {
-       va_list va;
-       va_start(va, fmt);
-       report_erroraf(fmt, va);
-       va_end(va);
+   va_list va;
+   va_start(va, fmt);
+   report_erroraf(fmt, va);
+   va_end(va);
 }
 
 static void report_elf_errorf(const char *fmt, ...) PRINTFLIKE(1, 2);
 
 static void report_elf_errorf(const char *fmt, ...)
 {
-       va_list va;
-       va_start(va, fmt);
-       report_erroraf(fmt, va);
-       va_end(va);
+   va_list va;
+   va_start(va, fmt);
+   report_erroraf(fmt, va);
+   va_end(va);
 
-       fprintf(stderr, "ELF error: %s\n", elf_errmsg(elf_errno()));
+   fprintf(stderr, "ELF error: %s\n", elf_errmsg(elf_errno()));
 }
 
 /**
@@ -119,54 +119,53 @@ static void report_elf_errorf(const char *fmt, ...)
  * \p part_idx.
  */
 static const struct ac_rtld_symbol *find_symbol(const struct util_dynarray *symbols,
-                                               const char *name, unsigned part_idx)
+                                                const char *name, unsigned part_idx)
 {
-       util_dynarray_foreach(symbols, struct ac_rtld_symbol, symbol) {
-               if ((symbol->part_idx == ~0u || symbol->part_idx == part_idx) &&
-                   !strcmp(name, symbol->name))
-                       return symbol;
-       }
-       return 0;
+   util_dynarray_foreach (symbols, struct ac_rtld_symbol, symbol) {
+      if ((symbol->part_idx == ~0u || symbol->part_idx == part_idx) && !strcmp(name, symbol->name))
+         return symbol;
+   }
+   return 0;
 }
 
 static int compare_symbol_by_align(const void *lhsp, const void *rhsp)
 {
-       const struct ac_rtld_symbol *lhs = lhsp;
-       const struct ac_rtld_symbol *rhs = rhsp;
-       if (rhs->align > lhs->align)
-               return 1;
-       if (rhs->align < lhs->align)
-               return -1;
-       return 0;
+   const struct ac_rtld_symbol *lhs = lhsp;
+   const struct ac_rtld_symbol *rhs = rhsp;
+   if (rhs->align > lhs->align)
+      return 1;
+   if (rhs->align < lhs->align)
+      return -1;
+   return 0;
 }
 
 /**
  * Sort the given symbol list by decreasing alignment and assign offsets.
  */
 static bool layout_symbols(struct ac_rtld_symbol *symbols, unsigned num_symbols,
-                          uint64_t *ptotal_size)
+                           uint64_t *ptotal_size)
 {
-       qsort(symbols, num_symbols, sizeof(*symbols), compare_symbol_by_align);
+   qsort(symbols, num_symbols, sizeof(*symbols), compare_symbol_by_align);
 
-       uint64_t total_size = *ptotal_size;
+   uint64_t total_size = *ptotal_size;
 
-       for (unsigned i = 0; i < num_symbols; ++i) {
-               struct ac_rtld_symbol *s = &symbols[i];
-               assert(util_is_power_of_two_nonzero(s->align));
+   for (unsigned i = 0; i < num_symbols; ++i) {
+      struct ac_rtld_symbol *s = &symbols[i];
+      assert(util_is_power_of_two_nonzero(s->align));
 
-               total_size = align64(total_size, s->align);
-               s->offset = total_size;
+      total_size = align64(total_size, s->align);
+      s->offset = total_size;
 
-               if (total_size + s->size < total_size) {
-                       report_errorf("%s: size overflow", __FUNCTION__);
-                       return false;
-               }
+      if (total_size + s->size < total_size) {
+         report_errorf("%s: size overflow", __FUNCTION__);
+         return false;
+      }
 
-               total_size += s->size;
-       }
+      total_size += s->size;
+   }
 
-       *ptotal_size = total_size;
-       return true;
+   *ptotal_size = total_size;
+   return true;
 }
 
 /**
@@ -175,71 +174,68 @@ static bool layout_symbols(struct ac_rtld_symbol *symbols, unsigned num_symbols,
  *
  * Shared LDS symbols are filtered out.
  */
-static bool read_private_lds_symbols(struct ac_rtld_binary *binary,
-                                    unsigned part_idx,
-                                    Elf_Scn *section,
-                                    uint32_t *lds_end_align)
+static bool read_private_lds_symbols(struct ac_rtld_binary *binary, unsigned part_idx,
+                                     Elf_Scn *section, uint32_t *lds_end_align)
 {
-#define report_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-#define report_elf_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_elf_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-
-       struct ac_rtld_part *part = &binary->parts[part_idx];
-       Elf64_Shdr *shdr = elf64_getshdr(section);
-       uint32_t strtabidx = shdr->sh_link;
-       Elf_Data *symbols_data = elf_getdata(section, NULL);
-       report_elf_if(!symbols_data);
-
-       const Elf64_Sym *symbol = symbols_data->d_buf;
-       size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
-
-       for (size_t j = 0; j < num_symbols; ++j, ++symbol) {
-               struct ac_rtld_symbol s = {};
-
-               if (ELF64_ST_TYPE(symbol->st_info) == STT_AMDGPU_LDS) {
-                       /* old-style LDS symbols from initial prototype -- remove eventually */
-                       s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16);
-               } else if (symbol->st_shndx == SHN_AMDGPU_LDS) {
-                       s.align = MIN2(symbol->st_value, 1u << 16);
-                       report_if(!util_is_power_of_two_nonzero(s.align));
-               } else
-                       continue;
-
-               report_if(symbol->st_size > 1u << 29);
-
-               s.name = elf_strptr(part->elf, strtabidx, symbol->st_name);
-               s.size = symbol->st_size;
-               s.part_idx = part_idx;
-
-               if (!strcmp(s.name, "__lds_end")) {
-                       report_elf_if(s.size != 0);
-                       *lds_end_align = MAX2(*lds_end_align, s.align);
-                       continue;
-               }
-
-               const struct ac_rtld_symbol *shared =
-                       find_symbol(&binary->lds_symbols, s.name, part_idx);
-               if (shared) {
-                       report_elf_if(s.align > shared->align);
-                       report_elf_if(s.size > shared->size);
-                       continue;
-               }
-
-               util_dynarray_append(&binary->lds_symbols, struct ac_rtld_symbol, s);
-       }
-
-       return true;
+#define report_if(cond)                                                                            \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_errorf(#cond);                                                                     \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+#define report_elf_if(cond)                                                                        \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_elf_errorf(#cond);                                                                 \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+
+   struct ac_rtld_part *part = &binary->parts[part_idx];
+   Elf64_Shdr *shdr = elf64_getshdr(section);
+   uint32_t strtabidx = shdr->sh_link;
+   Elf_Data *symbols_data = elf_getdata(section, NULL);
+   report_elf_if(!symbols_data);
+
+   const Elf64_Sym *symbol = symbols_data->d_buf;
+   size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
+
+   for (size_t j = 0; j < num_symbols; ++j, ++symbol) {
+      struct ac_rtld_symbol s = {};
+
+      if (ELF64_ST_TYPE(symbol->st_info) == STT_AMDGPU_LDS) {
+         /* old-style LDS symbols from initial prototype -- remove eventually */
+         s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16);
+      } else if (symbol->st_shndx == SHN_AMDGPU_LDS) {
+         s.align = MIN2(symbol->st_value, 1u << 16);
+         report_if(!util_is_power_of_two_nonzero(s.align));
+      } else
+         continue;
+
+      report_if(symbol->st_size > 1u << 29);
+
+      s.name = elf_strptr(part->elf, strtabidx, symbol->st_name);
+      s.size = symbol->st_size;
+      s.part_idx = part_idx;
+
+      if (!strcmp(s.name, "__lds_end")) {
+         report_elf_if(s.size != 0);
+         *lds_end_align = MAX2(*lds_end_align, s.align);
+         continue;
+      }
+
+      const struct ac_rtld_symbol *shared = find_symbol(&binary->lds_symbols, s.name, part_idx);
+      if (shared) {
+         report_elf_if(s.align > shared->align);
+         report_elf_if(s.size > shared->size);
+         continue;
+      }
+
+      util_dynarray_append(&binary->lds_symbols, struct ac_rtld_symbol, s);
+   }
+
+   return true;
 
 #undef report_if
 #undef report_elf_if
@@ -251,486 +247,476 @@ static bool read_private_lds_symbols(struct ac_rtld_binary *binary,
  * \param binary the uninitialized struct
  * \param i binary opening parameters
  */
-bool ac_rtld_open(struct ac_rtld_binary *binary,
-                 struct ac_rtld_open_info i)
+bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i)
 {
-       /* One of the libelf implementations
-        * (http://www.mr511.de/software/english.htm) requires calling
-        * elf_version() before elf_memory().
-        */
-       elf_version(EV_CURRENT);
-
-       memset(binary, 0, sizeof(*binary));
-       memcpy(&binary->options, &i.options, sizeof(binary->options));
-       binary->wave_size = i.wave_size;
-       binary->num_parts = i.num_parts;
-       binary->parts = calloc(sizeof(*binary->parts), i.num_parts);
-       if (!binary->parts)
-               return false;
-
-       uint64_t pasted_text_size = 0;
-       uint64_t rx_align = 1;
-       uint64_t rx_size = 0;
-       uint64_t exec_size = 0;
-
-#define report_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_errorf(#cond); \
-                       goto fail; \
-               } \
-       } while (false)
-#define report_elf_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_elf_errorf(#cond); \
-                       goto fail; \
-               } \
-       } while (false)
-
-       /* Copy and layout shared LDS symbols. */
-       if (i.num_shared_lds_symbols) {
-               if (!util_dynarray_resize(&binary->lds_symbols, struct ac_rtld_symbol,
-                                         i.num_shared_lds_symbols))
-                       goto fail;
-
-               memcpy(binary->lds_symbols.data, i.shared_lds_symbols, binary->lds_symbols.size);
-       }
-
-       util_dynarray_foreach(&binary->lds_symbols, struct ac_rtld_symbol, symbol)
-               symbol->part_idx = ~0u;
-
-       unsigned max_lds_size = 64 * 1024;
-
-       if (i.info->chip_class == GFX6 ||
-           (i.shader_type != MESA_SHADER_COMPUTE &&
-            i.shader_type != MESA_SHADER_FRAGMENT))
-               max_lds_size = 32 * 1024;
-
-       uint64_t shared_lds_size = 0;
-       if (!layout_symbols(binary->lds_symbols.data, i.num_shared_lds_symbols, &shared_lds_size))
-               goto fail;
-
-       if (shared_lds_size > max_lds_size) {
-               fprintf(stderr, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n",
-                       (unsigned)shared_lds_size, max_lds_size);
-               goto fail;
-       }
-       binary->lds_size = shared_lds_size;
-
-       /* First pass over all parts: open ELFs, pre-determine the placement of
-        * sections in the memory image, and collect and layout private LDS symbols. */
-       uint32_t lds_end_align = 0;
-
-       if (binary->options.halt_at_entry)
-               pasted_text_size += 4;
-
-       for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
-               struct ac_rtld_part *part = &binary->parts[part_idx];
-               unsigned part_lds_symbols_begin =
-                       util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol);
-
-               part->elf = elf_memory((char *)i.elf_ptrs[part_idx], i.elf_sizes[part_idx]);
-               report_elf_if(!part->elf);
-
-               const Elf64_Ehdr *ehdr = elf64_getehdr(part->elf);
-               report_elf_if(!ehdr);
-               report_if(ehdr->e_machine != MY_EM_AMDGPU);
-
-               size_t section_str_index;
-               size_t num_shdrs;
-               report_elf_if(elf_getshdrstrndx(part->elf, &section_str_index) < 0);
-               report_elf_if(elf_getshdrnum(part->elf, &num_shdrs) < 0);
-
-               part->num_sections = num_shdrs;
-               part->sections = calloc(sizeof(*part->sections), num_shdrs);
-               report_if(!part->sections);
-
-               Elf_Scn *section = NULL;
-               while ((section = elf_nextscn(part->elf, section))) {
-                       Elf64_Shdr *shdr = elf64_getshdr(section);
-                       struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
-                       s->name = elf_strptr(part->elf, section_str_index, shdr->sh_name);
-                       report_elf_if(!s->name);
-
-                       /* Cannot actually handle linked objects yet */
-                       report_elf_if(shdr->sh_addr != 0);
-
-                       /* Alignment must be 0 or a power of two */
-                       report_elf_if(shdr->sh_addralign & (shdr->sh_addralign - 1));
-                       uint64_t sh_align = MAX2(shdr->sh_addralign, 1);
-
-                       if (shdr->sh_flags & SHF_ALLOC &&
-                           shdr->sh_type != SHT_NOTE) {
-                               report_if(shdr->sh_flags & SHF_WRITE);
-
-                               s->is_rx = true;
-
-                               if (shdr->sh_flags & SHF_EXECINSTR) {
-                                       report_elf_if(shdr->sh_size & 3);
-
-                                       if (!strcmp(s->name, ".text"))
-                                               s->is_pasted_text = true;
-
-                                       exec_size += shdr->sh_size;
-                               }
-
-                               if (s->is_pasted_text) {
-                                       s->offset = pasted_text_size;
-                                       pasted_text_size += shdr->sh_size;
-                               } else {
-                                       rx_align = align(rx_align, sh_align);
-                                       rx_size = align(rx_size, sh_align);
-                                       s->offset = rx_size;
-                                       rx_size += shdr->sh_size;
-                               }
-                       } else if (shdr->sh_type == SHT_SYMTAB) {
-                               if (!read_private_lds_symbols(binary, part_idx, section, &lds_end_align))
-                                       goto fail;
-                       }
-               }
-
-               uint64_t part_lds_size = shared_lds_size;
-               if (!layout_symbols(
-                       util_dynarray_element(&binary->lds_symbols, struct ac_rtld_symbol, part_lds_symbols_begin),
-                       util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol) - part_lds_symbols_begin,
-                       &part_lds_size))
-                       goto fail;
-               binary->lds_size = MAX2(binary->lds_size, part_lds_size);
-       }
-
-       binary->rx_end_markers = pasted_text_size;
-       pasted_text_size += 4 * DEBUGGER_NUM_MARKERS;
-
-       /* __lds_end is a special symbol that points at the end of the memory
-        * occupied by other LDS symbols. Its alignment is taken as the
-        * maximum of its alignment over all shader parts where it occurs.
-        */
-       if (lds_end_align) {
-               binary->lds_size = align(binary->lds_size, lds_end_align);
-
-               struct ac_rtld_symbol *lds_end =
-                       util_dynarray_grow(&binary->lds_symbols, struct ac_rtld_symbol, 1);
-               lds_end->name = "__lds_end";
-               lds_end->size = 0;
-               lds_end->align = lds_end_align;
-               lds_end->offset = binary->lds_size;
-               lds_end->part_idx = ~0u;
-       }
-
-       if (binary->lds_size > max_lds_size) {
-               fprintf(stderr, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n",
-                       (unsigned)binary->lds_size, max_lds_size);
-               goto fail;
-       }
-
-       /* Second pass: Adjust offsets of non-pasted text sections. */
-       binary->rx_size = pasted_text_size;
-       binary->rx_size = align(binary->rx_size, rx_align);
-
-       for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
-               struct ac_rtld_part *part = &binary->parts[part_idx];
-               size_t num_shdrs;
-               elf_getshdrnum(part->elf, &num_shdrs);
-
-               for (unsigned j = 0; j < num_shdrs; ++j) {
-                       struct ac_rtld_section *s = &part->sections[j];
-                       if (s->is_rx && !s->is_pasted_text)
-                               s->offset += binary->rx_size;
-               }
-       }
-
-       binary->rx_size += rx_size;
-       binary->exec_size = exec_size;
-
-       if (i.info->chip_class >= GFX10) {
-               /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords
-                * ahead of the PC, configurable by SH_MEM_CONFIG and
-                * S_INST_PREFETCH. This can cause two issues:
-                *
-                * (1) Crossing a page boundary to an unmapped page. The logic
-                *     does not distinguish between a required fetch and a "mere"
-                *     prefetch and will fault.
-                *
-                * (2) Prefetching instructions that will be changed for a
-                *     different shader.
-                *
-                * (2) is not currently an issue because we flush the I$ at IB
-                * boundaries, but (1) needs to be addressed. Due to buffer
-                * suballocation, we just play it safe.
-                */
-               binary->rx_size = align(binary->rx_size + 3 * 64, 64);
-       }
-
-       return true;
+   /* One of the libelf implementations
+    * (http://www.mr511.de/software/english.htm) requires calling
+    * elf_version() before elf_memory().
+    */
+   elf_version(EV_CURRENT);
+
+   memset(binary, 0, sizeof(*binary));
+   memcpy(&binary->options, &i.options, sizeof(binary->options));
+   binary->wave_size = i.wave_size;
+   binary->num_parts = i.num_parts;
+   binary->parts = calloc(sizeof(*binary->parts), i.num_parts);
+   if (!binary->parts)
+      return false;
+
+   uint64_t pasted_text_size = 0;
+   uint64_t rx_align = 1;
+   uint64_t rx_size = 0;
+   uint64_t exec_size = 0;
+
+#define report_if(cond)                                                                            \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_errorf(#cond);                                                                     \
+         goto fail;                                                                                \
+      }                                                                                            \
+   } while (false)
+#define report_elf_if(cond)                                                                        \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_elf_errorf(#cond);                                                                 \
+         goto fail;                                                                                \
+      }                                                                                            \
+   } while (false)
+
+   /* Copy and lay out shared LDS symbols. */
+   if (i.num_shared_lds_symbols) {
+      if (!util_dynarray_resize(&binary->lds_symbols, struct ac_rtld_symbol,
+                                i.num_shared_lds_symbols))
+         goto fail;
+
+      memcpy(binary->lds_symbols.data, i.shared_lds_symbols, binary->lds_symbols.size);
+   }
+
+   util_dynarray_foreach (&binary->lds_symbols, struct ac_rtld_symbol, symbol)
+      symbol->part_idx = ~0u;
+
+   unsigned max_lds_size = 64 * 1024;
+
+   if (i.info->chip_class == GFX6 ||
+       (i.shader_type != MESA_SHADER_COMPUTE && i.shader_type != MESA_SHADER_FRAGMENT))
+      max_lds_size = 32 * 1024;
+
+   uint64_t shared_lds_size = 0;
+   if (!layout_symbols(binary->lds_symbols.data, i.num_shared_lds_symbols, &shared_lds_size))
+      goto fail;
+
+   if (shared_lds_size > max_lds_size) {
+      fprintf(stderr, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n",
+              (unsigned)shared_lds_size, max_lds_size);
+      goto fail;
+   }
+   binary->lds_size = shared_lds_size;
+
+   /* First pass over all parts: open ELFs, pre-determine the placement of
+    * sections in the memory image, and collect and lay out private LDS symbols. */
+   uint32_t lds_end_align = 0;
+
+   if (binary->options.halt_at_entry)
+      pasted_text_size += 4;
+
+   for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
+      struct ac_rtld_part *part = &binary->parts[part_idx];
+      unsigned part_lds_symbols_begin =
+         util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol);
+
+      part->elf = elf_memory((char *)i.elf_ptrs[part_idx], i.elf_sizes[part_idx]);
+      report_elf_if(!part->elf);
+
+      const Elf64_Ehdr *ehdr = elf64_getehdr(part->elf);
+      report_elf_if(!ehdr);
+      report_if(ehdr->e_machine != MY_EM_AMDGPU);
+
+      size_t section_str_index;
+      size_t num_shdrs;
+      report_elf_if(elf_getshdrstrndx(part->elf, &section_str_index) < 0);
+      report_elf_if(elf_getshdrnum(part->elf, &num_shdrs) < 0);
+
+      part->num_sections = num_shdrs;
+      part->sections = calloc(sizeof(*part->sections), num_shdrs);
+      report_if(!part->sections);
+
+      Elf_Scn *section = NULL;
+      while ((section = elf_nextscn(part->elf, section))) {
+         Elf64_Shdr *shdr = elf64_getshdr(section);
+         struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
+         s->name = elf_strptr(part->elf, section_str_index, shdr->sh_name);
+         report_elf_if(!s->name);
+
+         /* Cannot actually handle linked objects yet */
+         report_elf_if(shdr->sh_addr != 0);
+
+         /* Alignment must be 0 or a power of two */
+         report_elf_if(shdr->sh_addralign & (shdr->sh_addralign - 1));
+         uint64_t sh_align = MAX2(shdr->sh_addralign, 1);
+
+         if (shdr->sh_flags & SHF_ALLOC && shdr->sh_type != SHT_NOTE) {
+            report_if(shdr->sh_flags & SHF_WRITE);
+
+            s->is_rx = true;
+
+            if (shdr->sh_flags & SHF_EXECINSTR) {
+               report_elf_if(shdr->sh_size & 3);
+
+               if (!strcmp(s->name, ".text"))
+                  s->is_pasted_text = true;
+
+               exec_size += shdr->sh_size;
+            }
+
+            if (s->is_pasted_text) {
+               s->offset = pasted_text_size;
+               pasted_text_size += shdr->sh_size;
+            } else {
+               rx_align = align(rx_align, sh_align);
+               rx_size = align(rx_size, sh_align);
+               s->offset = rx_size;
+               rx_size += shdr->sh_size;
+            }
+         } else if (shdr->sh_type == SHT_SYMTAB) {
+            if (!read_private_lds_symbols(binary, part_idx, section, &lds_end_align))
+               goto fail;
+         }
+      }
+
+      uint64_t part_lds_size = shared_lds_size;
+      if (!layout_symbols(util_dynarray_element(&binary->lds_symbols, struct ac_rtld_symbol,
+                                                part_lds_symbols_begin),
+                          util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol) -
+                             part_lds_symbols_begin,
+                          &part_lds_size))
+         goto fail;
+      binary->lds_size = MAX2(binary->lds_size, part_lds_size);
+   }
+
+   binary->rx_end_markers = pasted_text_size;
+   pasted_text_size += 4 * DEBUGGER_NUM_MARKERS;
+
+   /* __lds_end is a special symbol that points at the end of the memory
+    * occupied by other LDS symbols. Its alignment is taken as the
+    * maximum of its alignment over all shader parts where it occurs.
+    */
+   if (lds_end_align) {
+      binary->lds_size = align(binary->lds_size, lds_end_align);
+
+      struct ac_rtld_symbol *lds_end =
+         util_dynarray_grow(&binary->lds_symbols, struct ac_rtld_symbol, 1);
+      lds_end->name = "__lds_end";
+      lds_end->size = 0;
+      lds_end->align = lds_end_align;
+      lds_end->offset = binary->lds_size;
+      lds_end->part_idx = ~0u;
+   }
+
+   if (binary->lds_size > max_lds_size) {
+      fprintf(stderr, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n",
+              (unsigned)binary->lds_size, max_lds_size);
+      goto fail;
+   }
+
+   /* Second pass: Adjust offsets of rx sections that are not pasted text. */
+   binary->rx_size = pasted_text_size;
+   binary->rx_size = align(binary->rx_size, rx_align);
+
+   for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
+      struct ac_rtld_part *part = &binary->parts[part_idx];
+      size_t num_shdrs;
+      elf_getshdrnum(part->elf, &num_shdrs);
+
+      for (unsigned j = 0; j < num_shdrs; ++j) {
+         struct ac_rtld_section *s = &part->sections[j];
+         if (s->is_rx && !s->is_pasted_text)
+            s->offset += binary->rx_size;
+      }
+   }
+
+   binary->rx_size += rx_size;
+   binary->exec_size = exec_size;
+
+   if (i.info->chip_class >= GFX10) {
+      /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords
+       * ahead of the PC, configurable by SH_MEM_CONFIG and
+       * S_INST_PREFETCH. This can cause two issues:
+       *
+       * (1) Crossing a page boundary to an unmapped page. The logic
+       *     does not distinguish between a required fetch and a "mere"
+       *     prefetch and will fault.
+       *
+       * (2) Prefetching instructions that will be changed for a
+       *     different shader.
+       *
+       * (2) is not currently an issue because we flush the I$ at IB
+       * boundaries, but (1) needs to be addressed. Due to buffer
+       * suballocation, we just play it safe.
+       */
+      binary->rx_size = align(binary->rx_size + 3 * 64, 64);
+   }
+
+   return true;
 
 #undef report_if
 #undef report_elf_if
 
 fail:
-       ac_rtld_close(binary);
-       return false;
+   ac_rtld_close(binary);
+   return false;
 }
 
 void ac_rtld_close(struct ac_rtld_binary *binary)
 {
-       for (unsigned i = 0; i < binary->num_parts; ++i) {
-               struct ac_rtld_part *part = &binary->parts[i];
-               free(part->sections);
-               elf_end(part->elf);
-       }
-
-       util_dynarray_fini(&binary->lds_symbols);
-       free(binary->parts);
-       binary->parts = NULL;
-       binary->num_parts = 0;
+   for (unsigned i = 0; i < binary->num_parts; ++i) { /* release what each part owns */
+      struct ac_rtld_part *part = &binary->parts[i];
+      free(part->sections);
+      elf_end(part->elf); /* libelf handle of this part */
+   }
+
+   util_dynarray_fini(&binary->lds_symbols);
+   free(binary->parts);
+   binary->parts = NULL; /* don't leave a dangling pointer behind */
+   binary->num_parts = 0; /* keep the count in sync with the released array */
 }
 
-static bool get_section_by_name(struct ac_rtld_part *part, const char *name,
-                               const char **data, size_t *nbytes)
+static bool get_section_by_name(struct ac_rtld_part *part, const char *name, const char **data,
+                                size_t *nbytes) /* finds a section of one part by name */
 {
-       for (unsigned i = 0; i < part->num_sections; ++i) {
-               struct ac_rtld_section *s = &part->sections[i];
-               if (s->name && !strcmp(name, s->name)) {
-                       Elf_Scn *target_scn = elf_getscn(part->elf, i);
-                       Elf_Data *target_data = elf_getdata(target_scn, NULL);
-                       if (!target_data) {
-                               report_elf_errorf("ac_rtld: get_section_by_name: elf_getdata");
-                               return false;
-                       }
-
-                       *data = target_data->d_buf;
-                       *nbytes = target_data->d_size;
-                       return true;
-               }
-       }
-       return false;
+   for (unsigned i = 0; i < part->num_sections; ++i) { /* linear scan; first name match wins */
+      struct ac_rtld_section *s = &part->sections[i];
+      if (s->name && !strcmp(name, s->name)) { /* sections without a name never match */
+         Elf_Scn *target_scn = elf_getscn(part->elf, i); /* i doubles as the ELF section index */
+         Elf_Data *target_data = elf_getdata(target_scn, NULL);
+         if (!target_data) {
+            report_elf_errorf("ac_rtld: get_section_by_name: elf_getdata");
+            return false;
+         }
+
+         *data = target_data->d_buf; /* pointer into the Elf_Data, not a copy */
+         *nbytes = target_data->d_size;
+         return true;
+      }
+   }
+   return false; /* no section called name; *data and *nbytes are left untouched */
 }
 
-bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name,
-                                const char **data, size_t *nbytes)
+bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name, const char **data,
+                                 size_t *nbytes)
 {
-       assert(binary->num_parts == 1);
-       return get_section_by_name(&binary->parts[0], name, data, nbytes);
+   assert(binary->num_parts == 1); /* only defined for single-part binaries */
+   return get_section_by_name(&binary->parts[0], name, data, nbytes); /* false: absent/unreadable */
 }
 
-bool ac_rtld_read_config(const struct radeon_info *info,
-                        struct ac_rtld_binary *binary,
-                        struct ac_shader_config *config)
+bool ac_rtld_read_config(const struct radeon_info *info, struct ac_rtld_binary *binary,
+                         struct ac_shader_config *config)
 {
-       for (unsigned i = 0; i < binary->num_parts; ++i) {
-               struct ac_rtld_part *part = &binary->parts[i];
-               const char *config_data;
-               size_t config_nbytes;
-
-               if (!get_section_by_name(part, ".AMDGPU.config",
-                                        &config_data, &config_nbytes))
-                       return false;
-
-               /* TODO: be precise about scratch use? */
-               struct ac_shader_config c = {};
-               ac_parse_shader_binary_config(config_data, config_nbytes,
-                                             binary->wave_size, true, info, &c);
-
-               config->num_sgprs = MAX2(config->num_sgprs, c.num_sgprs);
-               config->num_vgprs = MAX2(config->num_vgprs, c.num_vgprs);
-               config->spilled_sgprs = MAX2(config->spilled_sgprs, c.spilled_sgprs);
-               config->spilled_vgprs = MAX2(config->spilled_vgprs, c.spilled_vgprs);
-               config->scratch_bytes_per_wave = MAX2(config->scratch_bytes_per_wave,
-                                                     c.scratch_bytes_per_wave);
-
-               assert(i == 0 || config->float_mode == c.float_mode);
-               config->float_mode = c.float_mode;
-
-               /* SPI_PS_INPUT_ENA/ADDR can't be combined. Only the value from
-                * the main shader part is used. */
-               assert(config->spi_ps_input_ena == 0 &&
-                      config->spi_ps_input_addr == 0);
-               config->spi_ps_input_ena = c.spi_ps_input_ena;
-               config->spi_ps_input_addr = c.spi_ps_input_addr;
-
-               /* TODO: consistently use LDS symbols for this */
-               config->lds_size = MAX2(config->lds_size, c.lds_size);
-
-               /* TODO: Should we combine these somehow? It's currently only
-                * used for radeonsi's compute, where multiple parts aren't used. */
-               assert(config->rsrc1 == 0 && config->rsrc2 == 0);
-               config->rsrc1 = c.rsrc1;
-               config->rsrc2 = c.rsrc2;
-       }
-
-       return true;
+   for (unsigned i = 0; i < binary->num_parts; ++i) { /* fold every part's config into *config */
+      struct ac_rtld_part *part = &binary->parts[i];
+      const char *config_data;
+      size_t config_nbytes;
+
+      if (!get_section_by_name(part, ".AMDGPU.config", &config_data, &config_nbytes))
+         return false; /* *config may already hold the values of earlier parts */
+
+      /* TODO: be precise about scratch use? */
+      struct ac_shader_config c = {}; /* values of this part only */
+      ac_parse_shader_binary_config(config_data, config_nbytes, binary->wave_size, true, info, &c);
+
+      config->num_sgprs = MAX2(config->num_sgprs, c.num_sgprs); /* usage counts: keep the max */
+      config->num_vgprs = MAX2(config->num_vgprs, c.num_vgprs);
+      config->spilled_sgprs = MAX2(config->spilled_sgprs, c.spilled_sgprs);
+      config->spilled_vgprs = MAX2(config->spilled_vgprs, c.spilled_vgprs);
+      config->scratch_bytes_per_wave =
+         MAX2(config->scratch_bytes_per_wave, c.scratch_bytes_per_wave);
+
+      assert(i == 0 || config->float_mode == c.float_mode); /* all parts must agree */
+      config->float_mode = c.float_mode;
+
+      /* SPI_PS_INPUT_ENA/ADDR can't be combined. Only the value from the main shader part is
+       * used; the assert requires that neither the caller nor an earlier part has set them. */
+      assert(config->spi_ps_input_ena == 0 && config->spi_ps_input_addr == 0);
+      config->spi_ps_input_ena = c.spi_ps_input_ena;
+      config->spi_ps_input_addr = c.spi_ps_input_addr;
+
+      /* TODO: consistently use LDS symbols for this */
+      config->lds_size = MAX2(config->lds_size, c.lds_size);
+
+      /* TODO: Should we combine these somehow? It's currently only
+       * used for radeonsi's compute, where multiple parts aren't used. */
+      assert(config->rsrc1 == 0 && config->rsrc2 == 0);
+      config->rsrc1 = c.rsrc1;
+      config->rsrc2 = c.rsrc2;
+   }
+
+   return true;
 }
 
-static bool resolve_symbol(const struct ac_rtld_upload_info *u,
-                          unsigned part_idx, const Elf64_Sym *sym,
-                          const char *name, uint64_t *value)
+static bool resolve_symbol(const struct ac_rtld_upload_info *u, unsigned part_idx,
+                           const Elf64_Sym *sym, const char *name, uint64_t *value)
 {
-       /* TODO: properly disentangle the undef and the LDS cases once
-        * STT_AMDGPU_LDS is retired. */
-       if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_AMDGPU_LDS) {
-               const struct ac_rtld_symbol *lds_sym =
-                       find_symbol(&u->binary->lds_symbols, name, part_idx);
-
-               if (lds_sym) {
-                       *value = lds_sym->offset;
-                       return true;
-               }
-
-               /* TODO: resolve from other parts */
-
-               if (u->get_external_symbol(u->cb_data, name, value))
-                       return true;
-
-               report_errorf("symbol %s: unknown", name);
-               return false;
-       }
-
-       struct ac_rtld_part *part = &u->binary->parts[part_idx];
-       if (sym->st_shndx >= part->num_sections) {
-               report_errorf("symbol %s: section out of bounds", name);
-               return false;
-       }
-
-       struct ac_rtld_section *s = &part->sections[sym->st_shndx];
-       if (!s->is_rx) {
-               report_errorf("symbol %s: bad section", name);
-               return false;
-       }
-
-       uint64_t section_base = u->rx_va + s->offset;
-
-       *value = section_base + sym->st_value;
-       return true;
+   /* TODO: properly disentangle the undef and the LDS cases once
+    * STT_AMDGPU_LDS is retired. */
+   if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_AMDGPU_LDS) {
+      const struct ac_rtld_symbol *lds_sym = find_symbol(&u->binary->lds_symbols, name, part_idx);
+
+      if (lds_sym) {
+         *value = lds_sym->offset; /* an LDS symbol resolves to its offset, not to a VA */
+         return true;
+      }
+
+      /* TODO: resolve from other parts */
+
+      if (u->get_external_symbol(u->cb_data, name, value)) /* otherwise ask the caller's callback */
+         return true;
+
+      report_errorf("symbol %s: unknown", name);
+      return false;
+   }
+
+   struct ac_rtld_part *part = &u->binary->parts[part_idx];
+   if (sym->st_shndx >= part->num_sections) { /* guards the sections[] lookup below */
+      report_errorf("symbol %s: section out of bounds", name);
+      return false;
+   }
+
+   struct ac_rtld_section *s = &part->sections[sym->st_shndx];
+   if (!s->is_rx) { /* only symbols defined in rx sections can be given an address */
+      report_errorf("symbol %s: bad section", name);
+      return false;
+   }
+
+   uint64_t section_base = u->rx_va + s->offset;
+
+   *value = section_base + sym->st_value; /* rx buffer VA + section offset + st_value */
+   return true;
 }
 
-static bool apply_relocs(const struct ac_rtld_upload_info *u,
-                        unsigned part_idx, const Elf64_Shdr *reloc_shdr,
-                        const Elf_Data *reloc_data)
+static bool apply_relocs(const struct ac_rtld_upload_info *u, unsigned part_idx,
+                         const Elf64_Shdr *reloc_shdr, const Elf_Data *reloc_data)
 {
-#define report_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-#define report_elf_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_elf_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-
-       struct ac_rtld_part *part = &u->binary->parts[part_idx];
-       Elf_Scn *target_scn = elf_getscn(part->elf, reloc_shdr->sh_info);
-       report_elf_if(!target_scn);
-
-       Elf_Data *target_data = elf_getdata(target_scn, NULL);
-       report_elf_if(!target_data);
-
-       Elf_Scn *symbols_scn = elf_getscn(part->elf, reloc_shdr->sh_link);
-       report_elf_if(!symbols_scn);
-
-       Elf64_Shdr *symbols_shdr = elf64_getshdr(symbols_scn);
-       report_elf_if(!symbols_shdr);
-       uint32_t strtabidx = symbols_shdr->sh_link;
-
-       Elf_Data *symbols_data = elf_getdata(symbols_scn, NULL);
-       report_elf_if(!symbols_data);
-
-       const Elf64_Sym *symbols = symbols_data->d_buf;
-       size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
-
-       struct ac_rtld_section *s = &part->sections[reloc_shdr->sh_info];
-       report_if(!s->is_rx);
-
-       const char *orig_base = target_data->d_buf;
-       char *dst_base = u->rx_ptr + s->offset;
-       uint64_t va_base = u->rx_va + s->offset;
-
-       Elf64_Rel *rel = reloc_data->d_buf;
-       size_t num_relocs = reloc_data->d_size / sizeof(*rel);
-       for (size_t i = 0; i < num_relocs; ++i, ++rel) {
-               size_t r_sym = ELF64_R_SYM(rel->r_info);
-               unsigned r_type = ELF64_R_TYPE(rel->r_info);
-
-               const char *orig_ptr = orig_base + rel->r_offset;
-               char *dst_ptr = dst_base + rel->r_offset;
-               uint64_t va = va_base + rel->r_offset;
-
-               uint64_t symbol;
-               uint64_t addend;
-
-               if (r_sym == STN_UNDEF) {
-                       symbol = 0;
-               } else {
-                       report_elf_if(r_sym >= num_symbols);
-
-                       const Elf64_Sym *sym = &symbols[r_sym];
-                       const char *symbol_name =
-                               elf_strptr(part->elf, strtabidx, sym->st_name);
-                       report_elf_if(!symbol_name);
-
-                       if (!resolve_symbol(u, part_idx, sym, symbol_name, &symbol))
-                               return false;
-               }
-
-               /* TODO: Should we also support .rela sections, where the
-                * addend is part of the relocation record? */
-
-               /* Load the addend from the ELF instead of the destination,
-                * because the destination may be in VRAM. */
-               switch (r_type) {
-               case R_AMDGPU_ABS32:
-               case R_AMDGPU_ABS32_LO:
-               case R_AMDGPU_ABS32_HI:
-               case R_AMDGPU_REL32:
-               case R_AMDGPU_REL32_LO:
-               case R_AMDGPU_REL32_HI:
-                       addend = *(const uint32_t *)orig_ptr;
-                       break;
-               case R_AMDGPU_ABS64:
-               case R_AMDGPU_REL64:
-                       addend = *(const uint64_t *)orig_ptr;
-                       break;
-               default:
-                       report_errorf("unsupported r_type == %u", r_type);
-                       return false;
-               }
-
-               uint64_t abs = symbol + addend;
-
-               switch (r_type) {
-               case R_AMDGPU_ABS32:
-                       assert((uint32_t)abs == abs);
-               case R_AMDGPU_ABS32_LO:
-                       *(uint32_t *)dst_ptr = util_cpu_to_le32(abs);
-                       break;
-               case R_AMDGPU_ABS32_HI:
-                       *(uint32_t *)dst_ptr = util_cpu_to_le32(abs >> 32);
-                       break;
-               case R_AMDGPU_ABS64:
-                       *(uint64_t *)dst_ptr = util_cpu_to_le64(abs);
-                       break;
-               case R_AMDGPU_REL32:
-                       assert((int64_t)(int32_t)(abs - va) == (int64_t)(abs - va));
-               case R_AMDGPU_REL32_LO:
-                       *(uint32_t *)dst_ptr = util_cpu_to_le32(abs - va);
-                       break;
-               case R_AMDGPU_REL32_HI:
-                       *(uint32_t *)dst_ptr = util_cpu_to_le32((abs - va) >> 32);
-                       break;
-               case R_AMDGPU_REL64:
-                       *(uint64_t *)dst_ptr = util_cpu_to_le64(abs - va);
-                       break;
-               default:
-                       unreachable("bad r_type");
-               }
-       }
-
-       return true;
+#define report_if(cond)                                                                            \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_errorf(#cond);                                                                     \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+#define report_elf_if(cond)                                                                        \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_elf_errorf(#cond);                                                                 \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+
+   struct ac_rtld_part *part = &u->binary->parts[part_idx];
+   Elf_Scn *target_scn = elf_getscn(part->elf, reloc_shdr->sh_info);
+   report_elf_if(!target_scn);
+
+   Elf_Data *target_data = elf_getdata(target_scn, NULL);
+   report_elf_if(!target_data);
+
+   Elf_Scn *symbols_scn = elf_getscn(part->elf, reloc_shdr->sh_link);
+   report_elf_if(!symbols_scn);
+
+   Elf64_Shdr *symbols_shdr = elf64_getshdr(symbols_scn);
+   report_elf_if(!symbols_shdr);
+   uint32_t strtabidx = symbols_shdr->sh_link;
+
+   Elf_Data *symbols_data = elf_getdata(symbols_scn, NULL);
+   report_elf_if(!symbols_data);
+
+   const Elf64_Sym *symbols = symbols_data->d_buf;
+   size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
+
+   struct ac_rtld_section *s = &part->sections[reloc_shdr->sh_info];
+   report_if(!s->is_rx);
+
+   const char *orig_base = target_data->d_buf;
+   char *dst_base = u->rx_ptr + s->offset;
+   uint64_t va_base = u->rx_va + s->offset;
+
+   Elf64_Rel *rel = reloc_data->d_buf;
+   size_t num_relocs = reloc_data->d_size / sizeof(*rel);
+   for (size_t i = 0; i < num_relocs; ++i, ++rel) {
+      size_t r_sym = ELF64_R_SYM(rel->r_info);
+      unsigned r_type = ELF64_R_TYPE(rel->r_info);
+
+      const char *orig_ptr = orig_base + rel->r_offset;
+      char *dst_ptr = dst_base + rel->r_offset;
+      uint64_t va = va_base + rel->r_offset;
+
+      uint64_t symbol;
+      uint64_t addend;
+
+      if (r_sym == STN_UNDEF) {
+         symbol = 0;
+      } else {
+         report_elf_if(r_sym >= num_symbols);
+
+         const Elf64_Sym *sym = &symbols[r_sym];
+         const char *symbol_name = elf_strptr(part->elf, strtabidx, sym->st_name);
+         report_elf_if(!symbol_name);
+
+         if (!resolve_symbol(u, part_idx, sym, symbol_name, &symbol))
+            return false;
+      }
+
+      /* TODO: Should we also support .rela sections, where the
+       * addend is part of the relocation record? */
+
+      /* Load the addend from the ELF instead of the destination,
+       * because the destination may be in VRAM. */
+      switch (r_type) {
+      case R_AMDGPU_ABS32:
+      case R_AMDGPU_ABS32_LO:
+      case R_AMDGPU_ABS32_HI:
+      case R_AMDGPU_REL32:
+      case R_AMDGPU_REL32_LO:
+      case R_AMDGPU_REL32_HI:
+         addend = *(const uint32_t *)orig_ptr;
+         break;
+      case R_AMDGPU_ABS64:
+      case R_AMDGPU_REL64:
+         addend = *(const uint64_t *)orig_ptr;
+         break;
+      default:
+         report_errorf("unsupported r_type == %u", r_type);
+         return false;
+      }
+
+      uint64_t abs = symbol + addend;
+
+      switch (r_type) {
+      case R_AMDGPU_ABS32:
+         assert((uint32_t)abs == abs);
+      case R_AMDGPU_ABS32_LO:
+         *(uint32_t *)dst_ptr = util_cpu_to_le32(abs);
+         break;
+      case R_AMDGPU_ABS32_HI:
+         *(uint32_t *)dst_ptr = util_cpu_to_le32(abs >> 32);
+         break;
+      case R_AMDGPU_ABS64:
+         *(uint64_t *)dst_ptr = util_cpu_to_le64(abs);
+         break;
+      case R_AMDGPU_REL32:
+         assert((int64_t)(int32_t)(abs - va) == (int64_t)(abs - va));
+      case R_AMDGPU_REL32_LO:
+         *(uint32_t *)dst_ptr = util_cpu_to_le32(abs - va);
+         break;
+      case R_AMDGPU_REL32_HI:
+         *(uint32_t *)dst_ptr = util_cpu_to_le32((abs - va) >> 32);
+         break;
+      case R_AMDGPU_REL64:
+         *(uint64_t *)dst_ptr = util_cpu_to_le64(abs - va);
+         break;
+      default:
+         unreachable("bad r_type");
+      }
+   }
+
+   return true;
 
 #undef report_if
 #undef report_elf_if
@@ -742,72 +728,72 @@ static bool apply_relocs(const struct ac_rtld_upload_info *u,
  */
 bool ac_rtld_upload(struct ac_rtld_upload_info *u)
 {
-#define report_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-#define report_elf_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-
-       if (u->binary->options.halt_at_entry) {
-               /* s_sethalt 1 */
-               *(uint32_t *)u->rx_ptr = util_cpu_to_le32(0xbf8d0001);
-       }
-
-       /* First pass: upload raw section data and lay out private LDS symbols. */
-       for (unsigned i = 0; i < u->binary->num_parts; ++i) {
-               struct ac_rtld_part *part = &u->binary->parts[i];
-
-               Elf_Scn *section = NULL;
-               while ((section = elf_nextscn(part->elf, section))) {
-                       Elf64_Shdr *shdr = elf64_getshdr(section);
-                       struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
-
-                       if (!s->is_rx)
-                               continue;
-
-                       report_if(shdr->sh_type != SHT_PROGBITS);
-
-                       Elf_Data *data = elf_getdata(section, NULL);
-                       report_elf_if(!data || data->d_size != shdr->sh_size);
-                       memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size);
-               }
-       }
-
-       if (u->binary->rx_end_markers) {
-               uint32_t *dst = (uint32_t *)(u->rx_ptr + u->binary->rx_end_markers);
-               for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; ++i)
-                       *dst++ = util_cpu_to_le32(DEBUGGER_END_OF_CODE_MARKER);
-       }
-
-       /* Second pass: handle relocations, overwriting uploaded data where
-        * appropriate. */
-       for (unsigned i = 0; i < u->binary->num_parts; ++i) {
-               struct ac_rtld_part *part = &u->binary->parts[i];
-               Elf_Scn *section = NULL;
-               while ((section = elf_nextscn(part->elf, section))) {
-                       Elf64_Shdr *shdr = elf64_getshdr(section);
-                       if (shdr->sh_type == SHT_REL) {
-                               Elf_Data *relocs = elf_getdata(section, NULL);
-                               report_elf_if(!relocs || relocs->d_size != shdr->sh_size);
-                               if (!apply_relocs(u, i, shdr, relocs))
-                                       return false;
-                       } else if (shdr->sh_type == SHT_RELA) {
-                               report_errorf("SHT_RELA not supported");
-                               return false;
-                       }
-               }
-       }
-
-       return true;
+#define report_if(cond)                                                                            \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_errorf(#cond);                                                                     \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+#define report_elf_if(cond) /* reports via report_elf_errorf, matching apply_relocs */             \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_elf_errorf(#cond);                                                                 \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+
+   if (u->binary->options.halt_at_entry) {
+      /* s_sethalt 1 */
+      *(uint32_t *)u->rx_ptr = util_cpu_to_le32(0xbf8d0001);
+   }
+
+   /* First pass: upload raw section data and lay out private LDS symbols. */
+   for (unsigned i = 0; i < u->binary->num_parts; ++i) {
+      struct ac_rtld_part *part = &u->binary->parts[i];
+
+      Elf_Scn *section = NULL;
+      while ((section = elf_nextscn(part->elf, section))) {
+         Elf64_Shdr *shdr = elf64_getshdr(section);
+         struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
+
+         if (!s->is_rx)
+            continue;
+
+         report_if(shdr->sh_type != SHT_PROGBITS);
+
+         Elf_Data *data = elf_getdata(section, NULL);
+         report_elf_if(!data || data->d_size != shdr->sh_size);
+         memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size);
+      }
+   }
+
+   if (u->binary->rx_end_markers) {
+      uint32_t *dst = (uint32_t *)(u->rx_ptr + u->binary->rx_end_markers);
+      for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; ++i)
+         *dst++ = util_cpu_to_le32(DEBUGGER_END_OF_CODE_MARKER);
+   }
+
+   /* Second pass: handle relocations, overwriting uploaded data where
+    * appropriate. */
+   for (unsigned i = 0; i < u->binary->num_parts; ++i) {
+      struct ac_rtld_part *part = &u->binary->parts[i];
+      Elf_Scn *section = NULL;
+      while ((section = elf_nextscn(part->elf, section))) {
+         Elf64_Shdr *shdr = elf64_getshdr(section);
+         if (shdr->sh_type == SHT_REL) {
+            Elf_Data *relocs = elf_getdata(section, NULL);
+            report_elf_if(!relocs || relocs->d_size != shdr->sh_size);
+            if (!apply_relocs(u, i, shdr, relocs))
+               return false;
+         } else if (shdr->sh_type == SHT_RELA) {
+            report_errorf("SHT_RELA not supported");
+            return false;
+         }
+      }
+   }
+
+   return true;
 
 #undef report_if
 #undef report_elf_if
index 2470a5243f1ea4b5bd077d2ffede7d28fc0b1e89..af03a857f5e6ccb150ce6c9ea932293618fa86b9 100644 (file)
 #ifndef AC_RTLD_H
 #define AC_RTLD_H
 
+#include "compiler/shader_enums.h"
+#include "util/u_dynarray.h"
+
 #include <stdbool.h>
-#include <stdint.h>
 #include <stddef.h>
-
-#include "util/u_dynarray.h"
-#include "compiler/shader_enums.h"
+#include <stdint.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -40,37 +40,37 @@ struct ac_shader_config;
 struct radeon_info;
 
 struct ac_rtld_symbol {
-       const char *name;
-       uint32_t size;
-       uint32_t align;
-       uint64_t offset; /* filled in by ac_rtld_open */
-       unsigned part_idx; /* shader part in which this symbol appears */
+   const char *name;  /* symbol name, e.g. "__lds_end" */
+   uint32_t size;     /* presumably in bytes; 0 for __lds_end -- TODO confirm unit */
+   uint32_t align;    /* alignment wanted for offset */
+   uint64_t offset;   /* filled in by ac_rtld_open */
+   unsigned part_idx; /* shader part in which this symbol appears; ~0u for __lds_end */
 };
 
 struct ac_rtld_options {
-       /* Loader will insert an s_sethalt 1 instruction as the
-        * first instruction. */
-       bool halt_at_entry:1;
+   /* Loader will insert an s_sethalt 1 instruction as the
+    * first instruction (ac_rtld_upload writes it at the start of the rx buffer). */
+   bool halt_at_entry : 1;
 };
 
 /* Lightweight wrapper around underlying ELF objects. */
 struct ac_rtld_binary {
-       struct ac_rtld_options options;
-       unsigned wave_size;
+   struct ac_rtld_options options;
+   unsigned wave_size; /* handed to ac_parse_shader_binary_config by ac_rtld_read_config */
 
-       /* Required buffer sizes, currently read/executable only. */
-       uint64_t rx_size;
+   /* Required buffer sizes, currently read/executable only. */
+   uint64_t rx_size;
 
-       /* Size of executable code, for reporting purposes. */
-       uint64_t exec_size;
+   /* Size of executable code, for reporting purposes. */
+   uint64_t exec_size;
 
-       uint64_t rx_end_markers;
+   uint64_t rx_end_markers; /* rx offset of the end-of-code markers; ac_rtld_upload skips 0 */
 
-       unsigned num_parts;
-       struct ac_rtld_part *parts;
+   unsigned num_parts;
+   struct ac_rtld_part *parts; /* num_parts entries; freed by ac_rtld_close */
 
-       struct util_dynarray lds_symbols;
-       uint32_t lds_size;
+   struct util_dynarray lds_symbols; /* elements are struct ac_rtld_symbol */
+   uint32_t lds_size;
 };
 
 /**
@@ -82,8 +82,7 @@ struct ac_rtld_binary {
  * \param value to be filled in by the callback
  * \return whether the symbol was found successfully
  */
-typedef bool (*ac_rtld_get_external_symbol_cb)(
-       void *cb_data, const char *symbol, uint64_t *value);
+typedef bool (*ac_rtld_get_external_symbol_cb)(void *cb_data, const char *symbol, uint64_t *value);
 
 /**
  * Lifetimes of \ref info, in-memory ELF objects, and the names of
@@ -91,50 +90,48 @@ typedef bool (*ac_rtld_get_external_symbol_cb)(
  * the opened binary.
  */
 struct ac_rtld_open_info {
-       const struct radeon_info *info;
-       struct ac_rtld_options options;
-       gl_shader_stage shader_type;
-       unsigned wave_size;
-
-       unsigned num_parts;
-       const char * const *elf_ptrs; /* in-memory ELF objects of each part */
-       const size_t *elf_sizes; /* sizes of corresponding in-memory ELF objects in bytes */
-
-       /* Shared LDS symbols are layouted such that they are accessible from
-        * all shader parts. Non-shared (private) LDS symbols of one part may
-        * overlap private LDS symbols of another shader part.
-        */
-       unsigned num_shared_lds_symbols;
-       const struct ac_rtld_symbol *shared_lds_symbols;
+   const struct radeon_info *info;
+   struct ac_rtld_options options;
+   gl_shader_stage shader_type;
+   unsigned wave_size;
+
+   unsigned num_parts;
+   const char *const *elf_ptrs; /* in-memory ELF objects of each part */
+   const size_t *elf_sizes;     /* sizes of corresponding in-memory ELF objects in bytes */
+
+   /* Shared LDS symbols are laid out such that they are accessible from
+    * all shader parts. Non-shared (private) LDS symbols of one part may
+    * overlap private LDS symbols of another shader part.
+    */
+   unsigned num_shared_lds_symbols;
+   const struct ac_rtld_symbol *shared_lds_symbols;
 };
 
-bool ac_rtld_open(struct ac_rtld_binary *binary,
-                 struct ac_rtld_open_info i);
+bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i);
 
 void ac_rtld_close(struct ac_rtld_binary *binary);
 
-bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name,
-                                const char **data, size_t *nbytes);
+bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name, const char **data,
+                                 size_t *nbytes);
 
-bool ac_rtld_read_config(const struct radeon_info *info,
-                        struct ac_rtld_binary *binary,
-                        struct ac_shader_config *config);
+bool ac_rtld_read_config(const struct radeon_info *info, struct ac_rtld_binary *binary,
+                         struct ac_shader_config *config);
 
 struct ac_rtld_upload_info {
-       struct ac_rtld_binary *binary;
+   struct ac_rtld_binary *binary;
 
-       /** GPU mapping of the read/executable buffer. */
-       uint64_t rx_va;
+   /** GPU mapping of the read/executable buffer. */
+   uint64_t rx_va;
 
-       /** CPU mapping of the read/executable buffer */
-       char *rx_ptr;
+   /** CPU mapping of the read/executable buffer */
+   char *rx_ptr;
 
-       /** Optional callback function that will be queried for symbols not
-        * defined in any of the binary's parts. */
-       ac_rtld_get_external_symbol_cb get_external_symbol;
+   /** Optional callback function that will be queried for symbols not
+    * defined in any of the binary's parts. */
+   ac_rtld_get_external_symbol_cb get_external_symbol;
 
-       /** Caller-defined data that will be passed to callback functions. */
-       void *cb_data;
+   /** Caller-defined data that will be passed to callback functions. */
+   void *cb_data;
 };
 
 bool ac_rtld_upload(struct ac_rtld_upload_info *u);
index d5600eaca4098954ce577f84069e7146799dd534..d3816e1c0fc0fb2af6f8421760917db2fdcfa6a8 100644 (file)
  */
 
 #include "ac_shader_args.h"
+
 #include "nir/nir_builder.h"
 
-void
-ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile,
-          unsigned size, enum ac_arg_type type, struct ac_arg *arg)
+void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, unsigned size,
+                enum ac_arg_type type, struct ac_arg *arg)
 {
-       assert(info->arg_count < AC_MAX_ARGS);
+   assert(info->arg_count < AC_MAX_ARGS);
 
-       unsigned offset;
-       if (regfile == AC_ARG_SGPR) {
-               offset = info->num_sgprs_used;
-               info->num_sgprs_used += size;
-       } else {
-               assert(regfile == AC_ARG_VGPR);
-               offset = info->num_vgprs_used;
-               info->num_vgprs_used += size;
-       }
+   unsigned offset;
+   if (regfile == AC_ARG_SGPR) {
+      offset = info->num_sgprs_used;
+      info->num_sgprs_used += size;
+   } else {
+      assert(regfile == AC_ARG_VGPR);
+      offset = info->num_vgprs_used;
+      info->num_vgprs_used += size;
+   }
 
-       info->args[info->arg_count].file = regfile;
-       info->args[info->arg_count].offset = offset;
-       info->args[info->arg_count].size = size;
-       info->args[info->arg_count].type = type;
+   info->args[info->arg_count].file = regfile;
+   info->args[info->arg_count].offset = offset;
+   info->args[info->arg_count].size = size;
+   info->args[info->arg_count].type = type;
 
-       if (arg) {
-               arg->arg_index = info->arg_count;
-               arg->used = true;
-       }
+   if (arg) {
+      arg->arg_index = info->arg_count;
+      arg->used = true;
+   }
 
-       info->arg_count++;
+   info->arg_count++;
 }
-
index 90798c6eabd835b5047fdfe92c4ec6824308bb9e..c3f4042d3ec3d019377df53989988520fcf179c2 100644 (file)
 #ifndef AC_SHADER_ARGS_H
 #define AC_SHADER_ARGS_H
 
-#include <stdint.h>
 #include <stdbool.h>
+#include <stdint.h>
 
 #define AC_MAX_INLINE_PUSH_CONSTS 8
 
-enum ac_arg_regfile {
-       AC_ARG_SGPR,
-       AC_ARG_VGPR,
+enum ac_arg_regfile
+{
+   AC_ARG_SGPR,
+   AC_ARG_VGPR,
 };
 
-enum ac_arg_type {
-       AC_ARG_FLOAT,
-       AC_ARG_INT,
-       AC_ARG_CONST_PTR, /* Pointer to i8 array */
-       AC_ARG_CONST_FLOAT_PTR, /* Pointer to f32 array */
-       AC_ARG_CONST_PTR_PTR, /* Pointer to pointer to i8 array */
-       AC_ARG_CONST_DESC_PTR, /* Pointer to v4i32 array */
-       AC_ARG_CONST_IMAGE_PTR, /* Pointer to v8i32 array */
+enum ac_arg_type
+{
+   AC_ARG_FLOAT,
+   AC_ARG_INT,
+   AC_ARG_CONST_PTR,       /* Pointer to i8 array */
+   AC_ARG_CONST_FLOAT_PTR, /* Pointer to f32 array */
+   AC_ARG_CONST_PTR_PTR,   /* Pointer to pointer to i8 array */
+   AC_ARG_CONST_DESC_PTR,  /* Pointer to v4i32 array */
+   AC_ARG_CONST_IMAGE_PTR, /* Pointer to v8i32 array */
 };
 
 struct ac_arg {
-       uint8_t arg_index;
-       bool used;
+   uint8_t arg_index;
+   bool used;
 };
 
-
 #define AC_MAX_ARGS 128
 
 struct ac_shader_args {
-       /* Info on how to declare arguments */
-       struct {
-               enum ac_arg_type type;
-               enum ac_arg_regfile file;
-               uint8_t offset;
-               uint8_t size;
-               bool skip;
-       } args[AC_MAX_ARGS];
-
-       uint8_t arg_count;
-       uint8_t sgpr_count;
-       uint8_t num_sgprs_used;
-       uint8_t num_vgprs_used;
-
-       struct ac_arg base_vertex;
-       struct ac_arg start_instance;
-       struct ac_arg draw_id;
-       struct ac_arg vertex_id;
-       struct ac_arg instance_id;
-       struct ac_arg tcs_patch_id;
-       struct ac_arg tcs_rel_ids;
-       struct ac_arg tes_patch_id;
-       struct ac_arg gs_prim_id;
-       struct ac_arg gs_invocation_id;
-
-       /* PS */
-       struct ac_arg frag_pos[4];
-       struct ac_arg front_face;
-       struct ac_arg ancillary;
-       struct ac_arg sample_coverage;
-       struct ac_arg prim_mask;
-       struct ac_arg persp_sample;
-       struct ac_arg persp_center;
-       struct ac_arg persp_centroid;
-       struct ac_arg pull_model;
-       struct ac_arg linear_sample;
-       struct ac_arg linear_center;
-       struct ac_arg linear_centroid;
-
-       /* CS */
-       struct ac_arg local_invocation_ids;
-       struct ac_arg num_work_groups;
-       struct ac_arg workgroup_ids[3];
-       struct ac_arg tg_size;
-
-       /* Vulkan only */
-       struct ac_arg push_constants;
-       struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
-       unsigned num_inline_push_consts;
-       unsigned base_inline_push_consts;
-       struct ac_arg view_index;
+   /* Info on how to declare arguments */
+   struct {
+      enum ac_arg_type type;
+      enum ac_arg_regfile file;
+      uint8_t offset;
+      uint8_t size;
+      bool skip;
+   } args[AC_MAX_ARGS];
+
+   uint8_t arg_count;
+   uint8_t sgpr_count;
+   uint8_t num_sgprs_used;
+   uint8_t num_vgprs_used;
+
+   struct ac_arg base_vertex;
+   struct ac_arg start_instance;
+   struct ac_arg draw_id;
+   struct ac_arg vertex_id;
+   struct ac_arg instance_id;
+   struct ac_arg tcs_patch_id;
+   struct ac_arg tcs_rel_ids;
+   struct ac_arg tes_patch_id;
+   struct ac_arg gs_prim_id;
+   struct ac_arg gs_invocation_id;
+
+   /* PS */
+   struct ac_arg frag_pos[4];
+   struct ac_arg front_face;
+   struct ac_arg ancillary;
+   struct ac_arg sample_coverage;
+   struct ac_arg prim_mask;
+   struct ac_arg persp_sample;
+   struct ac_arg persp_center;
+   struct ac_arg persp_centroid;
+   struct ac_arg pull_model;
+   struct ac_arg linear_sample;
+   struct ac_arg linear_center;
+   struct ac_arg linear_centroid;
+
+   /* CS */
+   struct ac_arg local_invocation_ids;
+   struct ac_arg num_work_groups;
+   struct ac_arg workgroup_ids[3];
+   struct ac_arg tg_size;
+
+   /* Vulkan only */
+   struct ac_arg push_constants;
+   struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
+   unsigned num_inline_push_consts;
+   unsigned base_inline_push_consts;
+   struct ac_arg view_index;
 };
 
-void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile,
-               unsigned registers, enum ac_arg_type type,
-               struct ac_arg *arg);
+void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, unsigned registers,
+                enum ac_arg_type type, struct ac_arg *arg);
 
 #endif
-
index d4ccf38d803f2d6d6af5f468f73d13e620f6540d..a57b5cac50523a103178cfd97a9b4a1d870a4a6d 100644 (file)
  * IN THE SOFTWARE.
  */
 
+#include "ac_shader_util.h"
+
+#include "sid.h"
+
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "ac_shader_util.h"
-#include "sid.h"
-
-unsigned
-ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
-                          bool writes_samplemask)
+unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask)
 {
-       if (writes_z) {
-               /* Z needs 32 bits. */
-               if (writes_samplemask)
-                       return V_028710_SPI_SHADER_32_ABGR;
-               else if (writes_stencil)
-                       return V_028710_SPI_SHADER_32_GR;
-               else
-                       return V_028710_SPI_SHADER_32_R;
-       } else if (writes_stencil || writes_samplemask) {
-               /* Both stencil and sample mask need only 16 bits. */
-               return V_028710_SPI_SHADER_UINT16_ABGR;
-       } else {
-               return V_028710_SPI_SHADER_ZERO;
-       }
+   if (writes_z) {
+      /* Z needs 32 bits. */
+      if (writes_samplemask)
+         return V_028710_SPI_SHADER_32_ABGR;
+      else if (writes_stencil)
+         return V_028710_SPI_SHADER_32_GR;
+      else
+         return V_028710_SPI_SHADER_32_R;
+   } else if (writes_stencil || writes_samplemask) {
+      /* Both stencil and sample mask need only 16 bits. */
+      return V_028710_SPI_SHADER_UINT16_ABGR;
+   } else {
+      return V_028710_SPI_SHADER_ZERO;
+   }
 }
 
-unsigned
-ac_get_cb_shader_mask(unsigned spi_shader_col_format)
+unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format)
 {
-       unsigned i, cb_shader_mask = 0;
-
-       for (i = 0; i < 8; i++) {
-               switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
-               case V_028714_SPI_SHADER_ZERO:
-                       break;
-               case V_028714_SPI_SHADER_32_R:
-                       cb_shader_mask |= 0x1 << (i * 4);
-                       break;
-               case V_028714_SPI_SHADER_32_GR:
-                       cb_shader_mask |= 0x3 << (i * 4);
-                       break;
-               case V_028714_SPI_SHADER_32_AR:
-                       cb_shader_mask |= 0x9u << (i * 4);
-                       break;
-               case V_028714_SPI_SHADER_FP16_ABGR:
-               case V_028714_SPI_SHADER_UNORM16_ABGR:
-               case V_028714_SPI_SHADER_SNORM16_ABGR:
-               case V_028714_SPI_SHADER_UINT16_ABGR:
-               case V_028714_SPI_SHADER_SINT16_ABGR:
-               case V_028714_SPI_SHADER_32_ABGR:
-                       cb_shader_mask |= 0xfu << (i * 4);
-                       break;
-               default:
-                       assert(0);
-               }
-       }
-       return cb_shader_mask;
+   unsigned i, cb_shader_mask = 0;
+
+   for (i = 0; i < 8; i++) {
+      switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
+      case V_028714_SPI_SHADER_ZERO:
+         break;
+      case V_028714_SPI_SHADER_32_R:
+         cb_shader_mask |= 0x1 << (i * 4);
+         break;
+      case V_028714_SPI_SHADER_32_GR:
+         cb_shader_mask |= 0x3 << (i * 4);
+         break;
+      case V_028714_SPI_SHADER_32_AR:
+         cb_shader_mask |= 0x9u << (i * 4);
+         break;
+      case V_028714_SPI_SHADER_FP16_ABGR:
+      case V_028714_SPI_SHADER_UNORM16_ABGR:
+      case V_028714_SPI_SHADER_SNORM16_ABGR:
+      case V_028714_SPI_SHADER_UINT16_ABGR:
+      case V_028714_SPI_SHADER_SINT16_ABGR:
+      case V_028714_SPI_SHADER_32_ABGR:
+         cb_shader_mask |= 0xfu << (i * 4);
+         break;
+      default:
+         assert(0);
+      }
+   }
+   return cb_shader_mask;
 }
 
 /**
  * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a
  * geometry shader.
  */
-uint32_t
-ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class)
+uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class)
 {
-       unsigned cut_mode;
-
-       if (gs_max_vert_out <= 128) {
-               cut_mode = V_028A40_GS_CUT_128;
-       } else if (gs_max_vert_out <= 256) {
-               cut_mode = V_028A40_GS_CUT_256;
-       } else if (gs_max_vert_out <= 512) {
-               cut_mode = V_028A40_GS_CUT_512;
-       } else {
-               assert(gs_max_vert_out <= 1024);
-               cut_mode = V_028A40_GS_CUT_1024;
-       }
-
-       return S_028A40_MODE(V_028A40_GS_SCENARIO_G) |
-              S_028A40_CUT_MODE(cut_mode)|
-              S_028A40_ES_WRITE_OPTIMIZE(chip_class <= GFX8) |
-              S_028A40_GS_WRITE_OPTIMIZE(1) |
-              S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0);
+   unsigned cut_mode;
+
+   if (gs_max_vert_out <= 128) {
+      cut_mode = V_028A40_GS_CUT_128;
+   } else if (gs_max_vert_out <= 256) {
+      cut_mode = V_028A40_GS_CUT_256;
+   } else if (gs_max_vert_out <= 512) {
+      cut_mode = V_028A40_GS_CUT_512;
+   } else {
+      assert(gs_max_vert_out <= 1024);
+      cut_mode = V_028A40_GS_CUT_1024;
+   }
+
+   return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | S_028A40_CUT_MODE(cut_mode) |
+          S_028A40_ES_WRITE_OPTIMIZE(chip_class <= GFX8) | S_028A40_GS_WRITE_OPTIMIZE(1) |
+          S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0);
 }
 
 /// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
 /// value for LLVM8+ tbuffer intrinsics.
-unsigned
-ac_get_tbuffer_format(enum chip_class chip_class,
-                     unsigned dfmt, unsigned nfmt)
+unsigned ac_get_tbuffer_format(enum chip_class chip_class, unsigned dfmt, unsigned nfmt)
 {
-       // Some games try to access vertex buffers without a valid format.
-       // This is a game bug, but we should still handle it gracefully.
-       if (dfmt == V_008F0C_IMG_FORMAT_INVALID)
-               return V_008F0C_IMG_FORMAT_INVALID;
-
-       if (chip_class >= GFX10) {
-               unsigned format;
-               switch (dfmt) {
-               default: unreachable("bad dfmt");
-               case V_008F0C_BUF_DATA_FORMAT_INVALID: format = V_008F0C_IMG_FORMAT_INVALID; break;
-               case V_008F0C_BUF_DATA_FORMAT_8: format = V_008F0C_IMG_FORMAT_8_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_8_8: format = V_008F0C_IMG_FORMAT_8_8_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_16: format = V_008F0C_IMG_FORMAT_16_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_16_16: format = V_008F0C_IMG_FORMAT_16_16_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_32: format = V_008F0C_IMG_FORMAT_32_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_32_32: format = V_008F0C_IMG_FORMAT_32_32_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT; break;
-               }
-
-               // Use the regularity properties of the combined format enum.
-               //
-               // Note: float is incompatible with 8-bit data formats,
-               //       [us]{norm,scaled} are incomparible with 32-bit data formats.
-               //       [us]scaled are not writable.
-               switch (nfmt) {
-               case V_008F0C_BUF_NUM_FORMAT_UNORM: format -= 4; break;
-               case V_008F0C_BUF_NUM_FORMAT_SNORM: format -= 3; break;
-               case V_008F0C_BUF_NUM_FORMAT_USCALED: format -= 2; break;
-               case V_008F0C_BUF_NUM_FORMAT_SSCALED: format -= 1; break;
-               default: unreachable("bad nfmt");
-               case V_008F0C_BUF_NUM_FORMAT_UINT: break;
-               case V_008F0C_BUF_NUM_FORMAT_SINT: format += 1; break;
-               case V_008F0C_BUF_NUM_FORMAT_FLOAT: format += 2; break;
-               }
-
-               return format;
-       } else {
-               return dfmt | (nfmt << 4);
-       }
+   // Some games try to access vertex buffers without a valid format.
+   // This is a game bug, but we should still handle it gracefully.
+   if (dfmt == V_008F0C_IMG_FORMAT_INVALID)
+      return V_008F0C_IMG_FORMAT_INVALID;
+
+   if (chip_class >= GFX10) {
+      unsigned format;
+      switch (dfmt) {
+      default:
+         unreachable("bad dfmt");
+      case V_008F0C_BUF_DATA_FORMAT_INVALID:
+         format = V_008F0C_IMG_FORMAT_INVALID;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_8:
+         format = V_008F0C_IMG_FORMAT_8_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_8_8:
+         format = V_008F0C_IMG_FORMAT_8_8_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
+         format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_16:
+         format = V_008F0C_IMG_FORMAT_16_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_16_16:
+         format = V_008F0C_IMG_FORMAT_16_16_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
+         format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_32:
+         format = V_008F0C_IMG_FORMAT_32_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_32_32:
+         format = V_008F0C_IMG_FORMAT_32_32_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_32_32_32:
+         format = V_008F0C_IMG_FORMAT_32_32_32_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
+         format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
+         format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT;
+         break;
+      }
+
+      // Use the regularity properties of the combined format enum.
+      //
+      // Note: float is incompatible with 8-bit data formats,
+      //       [us]{norm,scaled} are incompatible with 32-bit data formats.
+      //       [us]scaled are not writable.
+      switch (nfmt) {
+      case V_008F0C_BUF_NUM_FORMAT_UNORM:
+         format -= 4;
+         break;
+      case V_008F0C_BUF_NUM_FORMAT_SNORM:
+         format -= 3;
+         break;
+      case V_008F0C_BUF_NUM_FORMAT_USCALED:
+         format -= 2;
+         break;
+      case V_008F0C_BUF_NUM_FORMAT_SSCALED:
+         format -= 1;
+         break;
+      default:
+         unreachable("bad nfmt");
+      case V_008F0C_BUF_NUM_FORMAT_UINT:
+         break;
+      case V_008F0C_BUF_NUM_FORMAT_SINT:
+         format += 1;
+         break;
+      case V_008F0C_BUF_NUM_FORMAT_FLOAT:
+         format += 2;
+         break;
+      }
+
+      return format;
+   } else {
+      return dfmt | (nfmt << 4);
+   }
 }
 
 static const struct ac_data_format_info data_format_table[] = {
-       [V_008F0C_BUF_DATA_FORMAT_INVALID]     = {  0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID    },
-       [V_008F0C_BUF_DATA_FORMAT_8]           = {  1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8          },
-       [V_008F0C_BUF_DATA_FORMAT_16]          = {  2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16         },
-       [V_008F0C_BUF_DATA_FORMAT_8_8]         = {  2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8          },
-       [V_008F0C_BUF_DATA_FORMAT_32]          = {  4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32         },
-       [V_008F0C_BUF_DATA_FORMAT_16_16]       = {  4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16         },
-       [V_008F0C_BUF_DATA_FORMAT_10_11_11]    = {  4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11   },
-       [V_008F0C_BUF_DATA_FORMAT_11_11_10]    = {  4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10   },
-       [V_008F0C_BUF_DATA_FORMAT_10_10_10_2]  = {  4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2 },
-       [V_008F0C_BUF_DATA_FORMAT_2_10_10_10]  = {  4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10 },
-       [V_008F0C_BUF_DATA_FORMAT_8_8_8_8]     = {  4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8          },
-       [V_008F0C_BUF_DATA_FORMAT_32_32]       = {  8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32         },
-       [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {  8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16         },
-       [V_008F0C_BUF_DATA_FORMAT_32_32_32]    = { 12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32         },
-       [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = { 16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32         },
+   [V_008F0C_BUF_DATA_FORMAT_INVALID] = {0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID},
+   [V_008F0C_BUF_DATA_FORMAT_8] = {1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8},
+   [V_008F0C_BUF_DATA_FORMAT_16] = {2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16},
+   [V_008F0C_BUF_DATA_FORMAT_8_8] = {2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8},
+   [V_008F0C_BUF_DATA_FORMAT_32] = {4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32},
+   [V_008F0C_BUF_DATA_FORMAT_16_16] = {4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16},
+   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11},
+   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10},
+   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2},
+   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10},
+   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = {4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8},
+   [V_008F0C_BUF_DATA_FORMAT_32_32] = {8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32},
+   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16},
+   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = {12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32},
+   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = {16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32},
 };
 
-const struct ac_data_format_info *
-ac_get_data_format_info(unsigned dfmt)
+const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt)
 {
-       assert(dfmt < ARRAY_SIZE(data_format_table));
-       return &data_format_table[dfmt];
+   assert(dfmt < ARRAY_SIZE(data_format_table));
+   return &data_format_table[dfmt];
 }
 
-enum ac_image_dim
-ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
-                  bool is_array)
+enum ac_image_dim ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
+                                     bool is_array)
 {
-       switch (dim) {
-       case GLSL_SAMPLER_DIM_1D:
-               if (chip_class == GFX9)
-                       return is_array ? ac_image_2darray : ac_image_2d;
-               return is_array ? ac_image_1darray : ac_image_1d;
-       case GLSL_SAMPLER_DIM_2D:
-       case GLSL_SAMPLER_DIM_RECT:
-       case GLSL_SAMPLER_DIM_EXTERNAL:
-               return is_array ? ac_image_2darray : ac_image_2d;
-       case GLSL_SAMPLER_DIM_3D:
-               return ac_image_3d;
-       case GLSL_SAMPLER_DIM_CUBE:
-               return ac_image_cube;
-       case GLSL_SAMPLER_DIM_MS:
-               return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
-       case GLSL_SAMPLER_DIM_SUBPASS:
-               return ac_image_2darray;
-       case GLSL_SAMPLER_DIM_SUBPASS_MS:
-               return ac_image_2darraymsaa;
-       default:
-               unreachable("bad sampler dim");
-       }
+   switch (dim) {
+   case GLSL_SAMPLER_DIM_1D:
+      if (chip_class == GFX9)
+         return is_array ? ac_image_2darray : ac_image_2d;
+      return is_array ? ac_image_1darray : ac_image_1d;
+   case GLSL_SAMPLER_DIM_2D:
+   case GLSL_SAMPLER_DIM_RECT:
+   case GLSL_SAMPLER_DIM_EXTERNAL:
+      return is_array ? ac_image_2darray : ac_image_2d;
+   case GLSL_SAMPLER_DIM_3D:
+      return ac_image_3d;
+   case GLSL_SAMPLER_DIM_CUBE:
+      return ac_image_cube;
+   case GLSL_SAMPLER_DIM_MS:
+      return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
+   case GLSL_SAMPLER_DIM_SUBPASS:
+      return ac_image_2darray;
+   case GLSL_SAMPLER_DIM_SUBPASS_MS:
+      return ac_image_2darraymsaa;
+   default:
+      unreachable("bad sampler dim");
+   }
 }
 
-enum ac_image_dim
-ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
-                bool is_array)
+enum ac_image_dim ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
+                                   bool is_array)
 {
-       enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array);
-
-       /* Match the resource type set in the descriptor. */
-       if (dim == ac_image_cube ||
-           (chip_class <= GFX8 && dim == ac_image_3d))
-               dim = ac_image_2darray;
-       else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) {
-               /* When a single layer of a 3D texture is bound, the shader
-                * will refer to a 2D target, but the descriptor has a 3D type.
-                * Since the HW ignores BASE_ARRAY in this case, we need to
-                * send 3 coordinates. This doesn't hurt when the underlying
-                * texture is non-3D.
-                */
-               dim = ac_image_3d;
-       }
-
-       return dim;
+   enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array);
+
+   /* Match the resource type set in the descriptor. */
+   if (dim == ac_image_cube || (chip_class <= GFX8 && dim == ac_image_3d))
+      dim = ac_image_2darray;
+   else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) {
+      /* When a single layer of a 3D texture is bound, the shader
+       * will refer to a 2D target, but the descriptor has a 3D type.
+       * Since the HW ignores BASE_ARRAY in this case, we need to
+       * send 3 coordinates. This doesn't hurt when the underlying
+       * texture is non-3D.
+       */
+      dim = ac_image_3d;
+   }
+
+   return dim;
 }
 
-unsigned
-ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
-                        signed char *face_vgpr_index_ptr,
-                        signed char *ancillary_vgpr_index_ptr)
+unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
+                                  signed char *face_vgpr_index_ptr,
+                                  signed char *ancillary_vgpr_index_ptr)
 {
-       unsigned num_input_vgprs = 0;
-       signed char face_vgpr_index = -1;
-       signed char ancillary_vgpr_index = -1;
-
-       if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 3;
-       if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) {
-               face_vgpr_index = num_input_vgprs;
-               num_input_vgprs += 1;
-       }
-       if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) {
-               ancillary_vgpr_index = num_input_vgprs;
-               num_input_vgprs += 1;
-       }
-       if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-
-       if (face_vgpr_index_ptr)
-               *face_vgpr_index_ptr = face_vgpr_index;
-       if (ancillary_vgpr_index_ptr)
-               *ancillary_vgpr_index_ptr = ancillary_vgpr_index;
-
-       return num_input_vgprs;
+   unsigned num_input_vgprs = 0;
+   signed char face_vgpr_index = -1;
+   signed char ancillary_vgpr_index = -1;
+
+   if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 3;
+   if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) {
+      face_vgpr_index = num_input_vgprs;
+      num_input_vgprs += 1;
+   }
+   if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) {
+      ancillary_vgpr_index = num_input_vgprs;
+      num_input_vgprs += 1;
+   }
+   if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+
+   if (face_vgpr_index_ptr)
+      *face_vgpr_index_ptr = face_vgpr_index;
+   if (ancillary_vgpr_index_ptr)
+      *ancillary_vgpr_index_ptr = ancillary_vgpr_index;
+
+   return num_input_vgprs;
 }
 
-void ac_choose_spi_color_formats(unsigned format, unsigned swap,
-                                unsigned ntype, bool is_depth,
-                                struct ac_spi_color_formats *formats)
+void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype, bool is_depth,
+                                 struct ac_spi_color_formats *formats)
 {
    /* Alpha is needed for alpha-to-coverage.
     * Blending may be with or without alpha.
index 49e1eb2428f15710f437c1cce0d4a978bff97b74..c2a52337a3dfd7263b38b12ca5176d60557c1ad3 100644 (file)
 #ifndef AC_SHADER_UTIL_H
 #define AC_SHADER_UTIL_H
 
-#include <stdbool.h>
-#include <stdint.h>
-
-#include "amd_family.h"
 #include "ac_binary.h"
+#include "amd_family.h"
 #include "compiler/nir/nir.h"
 
+#include <stdbool.h>
+#include <stdint.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-enum ac_image_dim {
-       ac_image_1d,
-       ac_image_2d,
-       ac_image_3d,
-       ac_image_cube, // includes cube arrays
-       ac_image_1darray,
-       ac_image_2darray,
-       ac_image_2dmsaa,
-       ac_image_2darraymsaa,
+enum ac_image_dim
+{
+   ac_image_1d,
+   ac_image_2d,
+   ac_image_3d,
+   ac_image_cube, // includes cube arrays
+   ac_image_1darray,
+   ac_image_2darray,
+   ac_image_2dmsaa,
+   ac_image_2darraymsaa,
 };
 
 struct ac_data_format_info {
-       uint8_t element_size;
-       uint8_t num_channels;
-       uint8_t chan_byte_size;
-       uint8_t chan_format;
+   uint8_t element_size;
+   uint8_t num_channels;
+   uint8_t chan_byte_size;
+   uint8_t chan_format;
 };
 
 struct ac_spi_color_formats {
-       unsigned normal : 8;
-       unsigned alpha : 8;
-       unsigned blend : 8;
-       unsigned blend_alpha : 8;
+   unsigned normal : 8;
+   unsigned alpha : 8;
+   unsigned blend : 8;
+   unsigned blend_alpha : 8;
 };
 
-unsigned
-ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
-                          bool writes_samplemask);
+unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask);
 
-unsigned
-ac_get_cb_shader_mask(unsigned spi_shader_col_format);
+unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format);
 
-uint32_t
-ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class);
+uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class);
 
-unsigned
-ac_get_tbuffer_format(enum chip_class chip_class,
-                     unsigned dfmt, unsigned nfmt);
+unsigned ac_get_tbuffer_format(enum chip_class chip_class, unsigned dfmt, unsigned nfmt);
 
-const struct ac_data_format_info *
-ac_get_data_format_info(unsigned dfmt);
+const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt);
 
-enum ac_image_dim
-ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
-                  bool is_array);
+enum ac_image_dim ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
+                                     bool is_array);
 
-enum ac_image_dim
-ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
-                bool is_array);
+enum ac_image_dim ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
+                                   bool is_array);
 
-unsigned
-ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
-                        signed char *face_vgpr_index,
-                        signed char *ancillary_vgpr_index);
+unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
+                                  signed char *face_vgpr_index, signed char *ancillary_vgpr_index);
 
-void ac_choose_spi_color_formats(unsigned format, unsigned swap,
-                                unsigned ntype, bool is_depth,
-                                struct ac_spi_color_formats *formats);
+void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype, bool is_depth,
+                                 struct ac_spi_color_formats *formats);
 
 #ifdef __cplusplus
 }
index d08ccf0765f9e342dd61bce95cbc055c694f66ff..1ef2df5afb761d44f2fc0bd73e6eddefdcd3ad70 100644 (file)
  */
 
 #include "ac_shadowed_regs.h"
+
 #include "ac_debug.h"
 #include "sid.h"
 #include "util/macros.h"
 #include "util/u_debug.h"
+
 #include <stdio.h>
 
 static const struct ac_reg_range Gfx9UserConfigShadowRange[] = {
@@ -522,7 +524,8 @@ static const struct ac_reg_range Navi10NonShadowedRanges[] = {
       VGT_DMA_PRIMITIVE_TYPE,
       VGT_DMA_LS_HS_CONFIG - VGT_DMA_PRIMITIVE_TYPE + 4,
    },*/
-   /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be shadowed. */
+   /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be
+      shadowed. */
    {
       R_028A7C_VGT_DMA_INDEX_TYPE,
       4,
@@ -731,7 +734,8 @@ static const struct ac_reg_range Gfx103NonShadowedRanges[] = {
       VGT_DMA_PRIMITIVE_TYPE,
       VGT_DMA_LS_HS_CONFIG - VGT_DMA_PRIMITIVE_TYPE + 4,
    },*/
-   /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be shadowed. */
+   /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be
+      shadowed. */
    {
       R_028A7C_VGT_DMA_INDEX_TYPE,
       4,
@@ -816,7 +820,11 @@ void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family,
                        enum ac_reg_range_type type, unsigned *num_ranges,
                        const struct ac_reg_range **ranges)
 {
-#define RETURN(array) do { *ranges = array; *num_ranges = ARRAY_SIZE(array); } while (0)
+#define RETURN(array)                                                                              \
+   do {                                                                                            \
+      *ranges = array;                                                                             \
+      *num_ranges = ARRAY_SIZE(array);                                                             \
+   } while (0)
 
    *num_ranges = 0;
    *ranges = NULL;
@@ -841,8 +849,7 @@ void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family,
    case SI_REG_RANGE_SH:
       if (chip_class == GFX10_3 || chip_class == GFX10)
          RETURN(Gfx10ShShadowRange);
-      else if (family == CHIP_RAVEN2 ||
-               family == CHIP_RENOIR)
+      else if (family == CHIP_RAVEN2 || family == CHIP_RENOIR)
          RETURN(Gfx9ShShadowRangeRaven2);
       else if (chip_class == GFX9)
          RETURN(Gfx9ShShadowRange);
@@ -850,8 +857,7 @@ void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family,
    case SI_REG_RANGE_CS_SH:
       if (chip_class == GFX10_3 || chip_class == GFX10)
          RETURN(Gfx10CsShShadowRange);
-      else if (family == CHIP_RAVEN2 ||
-               family == CHIP_RENOIR)
+      else if (family == CHIP_RAVEN2 || family == CHIP_RENOIR)
          RETURN(Gfx9CsShShadowRangeRaven2);
       else if (chip_class == GFX9)
          RETURN(Gfx9CsShShadowRange);
@@ -876,68 +882,68 @@ static void gfx9_emulate_clear_state(struct radeon_cmdbuf *cs,
                                      set_context_reg_seq_array_fn set_context_reg_seq_array)
 {
    static const uint32_t DbRenderControlGfx9[] = {
-      0x0       , // DB_RENDER_CONTROL
-      0x0       , // DB_COUNT_CONTROL
-      0x0       , // DB_DEPTH_VIEW
-      0x0       , // DB_RENDER_OVERRIDE
-      0x0       , // DB_RENDER_OVERRIDE2
-      0x0       , // DB_HTILE_DATA_BASE
-      0x0       , // DB_HTILE_DATA_BASE_HI
-      0x0       , // DB_DEPTH_SIZE
-      0x0       , // DB_DEPTH_BOUNDS_MIN
-      0x0       , // DB_DEPTH_BOUNDS_MAX
-      0x0       , // DB_STENCIL_CLEAR
-      0x0       , // DB_DEPTH_CLEAR
-      0x0       , // PA_SC_SCREEN_SCISSOR_TL
+      0x0,        // DB_RENDER_CONTROL
+      0x0,        // DB_COUNT_CONTROL
+      0x0,        // DB_DEPTH_VIEW
+      0x0,        // DB_RENDER_OVERRIDE
+      0x0,        // DB_RENDER_OVERRIDE2
+      0x0,        // DB_HTILE_DATA_BASE
+      0x0,        // DB_HTILE_DATA_BASE_HI
+      0x0,        // DB_DEPTH_SIZE
+      0x0,        // DB_DEPTH_BOUNDS_MIN
+      0x0,        // DB_DEPTH_BOUNDS_MAX
+      0x0,        // DB_STENCIL_CLEAR
+      0x0,        // DB_DEPTH_CLEAR
+      0x0,        // PA_SC_SCREEN_SCISSOR_TL
       0x40004000, // PA_SC_SCREEN_SCISSOR_BR
-      0x0       , // DB_Z_INFO
-      0x0       , // DB_STENCIL_INFO
-      0x0       , // DB_Z_READ_BASE
-      0x0       , // DB_Z_READ_BASE_HI
-      0x0       , // DB_STENCIL_READ_BASE
-      0x0       , // DB_STENCIL_READ_BASE_HI
-      0x0       , // DB_Z_WRITE_BASE
-      0x0       , // DB_Z_WRITE_BASE_HI
-      0x0       , // DB_STENCIL_WRITE_BASE
-      0x0       , // DB_STENCIL_WRITE_BASE_HI
-      0x0       , // DB_DFSM_CONTROL
-      0x0       , //
-      0x0       , // DB_Z_INFO2
-      0x0       , // DB_STENCIL_INFO2
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // TA_BC_BASE_ADDR
+      0x0,        // DB_Z_INFO
+      0x0,        // DB_STENCIL_INFO
+      0x0,        // DB_Z_READ_BASE
+      0x0,        // DB_Z_READ_BASE_HI
+      0x0,        // DB_STENCIL_READ_BASE
+      0x0,        // DB_STENCIL_READ_BASE_HI
+      0x0,        // DB_Z_WRITE_BASE
+      0x0,        // DB_Z_WRITE_BASE_HI
+      0x0,        // DB_STENCIL_WRITE_BASE
+      0x0,        // DB_STENCIL_WRITE_BASE_HI
+      0x0,        // DB_DFSM_CONTROL
+      0x0,        //
+      0x0,        // DB_Z_INFO2
+      0x0,        // DB_STENCIL_INFO2
+      0x0,        //
+      0x0,        //
+      0x0,        //
+      0x0,        //
+      0x0,        // TA_BC_BASE_ADDR
       0x0         // TA_BC_BASE_ADDR_HI
    };
    static const uint32_t CoherDestBaseHi0Gfx9[] = {
-      0x0       , // COHER_DEST_BASE_HI_0
-      0x0       , // COHER_DEST_BASE_HI_1
-      0x0       , // COHER_DEST_BASE_HI_2
-      0x0       , // COHER_DEST_BASE_HI_3
-      0x0       , // COHER_DEST_BASE_2
-      0x0       , // COHER_DEST_BASE_3
-      0x0       , // PA_SC_WINDOW_OFFSET
+      0x0,        // COHER_DEST_BASE_HI_0
+      0x0,        // COHER_DEST_BASE_HI_1
+      0x0,        // COHER_DEST_BASE_HI_2
+      0x0,        // COHER_DEST_BASE_HI_3
+      0x0,        // COHER_DEST_BASE_2
+      0x0,        // COHER_DEST_BASE_3
+      0x0,        // PA_SC_WINDOW_OFFSET
       0x80000000, // PA_SC_WINDOW_SCISSOR_TL
       0x40004000, // PA_SC_WINDOW_SCISSOR_BR
-      0xffff    , // PA_SC_CLIPRECT_RULE
-      0x0       , // PA_SC_CLIPRECT_0_TL
+      0xffff,     // PA_SC_CLIPRECT_RULE
+      0x0,        // PA_SC_CLIPRECT_0_TL
       0x40004000, // PA_SC_CLIPRECT_0_BR
-      0x0       , // PA_SC_CLIPRECT_1_TL
+      0x0,        // PA_SC_CLIPRECT_1_TL
       0x40004000, // PA_SC_CLIPRECT_1_BR
-      0x0       , // PA_SC_CLIPRECT_2_TL
+      0x0,        // PA_SC_CLIPRECT_2_TL
       0x40004000, // PA_SC_CLIPRECT_2_BR
-      0x0       , // PA_SC_CLIPRECT_3_TL
+      0x0,        // PA_SC_CLIPRECT_3_TL
       0x40004000, // PA_SC_CLIPRECT_3_BR
       0xaa99aaaa, // PA_SC_EDGERULE
-      0x0       , // PA_SU_HARDWARE_SCREEN_OFFSET
+      0x0,        // PA_SU_HARDWARE_SCREEN_OFFSET
       0xffffffff, // CB_TARGET_MASK
       0xffffffff, // CB_SHADER_MASK
       0x80000000, // PA_SC_GENERIC_SCISSOR_TL
       0x40004000, // PA_SC_GENERIC_SCISSOR_BR
-      0x0       , // COHER_DEST_BASE_0
-      0x0       , // COHER_DEST_BASE_1
+      0x0,        // COHER_DEST_BASE_0
+      0x0,        // COHER_DEST_BASE_1
       0x80000000, // PA_SC_VPORT_SCISSOR_0_TL
       0x40004000, // PA_SC_VPORT_SCISSOR_0_BR
       0x80000000, // PA_SC_VPORT_SCISSOR_1_TL
@@ -970,529 +976,529 @@ static void gfx9_emulate_clear_state(struct radeon_cmdbuf *cs,
       0x40004000, // PA_SC_VPORT_SCISSOR_14_BR
       0x80000000, // PA_SC_VPORT_SCISSOR_15_TL
       0x40004000, // PA_SC_VPORT_SCISSOR_15_BR
-      0x0       , // PA_SC_VPORT_ZMIN_0
+      0x0,        // PA_SC_VPORT_ZMIN_0
       0x3f800000, // PA_SC_VPORT_ZMAX_0
-      0x0       , // PA_SC_VPORT_ZMIN_1
+      0x0,        // PA_SC_VPORT_ZMIN_1
       0x3f800000, // PA_SC_VPORT_ZMAX_1
-      0x0       , // PA_SC_VPORT_ZMIN_2
+      0x0,        // PA_SC_VPORT_ZMIN_2
       0x3f800000, // PA_SC_VPORT_ZMAX_2
-      0x0       , // PA_SC_VPORT_ZMIN_3
+      0x0,        // PA_SC_VPORT_ZMIN_3
       0x3f800000, // PA_SC_VPORT_ZMAX_3
-      0x0       , // PA_SC_VPORT_ZMIN_4
+      0x0,        // PA_SC_VPORT_ZMIN_4
       0x3f800000, // PA_SC_VPORT_ZMAX_4
-      0x0       , // PA_SC_VPORT_ZMIN_5
+      0x0,        // PA_SC_VPORT_ZMIN_5
       0x3f800000, // PA_SC_VPORT_ZMAX_5
-      0x0       , // PA_SC_VPORT_ZMIN_6
+      0x0,        // PA_SC_VPORT_ZMIN_6
       0x3f800000, // PA_SC_VPORT_ZMAX_6
-      0x0       , // PA_SC_VPORT_ZMIN_7
+      0x0,        // PA_SC_VPORT_ZMIN_7
       0x3f800000, // PA_SC_VPORT_ZMAX_7
-      0x0       , // PA_SC_VPORT_ZMIN_8
+      0x0,        // PA_SC_VPORT_ZMIN_8
       0x3f800000, // PA_SC_VPORT_ZMAX_8
-      0x0       , // PA_SC_VPORT_ZMIN_9
+      0x0,        // PA_SC_VPORT_ZMIN_9
       0x3f800000, // PA_SC_VPORT_ZMAX_9
-      0x0       , // PA_SC_VPORT_ZMIN_10
+      0x0,        // PA_SC_VPORT_ZMIN_10
       0x3f800000, // PA_SC_VPORT_ZMAX_10
-      0x0       , // PA_SC_VPORT_ZMIN_11
+      0x0,        // PA_SC_VPORT_ZMIN_11
       0x3f800000, // PA_SC_VPORT_ZMAX_11
-      0x0       , // PA_SC_VPORT_ZMIN_12
+      0x0,        // PA_SC_VPORT_ZMIN_12
       0x3f800000, // PA_SC_VPORT_ZMAX_12
-      0x0       , // PA_SC_VPORT_ZMIN_13
+      0x0,        // PA_SC_VPORT_ZMIN_13
       0x3f800000, // PA_SC_VPORT_ZMAX_13
-      0x0       , // PA_SC_VPORT_ZMIN_14
+      0x0,        // PA_SC_VPORT_ZMIN_14
       0x3f800000, // PA_SC_VPORT_ZMAX_14
-      0x0       , // PA_SC_VPORT_ZMIN_15
+      0x0,        // PA_SC_VPORT_ZMIN_15
       0x3f800000, // PA_SC_VPORT_ZMAX_15
-      0x0       , // PA_SC_RASTER_CONFIG
-      0x0       , // PA_SC_RASTER_CONFIG_1
-      0x0       , //
+      0x0,        // PA_SC_RASTER_CONFIG
+      0x0,        // PA_SC_RASTER_CONFIG_1
+      0x0,        //
       0x0         // PA_SC_TILE_STEERING_OVERRIDE
    };
    static const uint32_t VgtMultiPrimIbResetIndxGfx9[] = {
-      0x0         // VGT_MULTI_PRIM_IB_RESET_INDX
+      0x0 // VGT_MULTI_PRIM_IB_RESET_INDX
    };
    static const uint32_t CbBlendRedGfx9[] = {
-      0x0       , // CB_BLEND_RED
-      0x0       , // CB_BLEND_GREEN
-      0x0       , // CB_BLEND_BLUE
-      0x0       , // CB_BLEND_ALPHA
-      0x0       , // CB_DCC_CONTROL
-      0x0       , //
-      0x0       , // DB_STENCIL_CONTROL
-      0x1000000 , // DB_STENCILREFMASK
-      0x1000000 , // DB_STENCILREFMASK_BF
-      0x0       , //
-      0x0       , // PA_CL_VPORT_XSCALE
-      0x0       , // PA_CL_VPORT_XOFFSET
-      0x0       , // PA_CL_VPORT_YSCALE
-      0x0       , // PA_CL_VPORT_YOFFSET
-      0x0       , // PA_CL_VPORT_ZSCALE
-      0x0       , // PA_CL_VPORT_ZOFFSET
-      0x0       , // PA_CL_VPORT_XSCALE_1
-      0x0       , // PA_CL_VPORT_XOFFSET_1
-      0x0       , // PA_CL_VPORT_YSCALE_1
-      0x0       , // PA_CL_VPORT_YOFFSET_1
-      0x0       , // PA_CL_VPORT_ZSCALE_1
-      0x0       , // PA_CL_VPORT_ZOFFSET_1
-      0x0       , // PA_CL_VPORT_XSCALE_2
-      0x0       , // PA_CL_VPORT_XOFFSET_2
-      0x0       , // PA_CL_VPORT_YSCALE_2
-      0x0       , // PA_CL_VPORT_YOFFSET_2
-      0x0       , // PA_CL_VPORT_ZSCALE_2
-      0x0       , // PA_CL_VPORT_ZOFFSET_2
-      0x0       , // PA_CL_VPORT_XSCALE_3
-      0x0       , // PA_CL_VPORT_XOFFSET_3
-      0x0       , // PA_CL_VPORT_YSCALE_3
-      0x0       , // PA_CL_VPORT_YOFFSET_3
-      0x0       , // PA_CL_VPORT_ZSCALE_3
-      0x0       , // PA_CL_VPORT_ZOFFSET_3
-      0x0       , // PA_CL_VPORT_XSCALE_4
-      0x0       , // PA_CL_VPORT_XOFFSET_4
-      0x0       , // PA_CL_VPORT_YSCALE_4
-      0x0       , // PA_CL_VPORT_YOFFSET_4
-      0x0       , // PA_CL_VPORT_ZSCALE_4
-      0x0       , // PA_CL_VPORT_ZOFFSET_4
-      0x0       , // PA_CL_VPORT_XSCALE_5
-      0x0       , // PA_CL_VPORT_XOFFSET_5
-      0x0       , // PA_CL_VPORT_YSCALE_5
-      0x0       , // PA_CL_VPORT_YOFFSET_5
-      0x0       , // PA_CL_VPORT_ZSCALE_5
-      0x0       , // PA_CL_VPORT_ZOFFSET_5
-      0x0       , // PA_CL_VPORT_XSCALE_6
-      0x0       , // PA_CL_VPORT_XOFFSET_6
-      0x0       , // PA_CL_VPORT_YSCALE_6
-      0x0       , // PA_CL_VPORT_YOFFSET_6
-      0x0       , // PA_CL_VPORT_ZSCALE_6
-      0x0       , // PA_CL_VPORT_ZOFFSET_6
-      0x0       , // PA_CL_VPORT_XSCALE_7
-      0x0       , // PA_CL_VPORT_XOFFSET_7
-      0x0       , // PA_CL_VPORT_YSCALE_7
-      0x0       , // PA_CL_VPORT_YOFFSET_7
-      0x0       , // PA_CL_VPORT_ZSCALE_7
-      0x0       , // PA_CL_VPORT_ZOFFSET_7
-      0x0       , // PA_CL_VPORT_XSCALE_8
-      0x0       , // PA_CL_VPORT_XOFFSET_8
-      0x0       , // PA_CL_VPORT_YSCALE_8
-      0x0       , // PA_CL_VPORT_YOFFSET_8
-      0x0       , // PA_CL_VPORT_ZSCALE_8
-      0x0       , // PA_CL_VPORT_ZOFFSET_8
-      0x0       , // PA_CL_VPORT_XSCALE_9
-      0x0       , // PA_CL_VPORT_XOFFSET_9
-      0x0       , // PA_CL_VPORT_YSCALE_9
-      0x0       , // PA_CL_VPORT_YOFFSET_9
-      0x0       , // PA_CL_VPORT_ZSCALE_9
-      0x0       , // PA_CL_VPORT_ZOFFSET_9
-      0x0       , // PA_CL_VPORT_XSCALE_10
-      0x0       , // PA_CL_VPORT_XOFFSET_10
-      0x0       , // PA_CL_VPORT_YSCALE_10
-      0x0       , // PA_CL_VPORT_YOFFSET_10
-      0x0       , // PA_CL_VPORT_ZSCALE_10
-      0x0       , // PA_CL_VPORT_ZOFFSET_10
-      0x0       , // PA_CL_VPORT_XSCALE_11
-      0x0       , // PA_CL_VPORT_XOFFSET_11
-      0x0       , // PA_CL_VPORT_YSCALE_11
-      0x0       , // PA_CL_VPORT_YOFFSET_11
-      0x0       , // PA_CL_VPORT_ZSCALE_11
-      0x0       , // PA_CL_VPORT_ZOFFSET_11
-      0x0       , // PA_CL_VPORT_XSCALE_12
-      0x0       , // PA_CL_VPORT_XOFFSET_12
-      0x0       , // PA_CL_VPORT_YSCALE_12
-      0x0       , // PA_CL_VPORT_YOFFSET_12
-      0x0       , // PA_CL_VPORT_ZSCALE_12
-      0x0       , // PA_CL_VPORT_ZOFFSET_12
-      0x0       , // PA_CL_VPORT_XSCALE_13
-      0x0       , // PA_CL_VPORT_XOFFSET_13
-      0x0       , // PA_CL_VPORT_YSCALE_13
-      0x0       , // PA_CL_VPORT_YOFFSET_13
-      0x0       , // PA_CL_VPORT_ZSCALE_13
-      0x0       , // PA_CL_VPORT_ZOFFSET_13
-      0x0       , // PA_CL_VPORT_XSCALE_14
-      0x0       , // PA_CL_VPORT_XOFFSET_14
-      0x0       , // PA_CL_VPORT_YSCALE_14
-      0x0       , // PA_CL_VPORT_YOFFSET_14
-      0x0       , // PA_CL_VPORT_ZSCALE_14
-      0x0       , // PA_CL_VPORT_ZOFFSET_14
-      0x0       , // PA_CL_VPORT_XSCALE_15
-      0x0       , // PA_CL_VPORT_XOFFSET_15
-      0x0       , // PA_CL_VPORT_YSCALE_15
-      0x0       , // PA_CL_VPORT_YOFFSET_15
-      0x0       , // PA_CL_VPORT_ZSCALE_15
-      0x0       , // PA_CL_VPORT_ZOFFSET_15
-      0x0       , // PA_CL_UCP_0_X
-      0x0       , // PA_CL_UCP_0_Y
-      0x0       , // PA_CL_UCP_0_Z
-      0x0       , // PA_CL_UCP_0_W
-      0x0       , // PA_CL_UCP_1_X
-      0x0       , // PA_CL_UCP_1_Y
-      0x0       , // PA_CL_UCP_1_Z
-      0x0       , // PA_CL_UCP_1_W
-      0x0       , // PA_CL_UCP_2_X