aco: increase accuracy of SGPR limits

author Rhys Perry <pendingchaos02@gmail.com>

Fri, 13 Sep 2019 15:41:00 +0000 (16:41 +0100)

committer Rhys Perry <pendingchaos02@gmail.com>

Wed, 23 Oct 2019 18:11:21 +0000 (19:11 +0100)
author Rhys Perry <pendingchaos02@gmail.com>
Fri, 13 Sep 2019 15:41:00 +0000 (16:41 +0100)
committer Rhys Perry <pendingchaos02@gmail.com>
Wed, 23 Oct 2019 18:11:21 +0000 (19:11 +0100)
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp

index dce0894f4dc78a2ce5a5c99810a2ef517453c9ec..d7a193552ba7ad9df9af4f3e1f6e43e85e5bdc17 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -1254,9 +1254,25 @@ setup_isel_context(Program* program,
     program->chip_class = options->chip_class;
     program->family = options->family;
     program->wave_size = options->wave_size;
-   program->sgpr_limit = options->chip_class >= GFX8 ? 102 : 104;
-   if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND)
-      program->sgpr_limit = 94; /* workaround hardware bug */
+
+   if (options->chip_class >= GFX10) {
+      program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */
+      program->sgpr_alloc_granule = 127;
+      program->sgpr_limit = 106;
+   } else if (program->chip_class >= GFX8) {
+      program->physical_sgprs = 800;
+      program->sgpr_alloc_granule = 15;
+      program->sgpr_limit = 102;
+   } else {
+      program->physical_sgprs = 512;
+      program->sgpr_alloc_granule = 7;
+      if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND)
+         program->sgpr_limit = 94; /* workaround hardware bug */
+      else
+         program->sgpr_limit = 104;
+   }
+   /* TODO: we don't have to allocate VCC if we don't need it */
+   program->needs_vcc = true;
  
     for (unsigned i = 0; i < MAX_SETS; ++i)
        program->info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h

index 7551225f718b99d42fb2a9e3cc6b06bf23eaa023..3606b33402e562d4a350f9d22e9fe5a51569be14 100644 (file)
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1062,7 +1062,6 @@ class Program final {
  public:
     std::vector<Block> blocks;
     RegisterDemand max_reg_demand = RegisterDemand();
-   uint16_t sgpr_limit = 0;
     uint16_t num_waves = 0;
     ac_shader_config* config;
     struct radv_shader_info *info;
@@ -1076,6 +1075,13 @@ public:
  
     std::vector<uint8_t> constant_data;
  
+   uint16_t physical_sgprs;
+   uint16_t sgpr_alloc_granule; /* allocation granule minus one; the granule itself must be a power of two */
+   uint16_t sgpr_limit;
+   bool needs_vcc = false;
+   bool needs_xnack_mask = false;
+   bool needs_flat_scr = false;
+
     uint32_t allocateId()
     {
        assert(allocationID <= 16777215);
@@ -1154,6 +1160,15 @@ void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
  void aco_print_instr(Instruction *instr, FILE *output);
  void aco_print_program(Program *program, FILE *output);
  
+/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
+uint16_t get_extra_sgprs(Program *program);
+
+/* get the number of sgprs that must be allocated in order to address a given number of sgprs */
+uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
+
+/* return number of addressable SGPRs for max_waves */
+uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
+
  typedef struct {
     const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
     const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp

index f99e57c8b3a5ff0fc32765f809003edf6d60c8ba..3fe413256e75bbbdfbf8aa176ac1b490935be38e 100644 (file)
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -28,6 +28,7 @@
   */
  
  #include "aco_ir.h"
+#include "util/u_math.h"
  
  #include <set>
  #include <vector>
@@ -190,25 +191,62 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
  }
  } /* end namespace */
  
+uint16_t get_extra_sgprs(Program *program)
+{
+   if (program->chip_class >= GFX10) {
+      assert(!program->needs_flat_scr);
+      assert(!program->needs_xnack_mask);
+      return 2;
+   } else if (program->chip_class >= GFX8) {
+      if (program->needs_flat_scr)
+         return 6;
+      else if (program->needs_xnack_mask)
+         return 4;
+      else if (program->needs_vcc)
+         return 2;
+      else
+         return 0;
+   } else {
+      assert(!program->needs_xnack_mask);
+      if (program->needs_flat_scr)
+         return 4;
+      else if (program->needs_vcc)
+         return 2;
+      else
+         return 0;
+   }
+}
+
+uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs)
+{
+   assert(addressable_sgprs <= program->sgpr_limit);
+   uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
+   uint16_t granule = program->sgpr_alloc_granule + 1;
+   return align(std::max(sgprs, granule), granule);
+}
+
+uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves)
+{
+    uint16_t sgprs = program->physical_sgprs / max_waves & ~program->sgpr_alloc_granule;
+    sgprs -= get_extra_sgprs(program);
+    return std::min(sgprs, program->sgpr_limit);
+}
+
  void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
  {
     // TODO: also take shared mem into account
-   const int16_t total_sgpr_regs = program->chip_class >= GFX8 ? 800 : 512;
-   const int16_t max_addressible_sgpr = program->sgpr_limit;
-   /* VGPRs are allocated in chunks of 4 */
-   const int16_t rounded_vgpr_demand = std::max<int16_t>(4, (new_demand.vgpr + 3) & ~3);
-   /* SGPRs are allocated in chunks of 16 between 8 and 104. VCC occupies the last 2 registers */
-   const int16_t rounded_sgpr_demand = std::min(std::max<int16_t>(8, (new_demand.sgpr + 2 + 7) & ~7), max_addressible_sgpr);
+   const int16_t vgpr_alloc = std::max<int16_t>(4, (new_demand.vgpr + 3) & ~3);
     /* this won't compile, register pressure reduction necessary */
-   if (new_demand.vgpr > 256 || new_demand.sgpr > max_addressible_sgpr) {
+   if (new_demand.vgpr > 256 || new_demand.sgpr > program->sgpr_limit) {
        program->num_waves = 0;
        program->max_reg_demand = new_demand;
     } else {
-      program->num_waves = std::min<uint16_t>(10,
-                                              std::min<uint16_t>(256 / rounded_vgpr_demand,
-                                                                 total_sgpr_regs / rounded_sgpr_demand));
+      program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
+      program->num_waves = std::min<uint16_t>(program->num_waves, 256 / vgpr_alloc);
+      program->num_waves = std::min<uint16_t>(program->num_waves, 10);
  
-      program->max_reg_demand = {  int16_t((256 / program->num_waves) & ~3), std::min<int16_t>(((total_sgpr_regs / program->num_waves) & ~7) - 2, max_addressible_sgpr)};
+      program->max_reg_demand.vgpr = int16_t((256 / program->num_waves) & ~3);
+      program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
     }
  }
  
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp

index 47ea932f1157aec0559e801867b401ef08e8a23b..965fe15964a8bffd5aea62e540ce964bd3541034 100644 (file)
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -34,6 +34,7 @@
  
  #include "aco_ir.h"
  #include "sid.h"
+#include "util/u_math.h"
  
  namespace aco {
  namespace {
@@ -1914,12 +1915,11 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
     }
  
     /* num_gpr = rnd_up(max_used_gpr + 1) */
-   program->config->num_vgprs = (ctx.max_used_vgpr + 1 + 3) & ~3;
-   if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) {
-      assert(ctx.max_used_sgpr <= 93);
-      ctx.max_used_sgpr = 93; /* workaround hardware bug */
-   }
-   program->config->num_sgprs = (ctx.max_used_sgpr + 1 + 2 + 7) & ~7; /* + 2 sgprs for vcc */
+   program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
+   if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
+      program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
+   else
+      program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
  }
  
  }
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp

index 67264fcf14f0313584837c28b4d8aee97337dce3..1601545dcfe9b8d53cd783b65064e9697e35afa9 100644 (file)
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -806,9 +806,9 @@ void schedule_program(Program *program, live& live_vars)
     //TODO: this also increases window-size/max-moves? did I realize that at the time?
     ctx.num_waves = std::min<uint16_t>(program->num_waves, 5);
     assert(ctx.num_waves);
-   uint16_t total_sgpr_regs = program->chip_class >= GFX8 ? 800 : 512;
+   uint16_t total_sgpr_regs = program->physical_sgprs;
     uint16_t max_addressible_sgpr = program->sgpr_limit;
-   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min<int16_t>(((total_sgpr_regs / ctx.num_waves) & ~7) - 2, max_addressible_sgpr)};
+   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min<int16_t>(((total_sgpr_regs / ctx.num_waves) & ~program->sgpr_alloc_granule) - 2, max_addressible_sgpr)};
  
     for (Block& block : program->blocks)
        schedule_block(ctx, program, &block, live_vars);
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp

index 92a23bb355c7cc82c69ecc8045160199a95b279b..56167e36d6d4fbe279169a12483701e6d49960bd 100644 (file)
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -1568,8 +1568,6 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt
        return;
  
     /* else, we check if we can improve things a bit */
-   uint16_t total_sgpr_regs = options->chip_class >= GFX8 ? 800 : 512;
-   uint16_t max_addressible_sgpr = program->sgpr_limit;
  
     /* calculate target register demand */
     RegisterDemand max_reg_demand;
@@ -1577,14 +1575,14 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt
        max_reg_demand.update(block.register_demand);
     }
  
-   RegisterDemand target_pressure = {256, int16_t(max_addressible_sgpr)};
+   RegisterDemand target_pressure = {256, int16_t(program->sgpr_limit)};
     unsigned num_waves = 1;
-   int spills_to_vgpr = (max_reg_demand.sgpr - max_addressible_sgpr + 63) / 64;
+   int spills_to_vgpr = (max_reg_demand.sgpr - program->sgpr_limit + 63) / 64;
  
     /* test if it possible to increase occupancy with little spilling */
     for (unsigned num_waves_next = 2; num_waves_next <= 8; num_waves_next++) {
        RegisterDemand target_pressure_next = {int16_t((256 / num_waves_next) & ~3),
-                                             int16_t(std::min<uint16_t>(((total_sgpr_regs / num_waves_next) & ~7) - 2, max_addressible_sgpr))};
+                                             int16_t(get_addr_sgpr_from_waves(program, num_waves_next))};
  
        /* Currently no vgpr spilling supported.
         * Spill as many sgprs as necessary to not hinder occupancy */
author	Rhys Perry <pendingchaos02@gmail.com>
	Fri, 13 Sep 2019 15:41:00 +0000 (16:41 +0100)
committer	Rhys Perry <pendingchaos02@gmail.com>
	Wed, 23 Oct 2019 18:11:21 +0000 (19:11 +0100)
src/amd/compiler/aco_instruction_selection_setup.cpp		patch \| blob \| history
src/amd/compiler/aco_ir.h		patch \| blob \| history
src/amd/compiler/aco_live_var_analysis.cpp		patch \| blob \| history
src/amd/compiler/aco_register_allocation.cpp		patch \| blob \| history
src/amd/compiler/aco_scheduler.cpp		patch \| blob \| history
src/amd/compiler/aco_spill.cpp		patch \| blob \| history