aco: workaround Tonga/Iceland hardware bug
[mesa.git] / src / amd / compiler / aco_spill.cpp
index 0d6ea9271e4cb8f598e7defdc21577b95eadf732..276dcbd7c5b5ca363d01bd7c5d964acd48f0879f 100644 (file)
  */
 
 #include "aco_ir.h"
+#include "aco_builder.h"
+#include "sid.h"
+
 #include <map>
 #include <stack>
-#include "vulkan/radv_shader.h"
-
 
 /*
  * Implements the spilling algorithm on SSA-form from
@@ -157,6 +158,8 @@ void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set<uint32_t>&
          /* omit exec mask */
          if (op.isFixed() && op.physReg() == exec)
             continue;
+         if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())
+            continue;
          if (op.isTemp())
             next_uses[op.getTemp()] = {block_idx, idx};
       }
@@ -322,6 +325,8 @@ std::vector<std::map<Temp, uint32_t>> local_next_uses(spill_ctx& ctx, Block* blo
       for (const Operand& op : instr->operands) {
          if (op.isFixed() && op.physReg() == exec)
             continue;
+         if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())
+            continue;
          if (op.isTemp())
             next_uses[op.getTemp()] = idx;
       }
@@ -483,7 +488,7 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id
          for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
             if (pair.first.type() == RegType::vgpr &&
                 ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() &&
-                ctx.next_use_distances_end[pred_idx][pair.first].second > block_idx) {
+                ctx.next_use_distances_start[block_idx][pair.first].second > block_idx) {
                ctx.spills_entry[block_idx].insert(pair);
                spilled_registers.vgpr += pair.first.size();
             }
@@ -520,7 +525,7 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id
 
    /* keep variables spilled on all incoming paths */
    for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) {
-      std::vector<unsigned>& preds = pair.first.type() == RegType::vgpr ? block->logical_preds : block->linear_preds;
+      std::vector<unsigned>& preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
       /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload it.
        * Otherwise, if any predecessor reloads it, ensure it's reloaded on all other predecessors.
        * The idea is that it's better in practice to rematerialize redundantly than to create lots of phis. */
@@ -803,7 +808,7 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
    /* iterate all (other) spilled variables for which to spill at the predecessor */
    // TODO: would be better to have them sorted: first vgprs and first with longest distance
    for (std::pair<Temp, uint32_t> pair : ctx.spills_entry[block_idx]) {
-      std::vector<unsigned> preds = pair.first.type() == RegType::vgpr ? block->logical_preds : block->linear_preds;
+      std::vector<unsigned> preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
 
       for (unsigned pred_idx : preds) {
          /* variable is already spilled at predecessor */
@@ -896,7 +901,7 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
       /* skip spilled variables */
       if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end())
          continue;
-      std::vector<unsigned> preds = pair.first.type() == RegType::vgpr ? block->logical_preds : block->linear_preds;
+      std::vector<unsigned> preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
 
       /* variable is dead at predecessor, it must be from a phi */
       bool is_dead = false;
@@ -950,7 +955,7 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
 
       if (!is_same) {
          /* the variable was renamed differently in the predecessors: we have to create a phi */
-         aco_opcode opcode = pair.first.type() == RegType::vgpr ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
+         aco_opcode opcode = pair.first.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
          aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
          rename = {ctx.program->allocateId(), pair.first.regClass()};
          for (unsigned i = 0; i < phi->operands.size(); i++) {
@@ -1253,6 +1258,48 @@ void spill_block(spill_ctx& ctx, unsigned block_idx)
    ctx.loop_header.pop();
 }
 
+Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset,
+                           std::vector<aco_ptr<Instruction>>& instructions,
+                           unsigned offset, bool is_top_level)
+{
+   Builder bld(ctx.program);
+   if (is_top_level) {
+      bld.reset(&instructions);
+   } else {
+      /* find p_logical_end */
+      unsigned idx = instructions.size() - 1;
+      while (instructions[idx]->opcode != aco_opcode::p_logical_end)
+         idx--;
+      bld.reset(&instructions, std::next(instructions.begin(), idx));
+   }
+
+   Temp private_segment_buffer = ctx.program->private_segment_buffer;
+   if (ctx.program->stage != compute_cs)
+      private_segment_buffer = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(0u));
+
+   if (offset)
+      scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), scratch_offset, Operand(offset));
+
+   uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
+                        S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 3 : 2);
+
+   if (ctx.program->chip_class >= GFX10) {
+      rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                   S_008F0C_OOB_SELECT(3) |
+                   S_008F0C_RESOURCE_LEVEL(1);
+   } else if (ctx.program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
+      rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+   }
+   /* older generations need element size = 4 bytes. element size removed in GFX9 */
+   if (ctx.program->chip_class <= GFX8)
+      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
+
+   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
+                     private_segment_buffer, Operand(-1u),
+                     Operand(rsrc_conf));
+}
+
 void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
    std::map<uint32_t, uint32_t> sgpr_slot;
    std::map<uint32_t, uint32_t> vgpr_slot;
@@ -1335,6 +1382,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
       slot_idx++;
    }
 
+   unsigned sgpr_spill_slots = spill_slot_interferences.size();
+   spill_slot_interferences.clear();
    slot_idx = 0;
    done = false;
 
@@ -1353,7 +1402,7 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
             if (i == spill_slot_interferences.size())
                spill_slot_interferences.emplace_back(std::set<uint32_t>());
             /* check for interference and ensure that vector regs are stored next to each other */
-            if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end() || i / 64 != slot_idx / 64) {
+            if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end()) {
                interferes = true;
                break;
             }
@@ -1368,10 +1417,29 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
          is_assigned[id] = true;
          for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++)
             spill_slot_interferences[i].insert(ctx.interferences[id].second.begin(), ctx.interferences[id].second.end());
+
+         /* add all affinities: there are no additional interferences */
+         for (std::vector<uint32_t>& vec : ctx.affinities) {
+            bool found_affinity = false;
+            for (uint32_t entry : vec) {
+               if (entry == id) {
+                  found_affinity = true;
+                  break;
+               }
+            }
+            if (!found_affinity)
+               continue;
+            for (uint32_t entry : vec) {
+               vgpr_slot[entry] = slot_idx;
+               is_assigned[entry] = true;
+            }
+         }
       }
       slot_idx++;
    }
 
+   unsigned vgpr_spill_slots = spill_slot_interferences.size();
+
    for (unsigned id = 0; id < is_assigned.size(); id++)
       assert(is_assigned[id] || !ctx.is_reloaded[id]);
 
@@ -1392,10 +1460,11 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
    }
 
    /* hope, we didn't mess up */
-   std::vector<Temp> vgpr_spill_temps((spill_slot_interferences.size() + 63) / 64);
+   std::vector<Temp> vgpr_spill_temps((sgpr_spill_slots + 63) / 64);
    assert(vgpr_spill_temps.size() <= spills_to_vgpr);
 
    /* replace pseudo instructions with actual hardware instructions */
+   Temp scratch_offset = ctx.program->scratch_offset, scratch_rsrc = Temp();
    unsigned last_top_level_block_idx = 0;
    std::vector<bool> reload_in_loop(vgpr_spill_temps.size());
    for (Block& block : ctx.program->blocks) {
@@ -1449,6 +1518,7 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
       std::vector<aco_ptr<Instruction>>::iterator it;
       std::vector<aco_ptr<Instruction>> instructions;
       instructions.reserve(block.instructions.size());
+      Builder bld(ctx.program, &instructions);
       for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
 
          if ((*it)->opcode == aco_opcode::p_spill) {
@@ -1459,8 +1529,36 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
             } else if (vgpr_slot.find(spill_id) != vgpr_slot.end()) {
                /* spill vgpr */
                ctx.program->config->spilled_vgprs += (*it)->operands[0].size();
+               uint32_t spill_slot = vgpr_slot[spill_id];
+               bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096;
+               unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
+
+               /* check if the scratch resource descriptor already exists */
+               if (scratch_rsrc == Temp()) {
+                  unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
+                  scratch_rsrc = load_scratch_resource(ctx, scratch_offset,
+                                                       last_top_level_block_idx == block.index ?
+                                                       instructions : ctx.program->blocks[last_top_level_block_idx].instructions,
+                                                       offset,
+                                                       last_top_level_block_idx == block.index);
+               }
 
-               assert(false && "vgpr spilling not yet implemented.");
+               unsigned offset = base_offset + spill_slot * 4;
+               aco_opcode opcode = aco_opcode::buffer_store_dword;
+               assert((*it)->operands[0].isTemp());
+               Temp temp = (*it)->operands[0].getTemp();
+               assert(temp.type() == RegType::vgpr && !temp.is_linear());
+               if (temp.size() > 1) {
+                  Instruction* split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};
+                  split->operands[0] = Operand(temp);
+                  for (unsigned i = 0; i < temp.size(); i++)
+                     split->definitions[i] = bld.def(v1);
+                  bld.insert(split);
+                  for (unsigned i = 0; i < temp.size(); i++)
+                     bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false);
+               } else {
+                  bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, temp, offset, false);
+               }
             } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
                ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
 
@@ -1500,8 +1598,35 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
 
             if (vgpr_slot.find(spill_id) != vgpr_slot.end()) {
                /* reload vgpr */
-               assert(false && "vgpr spilling not yet implemented.");
+               uint32_t spill_slot = vgpr_slot[spill_id];
+               bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096;
+               unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
+
+               /* check if the scratch resource descriptor already exists */
+               if (scratch_rsrc == Temp()) {
+                  unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
+                  scratch_rsrc = load_scratch_resource(ctx, scratch_offset,
+                                                       last_top_level_block_idx == block.index ?
+                                                       instructions : ctx.program->blocks[last_top_level_block_idx].instructions,
+                                                       offset,
+                                                       last_top_level_block_idx == block.index);
+               }
 
+               unsigned offset = base_offset + spill_slot * 4;
+               aco_opcode opcode = aco_opcode::buffer_load_dword;
+               Definition def = (*it)->definitions[0];
+               if (def.size() > 1) {
+                  Instruction* vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)};
+                  vec->definitions[0] = def;
+                  for (unsigned i = 0; i < def.size(); i++) {
+                     Temp tmp = bld.tmp(v1);
+                     vec->operands[i] = Operand(tmp);
+                     bld.mubuf(opcode, Definition(tmp), Operand(), scratch_rsrc, scratch_offset, offset + i * 4, false);
+                  }
+                  bld.insert(vec);
+               } else {
+                  bld.mubuf(opcode, def, Operand(), scratch_rsrc, scratch_offset, offset, false);
+               }
             } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
                uint32_t spill_slot = sgpr_slot[spill_id];
                reload_in_loop[spill_slot / 64] = block.loop_nest_depth > 0;
@@ -1541,6 +1666,9 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
       block.instructions = std::move(instructions);
    }
 
+   /* update required scratch memory */
+   ctx.program->config->scratch_bytes_per_wave += align(vgpr_spill_slots * 4 * ctx.program->wave_size, 1024);
+
    /* SSA elimination inserts copies for logical phis right before p_logical_end
     * So if a linear vgpr is used between that p_logical_end and the branch,
     * we need to ensure logical phis don't choose a definition which aliases