src/amd/compiler/aco_insert_NOPs.cpp

   1 /*
   2  * Copyright © 2019 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #include "aco_ir.h"
  26
  27 namespace aco {
  28 namespace {
  29
  30 struct NOP_ctx {
  31    /* just initialize these with something less than max NOPs */
  32    int VALU_wrexec = -10;
  33    int VALU_wrvcc = -10;
  34    int VALU_wrsgpr = -10;
  35    enum chip_class chip_class;
  36    unsigned vcc_physical;
  37    NOP_ctx(Program* program) : chip_class(program->chip_class) {
  38       vcc_physical = program->config->num_sgprs - 2;
  39    }
  40 };
  41
  42 bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
  43 {
  44    if ((uint32_t) instr->format & (uint32_t) Format::VOPC)
  45       return true;
  46    if (instr->isVOP3() && instr->definitions.size() == 2)
  47       return true;
  48    if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32)
  49       return true;
  50    return false;
  51 }
  52
  53 bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
  54 {
  55    return a_reg > b_reg ?
  56           (a_reg - b_reg < b_size) :
  57           (b_reg - a_reg < a_size);
  58 }
  59
  60 int handle_instruction(NOP_ctx& ctx, aco_ptr<Instruction>& instr,
  61                        std::vector<aco_ptr<Instruction>>& old_instructions,
  62                        std::vector<aco_ptr<Instruction>>& new_instructions)
  63 {
  64    int new_idx = new_instructions.size();
  65
  66    // TODO: setreg / getreg / m0 writes
  67    // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
  68
  69    /* break off from prevous SMEM clause if needed */
  70    if (instr->format == Format::SMEM && ctx.chip_class >= GFX8) {
  71       const bool is_store = instr->definitions.empty();
  72       for (int pred_idx = new_idx - 1; pred_idx >= 0; pred_idx--) {
  73          aco_ptr<Instruction>& pred = new_instructions[pred_idx];
  74          if (pred->format != Format::SMEM)
  75             break;
  76
  77          /* Don't allow clauses with store instructions since the clause's
  78           * instructions may use the same address. */
  79          if (is_store || pred->definitions.empty())
  80             return 1;
  81
  82          Definition& instr_def = instr->definitions[0];
  83          Definition& pred_def = pred->definitions[0];
  84
  85          /* ISA reference doesn't say anything about this, but best to be safe */
  86          if (regs_intersect(instr_def.physReg(), instr_def.size(), pred_def.physReg(), pred_def.size()))
  87             return 1;
  88
  89          for (const Operand& op : pred->operands) {
  90             if (op.isConstant() || !op.isFixed())
  91                continue;
  92             if (regs_intersect(instr_def.physReg(), instr_def.size(), op.physReg(), op.size()))
  93                return 1;
  94          }
  95          for (const Operand& op : instr->operands) {
  96             if (op.isConstant() || !op.isFixed())
  97                continue;
  98             if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size()))
  99                return 1;
 100          }
 101       }
 102    } else if (instr->isVALU() || instr->format == Format::VINTRP) {
 103       int NOPs = 0;
 104
 105       if (instr->isDPP()) {
 106          /* VALU does not forward EXEC to DPP. */
 107          if (ctx.VALU_wrexec + 5 >= new_idx)
 108             NOPs = 5 + ctx.VALU_wrexec - new_idx + 1;
 109
 110          /* VALU DPP reads VGPR written by VALU */
 111          for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 2; pred_idx--) {
 112             aco_ptr<Instruction>& pred = new_instructions[pred_idx];
 113             if ((pred->isVALU() || pred->format == Format::VINTRP) &&
 114                 !pred->definitions.empty() &&
 115                 pred->definitions[0].physReg() == instr->operands[0].physReg()) {
 116                NOPs = std::max(NOPs, 2 + pred_idx - new_idx + 1);
 117                break;
 118             }
 119          }
 120       }
 121
 122       /* SALU writes M0 */
 123       if (instr->format == Format::VINTRP && new_idx > 0 && ctx.chip_class >= GFX9) {
 124          aco_ptr<Instruction>& pred = new_instructions.back();
 125          if (pred->isSALU() &&
 126              !pred->definitions.empty() &&
 127              pred->definitions[0].physReg() == m0)
 128             NOPs = std::max(NOPs, 1);
 129       }
 130
 131       for (const Operand& op : instr->operands) {
 132          /* VALU which uses VCCZ */
 133          if (op.physReg() == PhysReg{251} &&
 134              ctx.VALU_wrvcc + 5 >= new_idx)
 135             NOPs = std::max(NOPs, 5 + ctx.VALU_wrvcc - new_idx + 1);
 136
 137          /* VALU which uses EXECZ */
 138          if (op.physReg() == PhysReg{252} &&
 139              ctx.VALU_wrexec + 5 >= new_idx)
 140             NOPs = std::max(NOPs, 5 + ctx.VALU_wrexec - new_idx + 1);
 141
 142          /* VALU which reads VCC as a constant */
 143          if (ctx.VALU_wrvcc + 1 >= new_idx) {
 144             for (unsigned k = 0; k < op.size(); k++) {
 145                unsigned reg = op.physReg() + k;
 146                if (reg == ctx.vcc_physical || reg == ctx.vcc_physical + 1)
 147                   NOPs = std::max(NOPs, 1);
 148             }
 149          }
 150       }
 151
 152       switch (instr->opcode) {
 153          case aco_opcode::v_readlane_b32:
 154          case aco_opcode::v_writelane_b32: {
 155             if (ctx.VALU_wrsgpr + 4 < new_idx)
 156                break;
 157             PhysReg reg = instr->operands[1].physReg();
 158             for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 4; pred_idx--) {
 159                aco_ptr<Instruction>& pred = new_instructions[pred_idx];
 160                if (!pred->isVALU() || !VALU_writes_sgpr(pred))
 161                   continue;
 162                for (const Definition& def : pred->definitions) {
 163                   if (def.physReg() == reg)
 164                      NOPs = std::max(NOPs, 4 + pred_idx - new_idx + 1);
 165                }
 166             }
 167             break;
 168          }
 169          case aco_opcode::v_div_fmas_f32:
 170          case aco_opcode::v_div_fmas_f64: {
 171             if (ctx.VALU_wrvcc + 4 >= new_idx)
 172                NOPs = std::max(NOPs, 4 + ctx.VALU_wrvcc - new_idx + 1);
 173             break;
 174          }
 175          default:
 176             break;
 177       }
 178
 179       /* Write VGPRs holding writedata > 64 bit from MIMG/MUBUF instructions */
 180       // FIXME: handle case if the last instruction of a block without branch is such store
 181       // TODO: confirm that DS instructions cannot cause WAR hazards here
 182       if (new_idx > 0) {
 183          aco_ptr<Instruction>& pred = new_instructions.back();
 184          if (pred->isVMEM() &&
 185              pred->operands.size() == 4 &&
 186              pred->operands[3].size() > 2 &&
 187              pred->operands[1].size() != 8 &&
 188              (pred->format != Format::MUBUF || pred->operands[2].physReg() >= 102)) {
 189             /* Ops that use a 256-bit T# do not need a wait state.
 190              * BUFFER_STORE_* operations that use an SGPR for "offset"
 191              * do not require any wait states. */
 192             PhysReg wrdata = pred->operands[3].physReg();
 193             unsigned size = pred->operands[3].size();
 194             assert(wrdata >= 256);
 195             for (const Definition& def : instr->definitions) {
 196                if (regs_intersect(def.physReg(), def.size(), wrdata, size))
 197                   NOPs = std::max(NOPs, 1);
 198             }
 199          }
 200       }
 201
 202       if (VALU_writes_sgpr(instr)) {
 203          for (const Definition& def : instr->definitions) {
 204             if (def.physReg() == vcc)
 205                ctx.VALU_wrvcc = NOPs ? new_idx : new_idx + 1;
 206             else if (def.physReg() == exec)
 207                ctx.VALU_wrexec = NOPs ? new_idx : new_idx + 1;
 208             else if (def.physReg() <= 102)
 209                ctx.VALU_wrsgpr = NOPs ? new_idx : new_idx + 1;
 210          }
 211       }
 212       return NOPs;
 213    } else if (instr->isVMEM() && ctx.VALU_wrsgpr + 5 >= new_idx) {
 214       /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
 215       for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 5; pred_idx--) {
 216          aco_ptr<Instruction>& pred = new_instructions[pred_idx];
 217          if (!(pred->isVALU() && VALU_writes_sgpr(pred)))
 218             continue;
 219
 220          for (const Definition& def : pred->definitions) {
 221             if (def.physReg() > 102)
 222                continue;
 223
 224             if (instr->operands.size() > 1 &&
 225                 regs_intersect(instr->operands[1].physReg(), instr->operands[1].size(),
 226                                def.physReg(), def.size())) {
 227                   return 5 + pred_idx - new_idx + 1;
 228             }
 229
 230             if (instr->operands.size() > 2 &&
 231                 regs_intersect(instr->operands[2].physReg(), instr->operands[2].size(),
 232                                def.physReg(), def.size())) {
 233                   return 5 + pred_idx - new_idx + 1;
 234             }
 235          }
 236       }
 237    }
 238
 239    return 0;
 240 }
 241
 242
 243 void handle_block(NOP_ctx& ctx, Block& block)
 244 {
 245    std::vector<aco_ptr<Instruction>> instructions;
 246    instructions.reserve(block.instructions.size());
 247    for (unsigned i = 0; i < block.instructions.size(); i++) {
 248       aco_ptr<Instruction>& instr = block.instructions[i];
 249       unsigned NOPs = handle_instruction(ctx, instr, block.instructions, instructions);
 250       if (NOPs) {
 251          // TODO: try to move the instruction down
 252          /* create NOP */
 253          aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
 254          nop->imm = NOPs - 1;
 255          nop->block = -1;
 256          instructions.emplace_back(std::move(nop));
 257       }
 258
 259       instructions.emplace_back(std::move(instr));
 260    }
 261
 262    ctx.VALU_wrvcc -= instructions.size();
 263    ctx.VALU_wrexec -= instructions.size();
 264    ctx.VALU_wrsgpr -= instructions.size();
 265    block.instructions = std::move(instructions);
 266 }
 267
 268 } /* end namespace */
 269
 270
 271 void insert_NOPs(Program* program)
 272 {
 273    NOP_ctx ctx(program);
 274    for (Block& block : program->blocks) {
 275       if (block.instructions.empty())
 276          continue;
 277
 278       handle_block(ctx, block);
 279    }
 280 }
 281
 282 }