src/amd/compiler/aco_insert_NOPs.cpp

   1 /*
   2  * Copyright © 2019 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #include "aco_ir.h"
  26
  27 namespace aco {
  28 namespace {
  29
  30 struct NOP_ctx {
  31    enum chip_class chip_class;
  32    unsigned vcc_physical;
  33
  34    /* pre-GFX10 */
  35    /* just initialize these with something less than max NOPs */
  36    int VALU_wrexec = -10;
  37    int VALU_wrvcc = -10;
  38    int VALU_wrsgpr = -10;
  39
  40    /* GFX10 */
  41    int last_VMEM_since_scalar_write = -1;
  42    bool has_VOPC = false;
  43
  44    NOP_ctx(Program* program) : chip_class(program->chip_class) {
  45       vcc_physical = program->config->num_sgprs - 2;
  46    }
  47 };
  48
  49 bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
  50 {
  51    if ((uint32_t) instr->format & (uint32_t) Format::VOPC)
  52       return true;
  53    if (instr->isVOP3() && instr->definitions.size() == 2)
  54       return true;
  55    if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32)
  56       return true;
  57    return false;
  58 }
  59
  60 bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
  61 {
  62    return a_reg > b_reg ?
  63           (a_reg - b_reg < b_size) :
  64           (b_reg - a_reg < a_size);
  65 }
  66
  67 unsigned handle_SMEM_clause(aco_ptr<Instruction>& instr, int new_idx,
  68                             std::vector<aco_ptr<Instruction>>& new_instructions)
  69 {
  70    //TODO: s_dcache_inv needs to be in it's own group on GFX10 (and previous versions?)
  71    const bool is_store = instr->definitions.empty();
  72    for (int pred_idx = new_idx - 1; pred_idx >= 0; pred_idx--) {
  73       aco_ptr<Instruction>& pred = new_instructions[pred_idx];
  74       if (pred->format != Format::SMEM)
  75          break;
  76
  77       /* Don't allow clauses with store instructions since the clause's
  78        * instructions may use the same address. */
  79       if (is_store || pred->definitions.empty())
  80          return 1;
  81
  82       Definition& instr_def = instr->definitions[0];
  83       Definition& pred_def = pred->definitions[0];
  84
  85       /* ISA reference doesn't say anything about this, but best to be safe */
  86       if (regs_intersect(instr_def.physReg(), instr_def.size(), pred_def.physReg(), pred_def.size()))
  87          return 1;
  88
  89       for (const Operand& op : pred->operands) {
  90          if (op.isConstant() || !op.isFixed())
  91             continue;
  92          if (regs_intersect(instr_def.physReg(), instr_def.size(), op.physReg(), op.size()))
  93             return 1;
  94       }
  95       for (const Operand& op : instr->operands) {
  96          if (op.isConstant() || !op.isFixed())
  97             continue;
  98          if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size()))
  99             return 1;
 100       }
 101    }
 102
 103    return 0;
 104 }
 105
 106 int handle_instruction(NOP_ctx& ctx, aco_ptr<Instruction>& instr,
 107                        std::vector<aco_ptr<Instruction>>& old_instructions,
 108                        std::vector<aco_ptr<Instruction>>& new_instructions)
 109 {
 110    int new_idx = new_instructions.size();
 111
 112    // TODO: setreg / getreg / m0 writes
 113    // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
 114
 115    /* break off from prevous SMEM clause if needed */
 116    if (instr->format == Format::SMEM && ctx.chip_class >= GFX8) {
 117       return handle_SMEM_clause(instr, new_idx, new_instructions);
 118    } else if (instr->isVALU() || instr->format == Format::VINTRP) {
 119       int NOPs = 0;
 120
 121       if (instr->isDPP()) {
 122          /* VALU does not forward EXEC to DPP. */
 123          if (ctx.VALU_wrexec + 5 >= new_idx)
 124             NOPs = 5 + ctx.VALU_wrexec - new_idx + 1;
 125
 126          /* VALU DPP reads VGPR written by VALU */
 127          for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 2; pred_idx--) {
 128             aco_ptr<Instruction>& pred = new_instructions[pred_idx];
 129             if ((pred->isVALU() || pred->format == Format::VINTRP) &&
 130                 !pred->definitions.empty() &&
 131                 pred->definitions[0].physReg() == instr->operands[0].physReg()) {
 132                NOPs = std::max(NOPs, 2 + pred_idx - new_idx + 1);
 133                break;
 134             }
 135          }
 136       }
 137
 138       /* SALU writes M0 */
 139       if (instr->format == Format::VINTRP && new_idx > 0 && ctx.chip_class >= GFX9) {
 140          aco_ptr<Instruction>& pred = new_instructions.back();
 141          if (pred->isSALU() &&
 142              !pred->definitions.empty() &&
 143              pred->definitions[0].physReg() == m0)
 144             NOPs = std::max(NOPs, 1);
 145       }
 146
 147       for (const Operand& op : instr->operands) {
 148          /* VALU which uses VCCZ */
 149          if (op.physReg() == PhysReg{251} &&
 150              ctx.VALU_wrvcc + 5 >= new_idx)
 151             NOPs = std::max(NOPs, 5 + ctx.VALU_wrvcc - new_idx + 1);
 152
 153          /* VALU which uses EXECZ */
 154          if (op.physReg() == PhysReg{252} &&
 155              ctx.VALU_wrexec + 5 >= new_idx)
 156             NOPs = std::max(NOPs, 5 + ctx.VALU_wrexec - new_idx + 1);
 157
 158          /* VALU which reads VCC as a constant */
 159          if (ctx.VALU_wrvcc + 1 >= new_idx) {
 160             for (unsigned k = 0; k < op.size(); k++) {
 161                unsigned reg = op.physReg() + k;
 162                if (reg == ctx.vcc_physical || reg == ctx.vcc_physical + 1)
 163                   NOPs = std::max(NOPs, 1);
 164             }
 165          }
 166       }
 167
 168       switch (instr->opcode) {
 169          case aco_opcode::v_readlane_b32:
 170          case aco_opcode::v_writelane_b32: {
 171             if (ctx.VALU_wrsgpr + 4 < new_idx)
 172                break;
 173             PhysReg reg = instr->operands[1].physReg();
 174             for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 4; pred_idx--) {
 175                aco_ptr<Instruction>& pred = new_instructions[pred_idx];
 176                if (!pred->isVALU() || !VALU_writes_sgpr(pred))
 177                   continue;
 178                for (const Definition& def : pred->definitions) {
 179                   if (def.physReg() == reg)
 180                      NOPs = std::max(NOPs, 4 + pred_idx - new_idx + 1);
 181                }
 182             }
 183             break;
 184          }
 185          case aco_opcode::v_div_fmas_f32:
 186          case aco_opcode::v_div_fmas_f64: {
 187             if (ctx.VALU_wrvcc + 4 >= new_idx)
 188                NOPs = std::max(NOPs, 4 + ctx.VALU_wrvcc - new_idx + 1);
 189             break;
 190          }
 191          default:
 192             break;
 193       }
 194
 195       /* Write VGPRs holding writedata > 64 bit from MIMG/MUBUF instructions */
 196       // FIXME: handle case if the last instruction of a block without branch is such store
 197       // TODO: confirm that DS instructions cannot cause WAR hazards here
 198       if (new_idx > 0) {
 199          aco_ptr<Instruction>& pred = new_instructions.back();
 200          if (pred->isVMEM() &&
 201              pred->operands.size() == 4 &&
 202              pred->operands[3].size() > 2 &&
 203              pred->operands[1].size() != 8 &&
 204              (pred->format != Format::MUBUF || pred->operands[2].physReg() >= 102)) {
 205             /* Ops that use a 256-bit T# do not need a wait state.
 206              * BUFFER_STORE_* operations that use an SGPR for "offset"
 207              * do not require any wait states. */
 208             PhysReg wrdata = pred->operands[3].physReg();
 209             unsigned size = pred->operands[3].size();
 210             assert(wrdata >= 256);
 211             for (const Definition& def : instr->definitions) {
 212                if (regs_intersect(def.physReg(), def.size(), wrdata, size))
 213                   NOPs = std::max(NOPs, 1);
 214             }
 215          }
 216       }
 217
 218       if (VALU_writes_sgpr(instr)) {
 219          for (const Definition& def : instr->definitions) {
 220             if (def.physReg() == vcc)
 221                ctx.VALU_wrvcc = NOPs ? new_idx : new_idx + 1;
 222             else if (def.physReg() == exec)
 223                ctx.VALU_wrexec = NOPs ? new_idx : new_idx + 1;
 224             else if (def.physReg() <= 102)
 225                ctx.VALU_wrsgpr = NOPs ? new_idx : new_idx + 1;
 226          }
 227       }
 228       return NOPs;
 229    } else if (instr->isVMEM() && ctx.VALU_wrsgpr + 5 >= new_idx) {
 230       /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
 231       for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 5; pred_idx--) {
 232          aco_ptr<Instruction>& pred = new_instructions[pred_idx];
 233          if (!(pred->isVALU() && VALU_writes_sgpr(pred)))
 234             continue;
 235
 236          for (const Definition& def : pred->definitions) {
 237             if (def.physReg() > 102)
 238                continue;
 239
 240             if (instr->operands.size() > 1 &&
 241                 regs_intersect(instr->operands[1].physReg(), instr->operands[1].size(),
 242                                def.physReg(), def.size())) {
 243                   return 5 + pred_idx - new_idx + 1;
 244             }
 245
 246             if (instr->operands.size() > 2 &&
 247                 regs_intersect(instr->operands[2].physReg(), instr->operands[2].size(),
 248                                def.physReg(), def.size())) {
 249                   return 5 + pred_idx - new_idx + 1;
 250             }
 251          }
 252       }
 253    }
 254
 255    return 0;
 256 }
 257
 258 std::pair<int, int> handle_instruction_gfx10(NOP_ctx& ctx, aco_ptr<Instruction>& instr,
 259                                              std::vector<aco_ptr<Instruction>>& old_instructions,
 260                                              std::vector<aco_ptr<Instruction>>& new_instructions)
 261 {
 262    int new_idx = new_instructions.size();
 263    unsigned vNOPs = 0;
 264    unsigned sNOPs = 0;
 265
 266    /* break off from prevous SMEM group ("clause" seems to mean something different in RDNA) if needed */
 267    if (instr->format == Format::SMEM)
 268       sNOPs = std::max(sNOPs, handle_SMEM_clause(instr, new_idx, new_instructions));
 269
 270    /* handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between */
 271    if (instr->isSALU() || instr->format == Format::SMEM) {
 272       if (!instr->definitions.empty() && ctx.last_VMEM_since_scalar_write != -1) {
 273          ctx.last_VMEM_since_scalar_write = -1;
 274          vNOPs = 1;
 275       }
 276    } else if (instr->isVMEM() || instr->isFlatOrGlobal()) {
 277       ctx.last_VMEM_since_scalar_write = new_idx;
 278    } else if (instr->opcode == aco_opcode::s_waitcnt) {
 279       uint16_t imm = static_cast<SOPP_instruction*>(instr.get())->imm;
 280       unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10);
 281       if (vmcnt == 0)
 282          ctx.last_VMEM_since_scalar_write = -1;
 283    } else if (instr->isVALU()) {
 284       ctx.last_VMEM_since_scalar_write = -1;
 285    }
 286
 287    /* VcmpxPermlaneHazard
 288     * Handle any permlane following a VOPC instruction, insert v_mov between them.
 289     */
 290    if (instr->format == Format::VOPC) {
 291       ctx.has_VOPC = true;
 292    } else if (ctx.has_VOPC &&
 293               (instr->opcode == aco_opcode::v_permlane16_b32 ||
 294                instr->opcode == aco_opcode::v_permlanex16_b32)) {
 295       ctx.has_VOPC = false;
 296
 297       /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
 298       aco_ptr<VOP1_instruction> v_mov{create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
 299       v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1);
 300       v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1);
 301       new_instructions.emplace_back(std::move(v_mov));
 302    } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
 303       ctx.has_VOPC = false;
 304    }
 305
 306    return std::make_pair(sNOPs, vNOPs);
 307 }
 308
 309
 310 void handle_block(NOP_ctx& ctx, Block& block)
 311 {
 312    std::vector<aco_ptr<Instruction>> instructions;
 313    instructions.reserve(block.instructions.size());
 314    for (unsigned i = 0; i < block.instructions.size(); i++) {
 315       aco_ptr<Instruction>& instr = block.instructions[i];
 316       unsigned NOPs = handle_instruction(ctx, instr, block.instructions, instructions);
 317       if (NOPs) {
 318          // TODO: try to move the instruction down
 319          /* create NOP */
 320          aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
 321          nop->imm = NOPs - 1;
 322          nop->block = -1;
 323          instructions.emplace_back(std::move(nop));
 324       }
 325
 326       instructions.emplace_back(std::move(instr));
 327    }
 328
 329    ctx.VALU_wrvcc -= instructions.size();
 330    ctx.VALU_wrexec -= instructions.size();
 331    ctx.VALU_wrsgpr -= instructions.size();
 332    block.instructions = std::move(instructions);
 333 }
 334
 335 void handle_block_gfx10(NOP_ctx& ctx, Block& block)
 336 {
 337    std::vector<aco_ptr<Instruction>> instructions;
 338    instructions.reserve(block.instructions.size());
 339    for (unsigned i = 0; i < block.instructions.size(); i++) {
 340       aco_ptr<Instruction>& instr = block.instructions[i];
 341       std::pair<int, int> NOPs = handle_instruction_gfx10(ctx, instr, block.instructions, instructions);
 342       for (int i = 0; i < NOPs.second; i++) {
 343          // TODO: try to move the instruction down
 344          /* create NOP */
 345          aco_ptr<VOP1_instruction> nop{create_instruction<VOP1_instruction>(aco_opcode::v_nop, Format::VOP1, 0, 0)};
 346          instructions.emplace_back(std::move(nop));
 347       }
 348       if (NOPs.first) {
 349          // TODO: try to move the instruction down
 350          /* create NOP */
 351          aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
 352          nop->imm = NOPs.first - 1;
 353          nop->block = -1;
 354          instructions.emplace_back(std::move(nop));
 355       }
 356
 357       instructions.emplace_back(std::move(instr));
 358    }
 359
 360    block.instructions = std::move(instructions);
 361 }
 362
 363 } /* end namespace */
 364
 365
 366 void insert_NOPs(Program* program)
 367 {
 368    NOP_ctx ctx(program);
 369
 370    for (Block& block : program->blocks) {
 371       if (block.instructions.empty())
 372          continue;
 373
 374       if (ctx.chip_class >= GFX10)
 375          handle_block_gfx10(ctx, block);
 376       else
 377          handle_block(ctx, block);
 378    }
 379 }
 380
 381 }