aco: move s_andn2_b64 instructions out of the p_discard_if
author Rhys Perry <pendingchaos02@gmail.com>
Tue, 8 Oct 2019 12:40:17 +0000 (13:40 +0100)
committer Rhys Perry <pendingchaos02@gmail.com>
Wed, 9 Oct 2019 16:19:02 +0000 (16:19 +0000)
Also use a new p_exit_early_if instruction (originally named
p_discard_early_exit; see v2 below) for the early exit. This fixes some cases
where a definition having the same register as an operand causes issues.

v2: rename instruction to p_exit_early_if
v2: modify the existing instruction instead of creating a new one
v3: merge the "i == num - 1" IFs

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
src/amd/compiler/aco_insert_exec_mask.cpp
src/amd/compiler/aco_instruction_selection.cpp
src/amd/compiler/aco_lower_to_hw_instr.cpp
src/amd/compiler/aco_opcodes.py
src/amd/compiler/aco_scheduler.cpp

index 155c21a5aa470188a7aaff727cdd4779ca37538c..3f4b48e661f38decc9b35c4cc9c5396809ab4f06 100644 (file)
@@ -657,22 +657,23 @@ void process_instructions(exec_ctx& ctx, Block* block,
             transition_to_WQM(ctx, bld, block->index);
             ctx.info[block->index].exec.back().second &= ~mask_type_global;
          }
-         unsigned num = ctx.info[block->index].exec.size();
+         int num = ctx.info[block->index].exec.size();
          assert(num);
          Operand cond = instr->operands[0];
-         instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1));
-         for (unsigned i = 0; i < num; i++) {
-            instr->operands[i] = Operand(ctx.info[block->index].exec[i].first);
-            if (i == num - 1)
-               instr->operands[i].setFixed(exec);
-            Temp new_mask = bld.tmp(s2);
-            instr->definitions[i] = Definition(new_mask);
-            ctx.info[block->index].exec[i].first = new_mask;
+         for (int i = num - 1; i >= 0; i--) {
+            Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+                                          ctx.info[block->index].exec[i].first, cond);
+            if (i == num - 1) {
+               andn2->operands[0].setFixed(exec);
+               andn2->definitions[0].setFixed(exec);
+            }
+            if (i == 0) {
+               instr->opcode = aco_opcode::p_exit_early_if;
+               instr->operands[0] = bld.scc(andn2->definitions[1].getTemp());
+            }
+            ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
          }
-         assert((ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
-         instr->definitions[num - 1].setFixed(exec);
-         instr->operands[num] = cond;
-         instr->definitions[num] = bld.def(s1, scc);
+         assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
 
       } else if (needs == WQM && state != WQM) {
          transition_to_WQM(ctx, bld, block->index);
@@ -738,24 +739,24 @@ void process_instructions(exec_ctx& ctx, Block* block,
             num = 1;
          }
 
-         for (unsigned i = 0; i < ctx.info[block->index].exec.size() - 1; i++)
-            num += ctx.info[block->index].exec[i].second & mask_type_exact ? 1 : 0;
-         instr.reset(create_instruction<Instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1));
-         int k = 0;
-         for (unsigned i = 0; k < num; i++) {
+         num += ctx.info[block->index].exec.size() - 1;
+         for (int i = num - 1; i >= 0; i--) {
             if (ctx.info[block->index].exec[i].second & mask_type_exact) {
-               instr->operands[k] = Operand(ctx.info[block->index].exec[i].first);
-               Temp new_mask = bld.tmp(s2);
-               instr->definitions[k] = Definition(new_mask);
-               if (i == ctx.info[block->index].exec.size() - 1)
-                  instr->definitions[k].setFixed(exec);
-               k++;
-               ctx.info[block->index].exec[i].first = new_mask;
+               Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+                                             ctx.info[block->index].exec[i].first, cond);
+               if (i == num - 1) {
+                  andn2->operands[0].setFixed(exec);
+                  andn2->definitions[0].setFixed(exec);
+               }
+               if (i == 0) {
+                  instr->opcode = aco_opcode::p_exit_early_if;
+                  instr->operands[0] = bld.scc(andn2->definitions[1].getTemp());
+               }
+               ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
+            } else {
+               assert(i != 0);
             }
          }
-         assert(k == num);
-         instr->definitions[num] = bld.def(s1, scc);
-         instr->operands[num] = Operand(cond);
          state = Exact;
 
       } else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) {
@@ -878,18 +879,15 @@ void add_branch_code(exec_ctx& ctx, Block* block)
                            bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
       ctx.info[idx].exec.back().first = new_exec;
 
-      aco_ptr<Pseudo_instruction> discard{create_instruction<Pseudo_instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1)};
-      for (unsigned i = 0; i < num; i++) {
-         discard->operands[i] = Operand(ctx.info[block->index].exec[i].first);
-         Temp new_mask = bld.tmp(s2);
-         discard->definitions[i] = Definition(new_mask);
-         ctx.info[block->index].exec[i].first = new_mask;
+      for (int i = num - 1; i >= 0; i--) {
+         Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+                                       ctx.info[block->index].exec[i].first, cond);
+         if (i == 0)
+            bld.pseudo(aco_opcode::p_exit_early_if, bld.scc(andn2->definitions[1].getTemp()));
+         ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
       }
       assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
-      discard->operands[num] = Operand(cond);
-      discard->definitions[num] = bld.def(s1, scc);
 
-      bld.insert(std::move(discard));
       if ((block->kind & (block_kind_break | block_kind_uniform)) == block_kind_break)
          ctx.info[idx].exec.back().first = cond;
       bld.insert(std::move(branch));
index bba091fd74b6213f06f9446aa4d556c67f04f9f3..d1849d7b92b1392796decaeea63ae5bc44d5b812 100644 (file)
@@ -3266,6 +3266,7 @@ void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
 
    ctx->program->needs_exact = true;
 
+   // TODO: optimize uniform conditions
    Builder bld(ctx->program, ctx->block);
    Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
    src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
index 8fd33e47d922dc1fda570ff83921b41feb713991..395851119549a9e154262d84c251be5e48ddffa4 100644 (file)
@@ -606,15 +606,15 @@ void lower_to_hw_instr(Program* program)
                handle_operands(copy_operations, &ctx, program->chip_class, pi);
                break;
             }
-            case aco_opcode::p_discard_if:
+            case aco_opcode::p_exit_early_if:
             {
-               bool early_exit = false;
-               if (block->instructions[j + 1]->opcode != aco_opcode::p_logical_end ||
-                   block->instructions[j + 2]->opcode != aco_opcode::s_endpgm) {
-                  early_exit = true;
+               /* don't bother with an early exit at the end of the program */
+               if (block->instructions[j + 1]->opcode == aco_opcode::p_logical_end &&
+                   block->instructions[j + 2]->opcode == aco_opcode::s_endpgm) {
+                  break;
                }
 
-               if (early_exit && !discard_block) {
+               if (!discard_block) {
                   discard_block = program->create_and_insert_block();
                   block = &program->blocks[i];
 
@@ -628,26 +628,13 @@ void lower_to_hw_instr(Program* program)
                   bld.reset(&ctx.instructions);
                }
 
-               // TODO: optimize uniform conditions
-               Definition branch_cond = instr->definitions.back();
-               Operand discard_cond = instr->operands.back();
-               aco_ptr<Instruction> sop2;
-               /* backwards, to finally branch on the global exec mask */
-               for (int i = instr->operands.size() - 2; i >= 0; i--) {
-                  bld.sop2(aco_opcode::s_andn2_b64,
-                           instr->definitions[i], /* new mask */
-                           branch_cond, /* scc */
-                           instr->operands[i], /* old mask */
-                           discard_cond);
-               }
-
-               if (early_exit) {
-                  bld.sopp(aco_opcode::s_cbranch_scc0, bld.scc(branch_cond.getTemp()), discard_block->index);
+               //TODO: exec can be zero here with block_kind_discard
 
-                  discard_block->linear_preds.push_back(block->index);
-                  block->linear_succs.push_back(discard_block->index);
-               }
+               assert(instr->operands[0].physReg() == scc);
+               bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index);
 
+               discard_block->linear_preds.push_back(block->index);
+               block->linear_succs.push_back(discard_block->index);
                break;
             }
             case aco_opcode::p_spill:
index a5b4eb9a54e18853427840cebfdf8445bd1d9414..a358527e60b33550493df9fcd4455fecceb73b7d 100644 (file)
@@ -236,6 +236,7 @@ opcode("p_discard_if")
 opcode("p_load_helper")
 opcode("p_demote_to_helper")
 opcode("p_is_helper")
+opcode("p_exit_early_if")
 
 opcode("p_fs_buffer_store_smem", format=Format.SMEM)
 
index 0cd67a979e0cbe866e8a6a3e7853b83f002b7646..09076a9a71f545197789165fd75180f6bae85c88 100644 (file)
@@ -220,6 +220,8 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
          break;
       if (candidate->opcode == aco_opcode::p_logical_start)
          break;
+      if (candidate->opcode == aco_opcode::p_exit_early_if)
+         break;
       if (!can_move_instr(candidate, current, moving_interaction))
          break;
       register_pressure.update(register_demand[candidate_idx]);
@@ -445,6 +447,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
          break;
       if (candidate->opcode == aco_opcode::p_logical_start)
          break;
+      if (candidate->opcode == aco_opcode::p_exit_early_if)
+         break;
       if (!can_move_instr(candidate, current, moving_interaction))
          break;
 
@@ -665,6 +669,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
       /* break when encountering logical_start or barriers */
       if (candidate->opcode == aco_opcode::p_logical_start)
          break;
+      if (candidate->opcode == aco_opcode::p_exit_early_if)
+         break;
       if (candidate->isVMEM() || candidate->format == Format::SMEM)
          break;
       if (!can_move_instr(candidate, current, moving_interaction))