v3d: Fold comparisons for IF conditions into the flags for the IF.

[mesa.git] / src / broadcom / compiler / nir_to_vir.c
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c

index 167f00750b4ffb4b25ed77c51f6d34c921400888..9d20853061252b57bb8430ab83fa96b484a9f5a7 100644 (file)
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -268,6 +268,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
                  switch (instr->sampler_dim) {
                  case GLSL_SAMPLER_DIM_1D:
                  case GLSL_SAMPLER_DIM_2D:
+                case GLSL_SAMPLER_DIM_MS:
                  case GLSL_SAMPLER_DIM_3D:
                  case GLSL_SAMPLER_DIM_CUBE:
                          /* Don't minify the array size. */
@@ -495,92 +496,91 @@ declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size)
   * on the compare_instr's result.
   */
  static bool
-ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest,
+ntq_emit_comparison(struct v3d_compile *c,
                      nir_alu_instr *compare_instr,
-                    nir_alu_instr *sel_instr)
+                    enum v3d_qpu_cond *out_cond)
  {
          struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
          struct qreg src1;
          if (nir_op_infos[compare_instr->op].num_inputs > 1)
                  src1 = ntq_get_alu_src(c, compare_instr, 1);
          bool cond_invert = false;
+        struct qreg nop = vir_reg(QFILE_NULL, 0);
  
          switch (compare_instr->op) {
          case nir_op_feq32:
          case nir_op_seq:
-                vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
+                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                  break;
          case nir_op_ieq32:
-                vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
+                vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                  break;
  
          case nir_op_fne32:
          case nir_op_sne:
-                vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
+                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                  cond_invert = true;
                  break;
          case nir_op_ine32:
-                vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
+                vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                  cond_invert = true;
                  break;
  
          case nir_op_fge32:
          case nir_op_sge:
-                vir_PF(c, vir_FCMP(c, src1, src0), V3D_QPU_PF_PUSHC);
+                vir_set_pf(vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                  break;
          case nir_op_ige32:
-                vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
+                vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                  cond_invert = true;
                  break;
          case nir_op_uge32:
-                vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
+                vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                  cond_invert = true;
                  break;
  
          case nir_op_slt:
          case nir_op_flt32:
-                vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHN);
+                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
                  break;
          case nir_op_ilt32:
-                vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
+                vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                  break;
          case nir_op_ult32:
-                vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
+                vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                  break;
  
          default:
                  return false;
          }
  
-        enum v3d_qpu_cond cond = (cond_invert ?
-                                  V3D_QPU_COND_IFNA :
-                                  V3D_QPU_COND_IFA);
+        *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
  
-        switch (sel_instr->op) {
-        case nir_op_seq:
-        case nir_op_sne:
-        case nir_op_sge:
-        case nir_op_slt:
-                *dest = vir_SEL(c, cond,
-                                vir_uniform_f(c, 1.0), vir_uniform_f(c, 0.0));
-                break;
-
-        case nir_op_b32csel:
-                *dest = vir_SEL(c, cond,
-                                ntq_get_alu_src(c, sel_instr, 1),
-                                ntq_get_alu_src(c, sel_instr, 2));
-                break;
+        return true;
+}
  
-        default:
-                *dest = vir_SEL(c, cond,
-                                vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0));
-                break;
+/* Finds an ALU instruction that generates our src value that could
+ * (potentially) be greedily emitted in the consuming instruction.
+ */
+static struct nir_alu_instr *
+ntq_get_alu_parent(nir_src src)
+{
+        if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
+                return NULL;
+        nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
+        if (!instr)
+                return NULL;
+
+        /* If the ALU instr's srcs are non-SSA, then we would have to avoid
+         * moving emission of the ALU instr down past another write of the
+         * src.
+         */
+        for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+                if (!instr->src[i].src.is_ssa)
+                        return NULL;
          }
  
-        /* Make the temporary for nir_store_dest(). */
-        *dest = vir_MOV(c, *dest);
-
-        return true;
+        return instr;
  }
  
  /**
@@ -591,18 +591,13 @@ ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest,
  static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr,
                                    struct qreg *src)
  {
-        if (!instr->src[0].src.is_ssa)
-                goto out;
-        if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
-                goto out;
-        nir_alu_instr *compare =
-                nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+        nir_alu_instr *compare = ntq_get_alu_parent(instr->src[0].src);
          if (!compare)
                  goto out;
  
-        struct qreg dest;
-        if (ntq_emit_comparison(c, &dest, compare, instr))
-                return dest;
+        enum v3d_qpu_cond cond;
+        if (ntq_emit_comparison(c, compare, &cond))
+                return vir_MOV(c, vir_SEL(c, cond, src[1], src[2]));
  
  out:
          vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
@@ -747,7 +742,16 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
          case nir_op_seq:
          case nir_op_sne:
          case nir_op_sge:
-        case nir_op_slt:
+        case nir_op_slt: {
+                enum v3d_qpu_cond cond;
+                MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+                assert(ok);
+                result = vir_MOV(c, vir_SEL(c, cond,
+                                            vir_uniform_f(c, 1.0),
+                                            vir_uniform_f(c, 0.0)));
+                break;
+        }
+
          case nir_op_feq32:
          case nir_op_fne32:
          case nir_op_fge32:
@@ -757,11 +761,15 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
          case nir_op_ige32:
          case nir_op_uge32:
          case nir_op_ilt32:
-        case nir_op_ult32:
-                if (!ntq_emit_comparison(c, &result, instr, instr)) {
-                        fprintf(stderr, "Bad comparison instruction\n");
-                }
+        case nir_op_ult32: {
+                enum v3d_qpu_cond cond;
+                MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+                assert(ok);
+                result = vir_MOV(c, vir_SEL(c, cond,
+                                            vir_uniform_ui(c, ~0),
+                                            vir_uniform_ui(c, 0)));
                  break;
+        }
  
          case nir_op_b32csel:
                  result = ntq_emit_bcsel(c, instr, src);
@@ -1241,7 +1249,7 @@ v3d_optimize_nir(struct nir_shader *s)
                  NIR_PASS(progress, s, nir_opt_dce);
                  NIR_PASS(progress, s, nir_opt_dead_cf);
                  NIR_PASS(progress, s, nir_opt_cse);
-                NIR_PASS(progress, s, nir_opt_peephole_select, 8);
+                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
                  NIR_PASS(progress, s, nir_opt_algebraic);
                  NIR_PASS(progress, s, nir_opt_constant_folding);
                  NIR_PASS(progress, s, nir_opt_undef);
@@ -1594,6 +1602,14 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                  ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
                  break;
  
+        case nir_intrinsic_load_helper_invocation:
+                vir_PF(c, vir_MSF(c), V3D_QPU_PF_PUSHZ);
+                ntq_store_dest(c, &instr->dest, 0,
+                               vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
+                                                  vir_uniform_ui(c, ~0),
+                                                  vir_uniform_ui(c, 0))));
+                break;
+
          case nir_intrinsic_load_front_face:
                  /* The register contains 0 (front) or 1 (back), and we need to
                   * turn it into a NIR bool where true means front.
@@ -1691,8 +1707,9 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
  static void
  ntq_activate_execute_for_block(struct v3d_compile *c)
  {
-        vir_PF(c, vir_XOR(c, c->execute, vir_uniform_ui(c, c->cur_block->index)),
-               V3D_QPU_PF_PUSHZ);
+        vir_set_pf(vir_XOR_dest(c, vir_reg(QFILE_NULL, 0),
+                                c->execute, vir_uniform_ui(c, c->cur_block->index)),
+                   V3D_QPU_PF_PUSHZ);
  
          vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
  }
@@ -1719,19 +1736,33 @@ ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt)
                  was_top_level = true;
          }
  
-        /* Set A for executing (execute == 0) and jumping (if->condition ==
-         * 0) channels, and then update execute flags for those to point to
-         * the ELSE block.
-         *
-         * XXX perf: we could reuse ntq_emit_comparison() to generate our if
-         * condition, and the .uf field to ignore non-executing channels, to
-         * reduce the overhead of if statements.
+        /* Set up the flags for the IF condition (taking the THEN branch). */
+        nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition);
+        enum v3d_qpu_cond cond;
+        if (!if_condition_alu ||
+            !ntq_emit_comparison(c, if_condition_alu, &cond)) {
+                vir_PF(c, ntq_get_src(c, if_stmt->condition, 0),
+                       V3D_QPU_PF_PUSHZ);
+                cond = V3D_QPU_COND_IFNA;
+        }
+
+        /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
+         * was previously active (execute Z)" for updating the exec flags.
           */
-        vir_PF(c, vir_OR(c,
-                         c->execute,
-                         ntq_get_src(c, if_stmt->condition, 0)),
-                V3D_QPU_PF_PUSHZ);
-        vir_MOV_cond(c, V3D_QPU_COND_IFA,
+        if (was_top_level) {
+                cond = v3d_qpu_cond_invert(cond);
+        } else {
+                struct qinst *inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0),
+                                                  c->execute);
+                if (cond == V3D_QPU_COND_IFA) {
+                        vir_set_uf(inst, V3D_QPU_UF_NORNZ);
+                } else {
+                        vir_set_uf(inst, V3D_QPU_UF_ANDZ);
+                        cond = V3D_QPU_COND_IFA;
+                }
+        }
+
+        vir_MOV_cond(c, cond,
                       c->execute,
                       vir_uniform_ui(c, else_block->index));
  
@@ -1895,6 +1926,8 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
  
          c->loop_break_block = save_loop_break_block;
          c->loop_cont_block = save_loop_cont_block;
+
+        c->loops++;
  }
  
  static void
@@ -2007,24 +2040,6 @@ const nir_shader_compiler_options v3d_nir_options = {
          .native_integers = true,
  };
  
-
-#if 0
-static int
-count_nir_instrs(nir_shader *nir)
-{
-        int count = 0;
-        nir_foreach_function(function, nir) {
-                if (!function->impl)
-                        continue;
-                nir_foreach_block(block, function->impl) {
-                        nir_foreach_instr(instr, block)
-                                count++;
-                }
-        }
-        return count;
-}
-#endif
-
  /**
   * When demoting a shader down to single-threaded, removes the THRSW
   * instructions (one will still be inserted at v3d_vir_to_qpu() for the