v3d: Fold comparisons for IF conditions into the flags for the IF.
[mesa.git] / src / broadcom / compiler / nir_to_vir.c
index bb41c1be9718d3eaaba6bdacb540efb8937adced..9d20853061252b57bb8430ab83fa96b484a9f5a7 100644 (file)
@@ -268,6 +268,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
                 switch (instr->sampler_dim) {
                 case GLSL_SAMPLER_DIM_1D:
                 case GLSL_SAMPLER_DIM_2D:
+                case GLSL_SAMPLER_DIM_MS:
                 case GLSL_SAMPLER_DIM_3D:
                 case GLSL_SAMPLER_DIM_CUBE:
                         /* Don't minify the array size. */
@@ -495,92 +496,91 @@ declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size)
  * on the compare_instr's result.
  */
 static bool
-ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest,
+ntq_emit_comparison(struct v3d_compile *c,
                     nir_alu_instr *compare_instr,
-                    nir_alu_instr *sel_instr)
+                    enum v3d_qpu_cond *out_cond)
 {
         struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
         struct qreg src1;
         if (nir_op_infos[compare_instr->op].num_inputs > 1)
                 src1 = ntq_get_alu_src(c, compare_instr, 1);
         bool cond_invert = false;
+        struct qreg nop = vir_reg(QFILE_NULL, 0);
 
         switch (compare_instr->op) {
-        case nir_op_feq:
+        case nir_op_feq32:
         case nir_op_seq:
-                vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
+                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                 break;
-        case nir_op_ieq:
-                vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
+        case nir_op_ieq32:
+                vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                 break;
 
-        case nir_op_fne:
+        case nir_op_fne32:
         case nir_op_sne:
-                vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
+                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                 cond_invert = true;
                 break;
-        case nir_op_ine:
-                vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
+        case nir_op_ine32:
+                vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                 cond_invert = true;
                 break;
 
-        case nir_op_fge:
+        case nir_op_fge32:
         case nir_op_sge:
-                vir_PF(c, vir_FCMP(c, src1, src0), V3D_QPU_PF_PUSHC);
+                vir_set_pf(vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                 break;
-        case nir_op_ige:
-                vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
+        case nir_op_ige32:
+                vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                 cond_invert = true;
                 break;
-        case nir_op_uge:
-                vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
+        case nir_op_uge32:
+                vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                 cond_invert = true;
                 break;
 
         case nir_op_slt:
-        case nir_op_flt:
-                vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHN);
+        case nir_op_flt32:
+                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
                 break;
-        case nir_op_ilt:
-                vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
+        case nir_op_ilt32:
+                vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                 break;
-        case nir_op_ult:
-                vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
+        case nir_op_ult32:
+                vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                 break;
 
         default:
                 return false;
         }
 
-        enum v3d_qpu_cond cond = (cond_invert ?
-                                  V3D_QPU_COND_IFNA :
-                                  V3D_QPU_COND_IFA);
+        *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
 
-        switch (sel_instr->op) {
-        case nir_op_seq:
-        case nir_op_sne:
-        case nir_op_sge:
-        case nir_op_slt:
-                *dest = vir_SEL(c, cond,
-                                vir_uniform_f(c, 1.0), vir_uniform_f(c, 0.0));
-                break;
-
-        case nir_op_bcsel:
-                *dest = vir_SEL(c, cond,
-                                ntq_get_alu_src(c, sel_instr, 1),
-                                ntq_get_alu_src(c, sel_instr, 2));
-                break;
+        return true;
+}
 
-        default:
-                *dest = vir_SEL(c, cond,
-                                vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0));
-                break;
+/* Finds an ALU instruction that generates our src value that could
+ * (potentially) be greedily emitted in the consuming instruction.
+ */
+static struct nir_alu_instr *
+ntq_get_alu_parent(nir_src src)
+{
+        if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
+                return NULL;
+        nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
+        if (!instr)
+                return NULL;
+
+        /* If the ALU instr's srcs are non-SSA, then we would have to avoid
+         * moving emission of the ALU instr down past another write of the
+         * src.
+         */
+        for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+                if (!instr->src[i].src.is_ssa)
+                        return NULL;
         }
 
-        /* Make the temporary for nir_store_dest(). */
-        *dest = vir_MOV(c, *dest);
-
-        return true;
+        return instr;
 }
 
 /**
@@ -591,18 +591,13 @@ ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest,
 static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr,
                                   struct qreg *src)
 {
-        if (!instr->src[0].src.is_ssa)
-                goto out;
-        if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
-                goto out;
-        nir_alu_instr *compare =
-                nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+        nir_alu_instr *compare = ntq_get_alu_parent(instr->src[0].src);
         if (!compare)
                 goto out;
 
-        struct qreg dest;
-        if (ntq_emit_comparison(c, &dest, compare, instr))
-                return dest;
+        enum v3d_qpu_cond cond;
+        if (ntq_emit_comparison(c, compare, &cond))
+                return vir_MOV(c, vir_SEL(c, cond, src[1], src[2]));
 
 out:
         vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
@@ -682,14 +677,14 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
         case nir_op_u2f32:
                 result = vir_UTOF(c, src[0]);
                 break;
-        case nir_op_b2f:
+        case nir_op_b2f32:
                 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
                 break;
-        case nir_op_b2i:
+        case nir_op_b2i32:
                 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
                 break;
-        case nir_op_i2b:
-        case nir_op_f2b:
+        case nir_op_i2b32:
+        case nir_op_f2b32:
                 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
                 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
                                             vir_uniform_ui(c, ~0),
@@ -747,23 +742,36 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
         case nir_op_seq:
         case nir_op_sne:
         case nir_op_sge:
-        case nir_op_slt:
-        case nir_op_feq:
-        case nir_op_fne:
-        case nir_op_fge:
-        case nir_op_flt:
-        case nir_op_ieq:
-        case nir_op_ine:
-        case nir_op_ige:
-        case nir_op_uge:
-        case nir_op_ilt:
-        case nir_op_ult:
-                if (!ntq_emit_comparison(c, &result, instr, instr)) {
-                        fprintf(stderr, "Bad comparison instruction\n");
-                }
+        case nir_op_slt: {
+                enum v3d_qpu_cond cond;
+                MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+                assert(ok);
+                result = vir_MOV(c, vir_SEL(c, cond,
+                                            vir_uniform_f(c, 1.0),
+                                            vir_uniform_f(c, 0.0)));
+                break;
+        }
+
+        case nir_op_feq32:
+        case nir_op_fne32:
+        case nir_op_fge32:
+        case nir_op_flt32:
+        case nir_op_ieq32:
+        case nir_op_ine32:
+        case nir_op_ige32:
+        case nir_op_uge32:
+        case nir_op_ilt32:
+        case nir_op_ult32: {
+                enum v3d_qpu_cond cond;
+                MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+                assert(ok);
+                result = vir_MOV(c, vir_SEL(c, cond,
+                                            vir_uniform_ui(c, ~0),
+                                            vir_uniform_ui(c, 0)));
                 break;
+        }
 
-        case nir_op_bcsel:
+        case nir_op_b32csel:
                 result = ntq_emit_bcsel(c, instr, src);
                 break;
         case nir_op_fcsel:
@@ -850,6 +858,9 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
                 break;
 
         case nir_op_unpack_half_2x16_split_x:
+                /* XXX perf: It would be good to be able to merge this unpack
+                 * with whatever uses our result.
+                 */
                 result = vir_FMOV(c, src[0]);
                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
                 break;
@@ -896,6 +907,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
 #define TLB_TYPE_DEPTH             ((2 << 6) | (0 << 4))
 #define TLB_DEPTH_TYPE_INVARIANT   (0 << 2) /* Unmodified sideband input used */
 #define TLB_DEPTH_TYPE_PER_PIXEL   (1 << 2) /* QPU result used */
+#define TLB_V42_DEPTH_TYPE_INVARIANT   (0 << 3) /* Unmodified sideband input used */
+#define TLB_V42_DEPTH_TYPE_PER_PIXEL   (1 << 3) /* QPU result used */
 
 /* Stencil is a single 32-bit write. */
 #define TLB_TYPE_STENCIL_ALPHA     ((2 << 6) | (1 << 4))
@@ -929,12 +942,16 @@ emit_frag_end(struct v3d_compile *c)
                 struct qinst *inst = vir_MOV_dest(c,
                                                   vir_reg(QFILE_TLBU, 0),
                                                   c->outputs[c->output_position_index]);
+                uint8_t tlb_specifier = TLB_TYPE_DEPTH;
+
+                if (c->devinfo->ver >= 42) {
+                        tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
+                                          TLB_SAMPLE_MODE_PER_PIXEL);
+                } else
+                        tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
 
                 inst->src[vir_get_implicit_uniform_src(inst)] =
-                        vir_uniform_ui(c,
-                                       TLB_TYPE_DEPTH |
-                                       TLB_DEPTH_TYPE_PER_PIXEL |
-                                       0xffffff00);
+                        vir_uniform_ui(c, tlb_specifier | 0xffffff00);
         } else if (c->s->info.fs.uses_discard ||
                    c->fs_key->sample_alpha_to_coverage ||
                    !has_any_tlb_color_write) {
@@ -951,12 +968,20 @@ emit_frag_end(struct v3d_compile *c)
                 struct qinst *inst = vir_MOV_dest(c,
                                                   vir_reg(QFILE_TLBU, 0),
                                                   vir_reg(QFILE_NULL, 0));
+                uint8_t tlb_specifier = TLB_TYPE_DEPTH;
+
+                if (c->devinfo->ver >= 42) {
+                        /* The spec says the PER_PIXEL flag is ignored for
+                         * invariant writes, but the simulator demands it.
+                         */
+                        tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
+                                          TLB_SAMPLE_MODE_PER_PIXEL);
+                } else {
+                        tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
+                }
 
                 inst->src[vir_get_implicit_uniform_src(inst)] =
-                        vir_uniform_ui(c,
-                                       TLB_TYPE_DEPTH |
-                                       TLB_DEPTH_TYPE_INVARIANT |
-                                       0xffffff00);
+                        vir_uniform_ui(c, tlb_specifier | 0xffffff00);
         }
 
         /* XXX: Performance improvement: Merge Z write and color writes TLB
@@ -1224,7 +1249,7 @@ v3d_optimize_nir(struct nir_shader *s)
                 NIR_PASS(progress, s, nir_opt_dce);
                 NIR_PASS(progress, s, nir_opt_dead_cf);
                 NIR_PASS(progress, s, nir_opt_cse);
-                NIR_PASS(progress, s, nir_opt_peephole_select, 8);
+                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
                 NIR_PASS(progress, s, nir_opt_algebraic);
                 NIR_PASS(progress, s, nir_opt_constant_folding);
                 NIR_PASS(progress, s, nir_opt_undef);
@@ -1475,6 +1500,10 @@ ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
 static void
 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
 {
+        /* XXX perf: Experiment with using immediate loads to avoid having
+         * these end up in the uniform stream.  Watch out for breaking the
+         * small immediates optimization in the process!
+         */
         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
         for (int i = 0; i < instr->def.num_components; i++)
                 qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
@@ -1497,15 +1526,14 @@ ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr)
 static void
 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
-        nir_const_value *const_offset;
         unsigned offset;
 
         switch (instr->intrinsic) {
         case nir_intrinsic_load_uniform:
                 assert(instr->num_components == 1);
-                const_offset = nir_src_as_const_value(instr->src[0]);
-                if (const_offset) {
-                        offset = nir_intrinsic_base(instr) + const_offset->u32[0];
+                if (nir_src_is_const(instr->src[0])) {
+                        offset = (nir_intrinsic_base(instr) +
+                                  nir_src_as_uint(instr->src[0]));
                         assert(offset % 4 == 0);
                         /* We need dwords */
                         offset = offset / 4;
@@ -1520,8 +1548,13 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 
         case nir_intrinsic_load_ubo:
                 for (int i = 0; i < instr->num_components; i++) {
-                        int ubo = nir_src_as_const_value(instr->src[0])->u32[0];
+                        int ubo = nir_src_as_uint(instr->src[0]);
 
+                        /* XXX perf: On V3D 4.x with uniform offsets, we
+                         * should probably try setting UBOs up in the A
+                         * register file and doing a sequence of loads that
+                         * way.
+                         */
                         /* Adjust for where we stored the TGSI register base. */
                         vir_ADD_dest(c,
                                      vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
@@ -1536,9 +1569,9 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 }
                 break;
 
-                const_offset = nir_src_as_const_value(instr->src[0]);
-                if (const_offset) {
-                        offset = nir_intrinsic_base(instr) + const_offset->u32[0];
+                if (nir_src_is_const(instr->src[0])) {
+                        offset = (nir_intrinsic_base(instr) +
+                                  nir_src_as_uint(instr->src[0]));
                         assert(offset % 4 == 0);
                         /* We need dwords */
                         offset = offset / 4;
@@ -1566,8 +1599,15 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_load_sample_mask_in:
+                ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
+                break;
+
+        case nir_intrinsic_load_helper_invocation:
+                vir_PF(c, vir_MSF(c), V3D_QPU_PF_PUSHZ);
                 ntq_store_dest(c, &instr->dest, 0,
-                               vir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
+                               vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
+                                                  vir_uniform_ui(c, ~0),
+                                                  vir_uniform_ui(c, 0))));
                 break;
 
         case nir_intrinsic_load_front_face:
@@ -1589,10 +1629,9 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_load_input:
-                const_offset = nir_src_as_const_value(instr->src[0]);
-                assert(const_offset && "v3d doesn't support indirect inputs");
                 for (int i = 0; i < instr->num_components; i++) {
-                        offset = nir_intrinsic_base(instr) + const_offset->u32[0];
+                        offset = (nir_intrinsic_base(instr) +
+                                  nir_src_as_uint(instr->src[0]));
                         int comp = nir_intrinsic_component(instr) + i;
                         ntq_store_dest(c, &instr->dest, i,
                                        vir_MOV(c, c->inputs[offset * 4 + comp]));
@@ -1600,10 +1639,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_store_output:
-                const_offset = nir_src_as_const_value(instr->src[1]);
-                assert(const_offset && "v3d doesn't support indirect outputs");
                 offset = ((nir_intrinsic_base(instr) +
-                           const_offset->u32[0]) * 4 +
+                           nir_src_as_uint(instr->src[1])) * 4 +
                           nir_intrinsic_component(instr));
 
                 for (int i = 0; i < instr->num_components; i++) {
@@ -1660,12 +1697,19 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 
 /* Clears (activates) the execute flags for any channels whose jump target
  * matches this block.
+ *
+ * XXX perf: Could we be using flpush/flpop somehow for our execution channel
+ * enabling?
+ *
+ * XXX perf: For uniform control flow, we should be able to skip c->execute
+ * handling entirely.
  */
 static void
 ntq_activate_execute_for_block(struct v3d_compile *c)
 {
-        vir_PF(c, vir_XOR(c, c->execute, vir_uniform_ui(c, c->cur_block->index)),
-               V3D_QPU_PF_PUSHZ);
+        vir_set_pf(vir_XOR_dest(c, vir_reg(QFILE_NULL, 0),
+                                c->execute, vir_uniform_ui(c, c->cur_block->index)),
+                   V3D_QPU_PF_PUSHZ);
 
         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
 }
@@ -1692,15 +1736,33 @@ ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt)
                 was_top_level = true;
         }
 
-        /* Set A for executing (execute == 0) and jumping (if->condition ==
-         * 0) channels, and then update execute flags for those to point to
-         * the ELSE block.
+        /* Set up the flags for the IF condition (taking the THEN branch). */
+        nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition);
+        enum v3d_qpu_cond cond;
+        if (!if_condition_alu ||
+            !ntq_emit_comparison(c, if_condition_alu, &cond)) {
+                vir_PF(c, ntq_get_src(c, if_stmt->condition, 0),
+                       V3D_QPU_PF_PUSHZ);
+                cond = V3D_QPU_COND_IFNA;
+        }
+
+        /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
+         * was previously active (execute Z) for updating the exec flags.
          */
-        vir_PF(c, vir_OR(c,
-                         c->execute,
-                         ntq_get_src(c, if_stmt->condition, 0)),
-                V3D_QPU_PF_PUSHZ);
-        vir_MOV_cond(c, V3D_QPU_COND_IFA,
+        if (was_top_level) {
+                cond = v3d_qpu_cond_invert(cond);
+        } else {
+                struct qinst *inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0),
+                                                  c->execute);
+                if (cond == V3D_QPU_COND_IFA) {
+                        vir_set_uf(inst, V3D_QPU_UF_NORNZ);
+                } else {
+                        vir_set_uf(inst, V3D_QPU_UF_ANDZ);
+                        cond = V3D_QPU_COND_IFA;
+                }
+        }
+
+        vir_MOV_cond(c, cond,
                      c->execute,
                      vir_uniform_ui(c, else_block->index));
 
@@ -1864,6 +1926,8 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
 
         c->loop_break_block = save_loop_break_block;
         c->loop_cont_block = save_loop_cont_block;
+
+        c->loops++;
 }
 
 static void
@@ -1916,6 +1980,10 @@ nir_to_vir(struct v3d_compile *c)
                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
 
+                /* XXX perf: We could set the "disable implicit point/line
+                 * varyings" field in the shader record and not emit these, if
+                 * they're not going to be used.
+                 */
                 if (c->fs_key->is_points) {
                         c->point_x = emit_fragment_varying(c, NULL, 0);
                         c->point_y = emit_fragment_varying(c, NULL, 0);
@@ -1972,24 +2040,6 @@ const nir_shader_compiler_options v3d_nir_options = {
         .native_integers = true,
 };
 
-
-#if 0
-static int
-count_nir_instrs(nir_shader *nir)
-{
-        int count = 0;
-        nir_foreach_function(function, nir) {
-                if (!function->impl)
-                        continue;
-                nir_foreach_block(block, function->impl) {
-                        nir_foreach_instr(instr, block)
-                                count++;
-                }
-        }
-        return count;
-}
-#endif
-
 /**
  * When demoting a shader down to single-threaded, removes the THRSW
  * instructions (one will still be inserted at v3d_vir_to_qpu() for the
@@ -2110,7 +2160,14 @@ v3d_nir_to_vir(struct v3d_compile *c)
 
         vir_check_payload_w(c);
 
-        /* XXX: vir_schedule_instructions(c); */
+        /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
+         * We used that on that platform to pipeline TMU writes and reduce the
+         * number of thread switches, as well as try (mostly successfully) to
+         * reduce maximum register pressure to allow more threads.  We should
+         * do something of that sort for V3D -- either instruction scheduling
+         * here, or delay the the THRSW and LDTMUs from our texture
+         * instructions until the results are needed.
+         */
 
         if (V3D_DEBUG & (V3D_DEBUG_VIR |
                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {