i965/fs: Merge CMP and SEL into CSEL on Gen8+
authorIan Romanick <ian.d.romanick@intel.com>
Thu, 22 Feb 2018 02:06:56 +0000 (18:06 -0800)
committerIan Romanick <ian.d.romanick@intel.com>
Thu, 8 Mar 2018 23:26:26 +0000 (15:26 -0800)
v2: Fix several problems handling inverted predicates.  Add a much
bigger comment around the BRW_CONDITIONAL_NZ case.

v3: Allow uniforms and shader inputs as sources for the original SEL and
CMP instructions.  This enables a LOT more shaders to receive CSEL
merging (5816 vs 8564 on SKL).

v4: Report progress.

Broadwell and Skylake had similar results. (Broadwell shown)
helped: 8527
HURT: 0
helped stats (abs) min: 1 max: 27 x̄: 2.44 x̃: 1
helped stats (rel) min: 0.03% max: 17.80% x̄: 1.12% x̃: 0.70%
95% mean confidence interval for instructions value: -2.51 -2.36
95% mean confidence interval for instructions %-change: -1.15% -1.10%
Instructions are helped.

total cycles in shared programs: 559442317 -> 558288357 (-0.21%)
cycles in affected programs: 372699860 -> 371545900 (-0.31%)
helped: 6748
HURT: 1450
helped stats (abs) min: 1 max: 32000 x̄: 182.41 x̃: 12
helped stats (rel) min: <.01% max: 66.08% x̄: 3.42% x̃: 0.70%
HURT stats (abs)   min: 1 max: 2538 x̄: 53.08 x̃: 14
HURT stats (rel)   min: <.01% max: 96.72% x̄: 3.32% x̃: 0.90%
95% mean confidence interval for cycles value: -179.01 -102.51
95% mean confidence interval for cycles %-change: -2.37% -2.08%
Cycles are helped.

LOST:   0
GAINED: 6

No changes on earlier platforms.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com> [v1]
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> [v3]
Reviewed-by: Matt Turner <mattst88@gmail.com>
src/intel/compiler/brw_fs.cpp
src/intel/compiler/brw_fs.h

index 02a8ea0fd9dada60ef5a5f3867e4f9d202b1e3e1..422eedcf0af1d8c0328a8800660274df2ff137fa 100644 (file)
@@ -2843,6 +2843,106 @@ mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
    return ((1 << n) - 1) << shift;
 }
 
+bool
+fs_visitor::opt_peephole_csel()
+{
+   if (devinfo->gen < 8)
+      return false;
+
+   bool progress = false;
+
+   foreach_block_reverse(block, cfg) {
+      int ip = block->end_ip + 1;
+
+      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
+         ip--;
+
+         if (inst->opcode != BRW_OPCODE_SEL ||
+             inst->predicate != BRW_PREDICATE_NORMAL ||
+             (inst->dst.type != BRW_REGISTER_TYPE_F &&
+              inst->dst.type != BRW_REGISTER_TYPE_D &&
+              inst->dst.type != BRW_REGISTER_TYPE_UD))
+            continue;
+
+         /* Because it is a 3-src instruction, CSEL cannot have an immediate
+          * value as a source, but we can sometimes handle zero.
+          */
+         if ((inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
+              inst->src[0].file != UNIFORM) ||
+             (inst->src[1].file != VGRF && inst->src[1].file != ATTR &&
+              inst->src[1].file != UNIFORM && !inst->src[1].is_zero()))
+            continue;
+
+         foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+            if (!scan_inst->flags_written())
+               continue;
+
+            if ((scan_inst->opcode != BRW_OPCODE_CMP &&
+                 scan_inst->opcode != BRW_OPCODE_MOV) ||
+                scan_inst->predicate != BRW_PREDICATE_NONE ||
+                (scan_inst->src[0].file != VGRF &&
+                 scan_inst->src[0].file != ATTR &&
+                 scan_inst->src[0].file != UNIFORM) ||
+                scan_inst->src[0].type != BRW_REGISTER_TYPE_F)
+               break;
+
+            if (scan_inst->opcode == BRW_OPCODE_CMP && !scan_inst->src[1].is_zero())
+               break;
+
+            const brw::fs_builder ibld(this, block, inst);
+
+            const enum brw_conditional_mod cond =
+               inst->predicate_inverse
+               ? brw_negate_cmod(scan_inst->conditional_mod)
+               : scan_inst->conditional_mod;
+
+            fs_inst *csel_inst = NULL;
+
+            if (inst->src[1].file != IMM) {
+               csel_inst = ibld.CSEL(inst->dst,
+                                     inst->src[0],
+                                     inst->src[1],
+                                     scan_inst->src[0],
+                                     cond);
+            } else if (cond == BRW_CONDITIONAL_NZ) {
+               /* Consider the sequence
+                *
+                * cmp.nz.f0  null<1>F   g3<8,8,1>F   0F
+                * (+f0) sel  g124<1>UD  g2<8,8,1>UD  0x00000000UD
+                *
+                * The sel will pick the immediate value 0 if r0 is ±0.0.
+                * Therefore, this sequence is equivalent:
+                *
+                * cmp.nz.f0  null<1>F   g3<8,8,1>F   0F
+                * (+f0) sel  g124<1>F   g2<8,8,1>F   (abs)g3<8,8,1>F
+                *
+                * The abs is ensures that the result is 0UD when g3 is -0.0F.
+                * By normal cmp-sel merging, this is also equivalent:
+                *
+                * csel.nz    g124<1>F   g2<4,4,1>F   (abs)g3<4,4,1>F  g3<4,4,1>F
+                */
+               csel_inst = ibld.CSEL(inst->dst,
+                                     inst->src[0],
+                                     scan_inst->src[0],
+                                     scan_inst->src[0],
+                                     cond);
+
+               csel_inst->src[1].abs = true;
+            }
+
+            if (csel_inst != NULL) {
+               progress = true;
+               inst->remove(block);
+            }
+
+            break;
+         }
+      }
+   }
+
+   return progress;
+}
+
 bool
 fs_visitor::compute_to_mrf()
 {
@@ -6078,6 +6178,12 @@ fs_visitor::optimize()
       OPT(compact_virtual_grfs);
    } while (progress);
 
+   /* Do this after cmod propagation has had every possible opportunity to
+    * propagate results into SEL instructions.
+    */
+   if (OPT(opt_peephole_csel))
+      OPT(dead_code_eliminate);
+
    progress = false;
    pass_num = 0;
 
index 1b7df844696feb08babcde0d14a27ec5060ebea1..e384db809dcbb920f7162a87930993f875fc35fb 100644 (file)
@@ -191,6 +191,7 @@ public:
    fs_reg resolve_source_modifiers(const fs_reg &src);
    void emit_discard_jump();
    bool opt_peephole_sel();
+   bool opt_peephole_csel();
    bool opt_peephole_predicated_break();
    bool opt_saturate_propagation();
    bool opt_cmod_propagation();