intel/compiler: split is_partial_write() into two variants
authorIago Toral Quiroga <itoral@igalia.com>
Tue, 10 Jul 2018 07:52:46 +0000 (09:52 +0200)
committerJuan A. Suarez Romero <jasuarez@igalia.com>
Thu, 18 Apr 2019 09:05:18 +0000 (11:05 +0200)
This function is used in two different scenarios that for 32-bit
instructions are the same, but for 16-bit instructions are not.

One scenario is that in which we are working at a SIMD8 register
level and we need to know if a register is fully defined or written.
This is useful, for example, in the context of liveness analysis or
register allocation, where we work with units of registers.

The other scenario is that in which we want to know if an instruction
is writing a full scalar component or just some subset of it. This is
useful, for example, in the context of some optimization passes
like copy propagation.

For 32-bit instructions (or larger), a SIMD8 dispatch will always write
at least a full SIMD8 register (32B) if the write is not partial. The
function is_partial_write() checks this to determine if we have a partial
write. However, when we deal with 16-bit instructions, that logic disables
some optimizations that should be safe. For example, a SIMD8 16-bit MOV will
only update half of a SIMD register, but it is still a complete write of the
variable for a SIMD8 dispatch, so we should not prevent copy propagation in
this scenario because we don't write all 32 bytes in the SIMD register
or because the write starts at offset 16B (wehere we pack components Y or
W of 16-bit vectors).

This is a problem for SIMD8 executions (VS, TCS, TES, GS) of 16-bit
instructions, which lose a number of optimizations because of this, most
important of which is copy-propagation.

This patch splits is_partial_write() into is_partial_reg_write(), which
represents the current is_partial_write(), useful for things like
liveness analysis, and is_partial_var_write(), which considers
the dispatch size to check if we are writing a full variable (rather
than a full register) to decide if the write is partial or not, which
is what we really want in many optimization passes.

Then the patch goes on and rewrites all uses of is_partial_write() to use
one or the other version. Specifically, we use is_partial_var_write()
in the following places: copy propagation, cmod propagation, common
subexpression elimination, saturate propagation and sel peephole.

Notice that the semantics of is_partial_var_write() exactly match the
current implementation of is_partial_write() for anything that is
32-bit or larger, so no changes are expected for 32-bit instructions.

Tested against ~5000 tests involving 16-bit instructions in CTS produced
the following changes in instruction counts:

            Patched  |     Master    |    %    |
================================================
SIMD8  |    621,900  |    706,721    | -12.00% |
================================================
SIMD16 |     93,252  |     93,252    |   0.00% |
================================================

As expected, the change only affects SIMD8 dispatches.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
src/intel/compiler/brw_fs.cpp
src/intel/compiler/brw_fs_cmod_propagation.cpp
src/intel/compiler/brw_fs_copy_propagation.cpp
src/intel/compiler/brw_fs_cse.cpp
src/intel/compiler/brw_fs_dead_code_eliminate.cpp
src/intel/compiler/brw_fs_live_variables.cpp
src/intel/compiler/brw_fs_reg_allocate.cpp
src/intel/compiler/brw_fs_register_coalesce.cpp
src/intel/compiler/brw_fs_saturate_propagation.cpp
src/intel/compiler/brw_fs_sel_peephole.cpp
src/intel/compiler/brw_ir_fs.h

index 15f51b038cba5f62173af665dd0ef8915627b2ae..ebcdd52d668b5ce6d91d75fef668a4874cc92cbe 100644 (file)
@@ -734,14 +734,33 @@ fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
  * it.
  */
 bool
-fs_inst::is_partial_write() const
+fs_inst::is_partial_reg_write() const
 {
    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
-           (this->exec_size * type_sz(this->dst.type)) < 32 ||
            !this->dst.is_contiguous() ||
+           (this->exec_size * type_sz(this->dst.type)) < REG_SIZE ||
            this->dst.offset % REG_SIZE != 0);
 }
 
+/**
+ * Returns true if the instruction has a flag that means it won't
+ * update an entire variable for the given dispatch width.
+ *
+ * This is only different from is_partial_reg_write() for SIMD8
+ * dispatches of 16-bit (or smaller) instructions.
+ */
+bool
+fs_inst::is_partial_var_write(uint32_t dispatch_width) const
+{
+   const uint32_t type_size = type_sz(this->dst.type);
+   uint32_t var_size = MIN2(REG_SIZE, dispatch_width * type_size);
+
+   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
+           !this->dst.is_contiguous() ||
+           (this->exec_size * type_sz(this->dst.type)) < var_size ||
+           this->dst.offset % var_size != 0);
+}
+
 unsigned
 fs_inst::components_read(unsigned i) const
 {
@@ -2956,7 +2975,7 @@ fs_visitor::opt_register_renaming()
       if (depth == 0 &&
           inst->dst.file == VGRF &&
           alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
-          !inst->is_partial_write()) {
+          !inst->is_partial_reg_write()) {
          if (remap[dst] == ~0u) {
             remap[dst] = dst;
          } else {
@@ -3160,7 +3179,7 @@ fs_visitor::compute_to_mrf()
       next_ip++;
 
       if (inst->opcode != BRW_OPCODE_MOV ||
-         inst->is_partial_write() ||
+         inst->is_partial_reg_write() ||
          inst->dst.file != MRF || inst->src[0].file != VGRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate ||
@@ -3193,7 +3212,7 @@ fs_visitor::compute_to_mrf()
             * that writes that reg, but it would require smarter
             * tracking.
             */
-           if (scan_inst->is_partial_write())
+           if (scan_inst->is_partial_reg_write())
               break;
 
             /* Handling things not fully contained in the source of the copy
@@ -3511,7 +3530,7 @@ fs_visitor::remove_duplicate_mrf_writes()
       if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file != ARF &&
-         !inst->is_partial_write()) {
+         !inst->is_partial_reg_write()) {
          last_mrf_move[inst->dst.nr] = inst;
       }
    }
index 14852e52e6e45bd582ae20f6ec34c857622a12d6..a821d4e54814dabf52836e801c1df58d19b23625 100644 (file)
 
 static bool
 cmod_propagate_cmp_to_add(const gen_device_info *devinfo, bblock_t *block,
-                          fs_inst *inst)
+                          fs_inst *inst, unsigned dispatch_width)
 {
    bool read_flag = false;
 
    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
       if (scan_inst->opcode == BRW_OPCODE_ADD &&
-          !scan_inst->is_partial_write() &&
+          !scan_inst->is_partial_var_write(dispatch_width) &&
           scan_inst->exec_size == inst->exec_size) {
          bool negate;
 
@@ -126,7 +126,7 @@ cmod_propagate_cmp_to_add(const gen_device_info *devinfo, bblock_t *block,
  */
 static bool
 cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
-                   fs_inst *inst)
+                   fs_inst *inst, unsigned dispatch_width)
 {
    const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
    bool read_flag = false;
@@ -141,7 +141,7 @@ cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
              scan_inst->opcode != BRW_OPCODE_AND)
             break;
 
-         if (scan_inst->is_partial_write() ||
+         if (scan_inst->is_partial_var_write(dispatch_width) ||
              scan_inst->dst.offset != inst->src[0].offset ||
              scan_inst->exec_size != inst->exec_size)
             break;
@@ -166,7 +166,9 @@ cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
 }
 
 static bool
-opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
+opt_cmod_propagation_local(const gen_device_info *devinfo,
+                           bblock_t *block,
+                           unsigned dispatch_width)
 {
    bool progress = false;
    int ip = block->end_ip + 1;
@@ -219,14 +221,14 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
        */
       if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
          if (brw_reg_type_is_floating_point(inst->src[0].type) &&
-             cmod_propagate_cmp_to_add(devinfo, block, inst))
+             cmod_propagate_cmp_to_add(devinfo, block, inst, dispatch_width))
             progress = true;
 
          continue;
       }
 
       if (inst->opcode == BRW_OPCODE_NOT) {
-         progress = cmod_propagate_not(devinfo, block, inst) || progress;
+         progress = cmod_propagate_not(devinfo, block, inst, dispatch_width) || progress;
          continue;
       }
 
@@ -234,7 +236,7 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                              inst->src[0], inst->size_read(0))) {
-            if (scan_inst->is_partial_write() ||
+            if (scan_inst->is_partial_var_write(dispatch_width) ||
                 scan_inst->dst.offset != inst->src[0].offset ||
                 scan_inst->exec_size != inst->exec_size)
                break;
@@ -364,7 +366,7 @@ fs_visitor::opt_cmod_propagation()
    bool progress = false;
 
    foreach_block_reverse(block, cfg) {
-      progress = opt_cmod_propagation_local(devinfo, block) || progress;
+      progress = opt_cmod_propagation_local(devinfo, block, dispatch_width) || progress;
    }
 
    if (progress)
index 1f4e122e6c92190a0e016ebc2f34320256585056..1670eedce77bbb6a87c0f100fd7cecfc14e347b5 100644 (file)
@@ -515,7 +515,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    /* Compute the first component of the copy that the instruction is
     * reading, and the base byte offset within that component.
     */
-   assert(entry->dst.offset % REG_SIZE == 0 && entry->dst.stride == 1);
+   assert(entry->dst.stride == 1);
    const unsigned component = rel_offset / type_sz(entry->dst.type);
    const unsigned suboffset = rel_offset % type_sz(entry->dst.type);
 
@@ -767,7 +767,7 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
 }
 
 static bool
-can_propagate_from(fs_inst *inst)
+can_propagate_from(fs_inst *inst, unsigned dispatch_width)
 {
    return (inst->opcode == BRW_OPCODE_MOV &&
            inst->dst.file == VGRF &&
@@ -778,7 +778,7 @@ can_propagate_from(fs_inst *inst)
             inst->src[0].file == UNIFORM ||
             inst->src[0].file == IMM) &&
            inst->src[0].type == inst->dst.type &&
-           !inst->is_partial_write());
+           !inst->is_partial_var_write(dispatch_width));
 }
 
 /* Walks a basic block and does copy propagation on it using the acp
@@ -830,7 +830,7 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
       /* If this instruction's source could potentially be folded into the
        * operand of another instruction, add it to the ACP.
        */
-      if (can_propagate_from(inst)) {
+      if (can_propagate_from(inst, dispatch_width)) {
          acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
          entry->dst = inst->dst;
          entry->src = inst->src[0];
index 6efa111b1a43b3480cb58df1374cae6f0a4b74a2..e955c35e145aa45a26cfff1b9ebd42f507e833cd 100644 (file)
@@ -252,7 +252,8 @@ fs_visitor::opt_cse_local(bblock_t *block)
    int ip = block->start_ip;
    foreach_inst_in_block(fs_inst, inst, block) {
       /* Skip some cases. */
-      if (is_expression(this, inst) && !inst->is_partial_write() &&
+      if (is_expression(this, inst) &&
+          !inst->is_partial_var_write(dispatch_width) &&
           ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
            inst->dst.is_null()))
       {
index 38ae1d41a6ad65fecf603f0d3e1bcd6fd85ac13a..7c60a33eb58833c483f11827f9b5b6ed99355479 100644 (file)
@@ -107,7 +107,7 @@ fs_visitor::dead_code_eliminate()
          }
 
          if (inst->dst.file == VGRF) {
-            if (!inst->is_partial_write()) {
+            if (!inst->is_partial_reg_write()) {
                int var = live_intervals->var_from_reg(inst->dst);
                for (unsigned i = 0; i < regs_written(inst); i++) {
                   BITSET_CLEAR(live, var + i);
index 059f076fa5153ebcc1f05c905c55e31cff677e3c..30625aa586aeee97d62c490722a79c0b784b125e 100644 (file)
@@ -84,7 +84,7 @@ fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
     * screens off previous updates of that variable (VGRF channel).
     */
    if (inst->dst.file == VGRF) {
-      if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
+      if (!inst->is_partial_reg_write() && !BITSET_TEST(bd->use, var))
          BITSET_SET(bd->def, var);
 
       BITSET_SET(bd->defout, var);
index b3825f1ef8c11065038e359a27c456cba68d4312..94b1815349231f2722d81ac278b8cf219347de22 100644 (file)
@@ -1054,7 +1054,7 @@ fs_visitor::spill_reg(unsigned spill_reg)
           * write, there should be no need for the unspill since the
           * instruction will be overwriting the whole destination in any case.
          */
-         if (inst->is_partial_write() ||
+         if (inst->is_partial_reg_write() ||
              (!inst->force_writemask_all && !per_channel))
             emit_unspill(ubld, spill_src, subset_spill_offset,
                          regs_written(inst));
index 4fe6773da54b43e2b6d25dc85b90ed578ed5f38e..27234292c30b70d4dd92c08c377ca2670351df2e 100644 (file)
@@ -70,7 +70,7 @@ is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
 {
    if ((inst->opcode != BRW_OPCODE_MOV &&
         inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) ||
-       inst->is_partial_write() ||
+       inst->is_partial_reg_write() ||
        inst->saturate ||
        inst->src[0].file != VGRF ||
        inst->src[0].negate ||
index d6cfa79a618fa2499541d8cf1c0ff57d6861bb3d..1e1461063aef9b162f4dc3890a60653be22e3c6b 100644 (file)
@@ -43,7 +43,8 @@
  */
 
 static bool
-opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
+opt_saturate_propagation_local(fs_visitor *v, bblock_t *block,
+                               unsigned dispatch_width)
 {
    bool progress = false;
    int ip = block->end_ip + 1;
@@ -66,7 +67,7 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                              inst->src[0], inst->size_read(0))) {
-            if (scan_inst->is_partial_write() ||
+            if (scan_inst->is_partial_var_write(dispatch_width) ||
                 (scan_inst->dst.type != inst->dst.type &&
                  !scan_inst->can_change_types()))
                break;
@@ -153,7 +154,7 @@ fs_visitor::opt_saturate_propagation()
    calculate_live_intervals();
 
    foreach_block (block, cfg) {
-      progress = opt_saturate_propagation_local(this, block) || progress;
+      progress = opt_saturate_propagation_local(this, block, dispatch_width) || progress;
    }
 
    /* Live intervals are still valid. */
index 6395b409b7c54713aff1d408b9938999b6703992..98d640a3bfefc28615c8e0cb1538d2af71e3d4d1 100644 (file)
@@ -167,8 +167,8 @@ fs_visitor::opt_peephole_sel()
              then_mov[i]->exec_size != else_mov[i]->exec_size ||
              then_mov[i]->group != else_mov[i]->group ||
              then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all ||
-             then_mov[i]->is_partial_write() ||
-             else_mov[i]->is_partial_write() ||
+             then_mov[i]->is_partial_var_write(dispatch_width) ||
+             else_mov[i]->is_partial_var_write(dispatch_width) ||
              then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE ||
              else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) {
             movs = i;
index 56a4bdc6e526302409fde11f31d25b4365f5a4bf..87e03258d932720e23f4001d5a817185da7a1e7d 100644 (file)
@@ -348,7 +348,8 @@ public:
    void resize_sources(uint8_t num_sources);
 
    bool is_send_from_grf() const;
-   bool is_partial_write() const;
+   bool is_partial_reg_write() const;
+   bool is_partial_var_write(unsigned dispatch_width) const;
    bool is_copy_payload(const brw::simple_allocator &grf_alloc) const;
    unsigned components_read(unsigned i) const;
    unsigned size_read(int arg) const;