intel/fs,vec4: Pull stall logic for memory fences up into the IR

[mesa.git] / src / intel / compiler / brw_fs_copy_propagation.cpp
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp

index e6eb174e0f2c4163a54d58b0a5ab17925b22084a..a33043573a53a23f46b1f3545c660887bf35430a 100644 (file)
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -48,8 +48,8 @@ struct acp_entry : public exec_node {
     fs_reg dst;
     fs_reg src;
     unsigned global_idx;
-   uint8_t size_written;
-   uint8_t size_read;
+   unsigned size_written;
+   unsigned size_read;
     enum opcode opcode;
     bool saturate;
  };
@@ -94,7 +94,7 @@ class fs_copy_prop_dataflow
  {
  public:
     fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
-                         const fs_live_variables *live,
+                         const fs_live_variables &live,
                           exec_list *out_acp[ACP_HASH_SIZE]);
  
     void setup_initial_values();
@@ -104,7 +104,7 @@ public:
  
     void *mem_ctx;
     cfg_t *cfg;
-   const fs_live_variables *live;
+   const fs_live_variables &live;
  
     acp_entry **acp;
     int num_acp;
@@ -115,7 +115,7 @@ public:
  } /* anonymous namespace */
  
  fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
-                                             const fs_live_variables *live,
+                                             const fs_live_variables &live,
                                               exec_list *out_acp[ACP_HASH_SIZE])
     : mem_ctx(mem_ctx), cfg(cfg), live(live)
  {
@@ -265,8 +265,8 @@ fs_copy_prop_dataflow::setup_initial_values()
        for (int i = 0; i < num_acp; i++) {
           BITSET_SET(bd[block->num].undef, i);
           for (unsigned off = 0; off < acp[i]->size_written; off += REG_SIZE) {
-            if (BITSET_TEST(live->block_data[block->num].defout,
-                            live->var_from_reg(byte_offset(acp[i]->dst, off))))
+            if (BITSET_TEST(live.block_data[block->num].defout,
+                            live.var_from_reg(byte_offset(acp[i]->dst, off))))
                 BITSET_CLEAR(bd[block->num].undef, i);
           }
        }
@@ -452,10 +452,24 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
     if (entry->src.file == IMM)
        return false;
     assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
-          entry->src.file == ATTR);
-
+          entry->src.file == ATTR || entry->src.file == FIXED_GRF);
+
+   /* Avoid propagating a LOAD_PAYLOAD instruction into another if there is a
+    * good chance that we'll be able to eliminate the latter through register
+    * coalescing.  If only part of the sources of the second LOAD_PAYLOAD can
+    * be simplified through copy propagation we would be making register
+    * coalescing impossible, ending up with unnecessary copies in the program.
+    * This is also the case for is_multi_copy_payload() copies that can only
+    * be coalesced when the instruction is lowered into a sequence of MOVs.
+    *
+    * Worse -- In cases where the ACP entry was the result of CSE combining
+    * multiple LOAD_PAYLOAD subexpressions, propagating the first LOAD_PAYLOAD
+    * into the second would undo the work of CSE, leading to an infinite
+    * optimization loop.  Avoid this by detecting LOAD_PAYLOAD copies from CSE
+    * temporaries which should match is_coalescing_payload().
+    */
     if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
-       inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
+       (is_coalescing_payload(alloc, inst) || is_multi_copy_payload(inst)))
        return false;
  
     assert(entry->dst.file == VGRF);
@@ -469,6 +483,21 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
                              entry->dst, entry->size_written))
        return false;
  
+   /* Avoid propagating a FIXED_GRF register into an EOT instruction in order
+    * for any register allocation restrictions to be applied.
+    */
+   if (entry->src.file == FIXED_GRF && inst->eot)
+      return false;
+
+   /* Avoid propagating odd-numbered FIXED_GRF registers into the first source
+    * of a LINTERP instruction on platforms where the PLN instruction has
+    * register alignment restrictions.
+    */
+   if (devinfo->has_pln && devinfo->gen <= 6 &&
+       entry->src.file == FIXED_GRF && (entry->src.nr & 1) &&
+       inst->opcode == FS_OPCODE_LINTERP && arg == 0)
+      return false;
+
     /* we can't generally copy-propagate UD negations because we
      * can end up accessing the resulting values as signed integers
      * instead. See also resolve_ud_negate() and comment in
@@ -493,16 +522,30 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
      * derivatives, assume that their operands are packed so we can't
      * generally propagate strided regions to them.
      */
-   if (instruction_requires_packed_data(inst) && entry->src.stride > 1)
+   const unsigned entry_stride = (entry->src.file == FIXED_GRF ? 1 :
+                                  entry->src.stride);
+   if (instruction_requires_packed_data(inst) && entry_stride > 1)
        return false;
  
     /* Bail if the result of composing both strides would exceed the
      * hardware limit.
      */
-   if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride,
+   if (!can_take_stride(inst, arg, entry_stride * inst->src[arg].stride,
                          devinfo))
        return false;
  
+   /* Bail if the source FIXED_GRF region of the copy cannot be trivially
+    * composed with the source region of the instruction -- E.g. because the
+    * copy uses some extended stride greater than 4 not supported natively by
+    * the hardware as a horizontal stride, or because instruction compression
+    * could require us to use a vertical stride shorter than a GRF.
+    */
+   if (entry->src.file == FIXED_GRF &&
+       (inst->src[arg].stride > 4 ||
+        inst->dst.component_size(inst->exec_size) >
+        inst->src[arg].component_size(inst->exec_size)))
+      return false;
+
     /* Bail if the instruction type is larger than the execution type of the
      * copy, what implies that each channel is reading multiple channels of the
      * destination of the copy, and simply replacing the sources would give a
@@ -524,7 +567,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
      *
      * Which would have different semantics.
      */
-   if (entry->src.stride != 1 &&
+   if (entry_stride != 1 &&
         (inst->src[arg].stride *
          type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
        return false;
@@ -562,13 +605,42 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
        }
     }
  
+   /* Save the offset of inst->src[arg] relative to entry->dst for it to be
+    * applied later.
+    */
+   const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+
+   /* Fold the copy into the instruction consuming it. */
     inst->src[arg].file = entry->src.file;
     inst->src[arg].nr = entry->src.nr;
-   inst->src[arg].stride *= entry->src.stride;
-   inst->saturate = inst->saturate || entry->saturate;
+   inst->src[arg].subnr = entry->src.subnr;
+   inst->src[arg].offset = entry->src.offset;
+
+   /* Compose the strides of both regions. */
+   if (entry->src.file == FIXED_GRF) {
+      if (inst->src[arg].stride) {
+         const unsigned orig_width = 1 << entry->src.width;
+         const unsigned reg_width = REG_SIZE / (type_sz(inst->src[arg].type) *
+                                                inst->src[arg].stride);
+         inst->src[arg].width = cvt(MIN2(orig_width, reg_width)) - 1;
+         inst->src[arg].hstride = cvt(inst->src[arg].stride);
+         inst->src[arg].vstride = inst->src[arg].hstride + inst->src[arg].width;
+      } else {
+         inst->src[arg].vstride = inst->src[arg].hstride =
+            inst->src[arg].width = 0;
+      }
  
-   /* Compute the offset of inst->src[arg] relative to entry->dst */
-   const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+      inst->src[arg].stride = 1;
+
+      /* Hopefully no Align16 around here... */
+      assert(entry->src.swizzle == BRW_SWIZZLE_XYZW);
+      inst->src[arg].swizzle = entry->src.swizzle;
+   } else {
+      inst->src[arg].stride *= entry->src.stride;
+   }
+
+   /* Compose any saturate modifiers. */
+   inst->saturate = inst->saturate || entry->saturate;
  
     /* Compute the first component of the copy that the instruction is
      * reading, and the base byte offset within that component.
@@ -580,9 +652,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
     /* Calculate the byte offset at the origin of the copy of the given
      * component and suboffset.
      */
-   inst->src[arg].offset = suboffset +
-      component * entry->src.stride * type_sz(entry->src.type) +
-      entry->src.offset;
+   inst->src[arg] = byte_offset(inst->src[arg],
+      component * entry_stride * type_sz(entry->src.type) + suboffset);
  
     if (has_source_modifiers) {
        if (entry->dst.type != inst->src[arg].type) {
@@ -752,7 +823,11 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
           if (i == 1) {
              inst->src[i] = val;
              progress = true;
-         } else if (i == 0 && inst->src[1].file != IMM) {
+         } else if (i == 0 && inst->src[1].file != IMM &&
+                    (inst->conditional_mod == BRW_CONDITIONAL_NONE ||
+                     /* Only GE and L are commutative. */
+                     inst->conditional_mod == BRW_CONDITIONAL_GE ||
+                     inst->conditional_mod == BRW_CONDITIONAL_L)) {
              inst->src[0] = inst->src[1];
              inst->src[1] = val;
  
@@ -834,9 +909,12 @@ can_propagate_from(fs_inst *inst)
                                inst->src[0], inst->size_read(0))) ||
              inst->src[0].file == ATTR ||
              inst->src[0].file == UNIFORM ||
-            inst->src[0].file == IMM) &&
+            inst->src[0].file == IMM ||
+            (inst->src[0].file == FIXED_GRF &&
+             inst->src[0].is_contiguous())) &&
             inst->src[0].type == inst->dst.type &&
-           !inst->is_partial_write());
+           !inst->is_partial_write()) ||
+          is_identity_payload(FIXED_GRF, inst);
  }
  
  /* Walks a basic block and does copy propagation on it using the acp
@@ -863,7 +941,7 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
        }
  
        /* kill the destination from the ACP */
-      if (inst->dst.file == VGRF) {
+      if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
           foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
              if (regions_overlap(entry->dst, entry->size_written,
                                  inst->dst, inst->size_written))
@@ -889,11 +967,12 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
         * operand of another instruction, add it to the ACP.
         */
        if (can_propagate_from(inst)) {
-         acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
+         acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry);
           entry->dst = inst->dst;
           entry->src = inst->src[0];
           entry->size_written = inst->size_written;
-         entry->size_read = inst->size_read(0);
+         for (unsigned i = 0; i < inst->sources; i++)
+            entry->size_read += inst->size_read(i);
           entry->opcode = inst->opcode;
           entry->saturate = inst->saturate;
           acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
@@ -905,7 +984,9 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
              assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0);
              const unsigned size_written = effective_width *
                                            type_sz(inst->src[i].type);
-            if (inst->src[i].file == VGRF) {
+            if (inst->src[i].file == VGRF ||
+                (inst->src[i].file == FIXED_GRF &&
+                 inst->src[i].is_contiguous())) {
                 acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry);
                 entry->dst = byte_offset(inst->dst, offset);
                 entry->src = inst->src[i];
@@ -936,7 +1017,7 @@ fs_visitor::opt_copy_propagation()
     for (int i = 0; i < cfg->num_blocks; i++)
        out_acp[i] = new exec_list [ACP_HASH_SIZE];
  
-   calculate_live_intervals();
+   const fs_live_variables &live = live_analysis.require();
  
     /* First, walk through each block doing local copy propagation and getting
      * the set of copies available at the end of the block.
@@ -958,15 +1039,15 @@ fs_visitor::opt_copy_propagation()
        for (unsigned a = 0; a < ACP_HASH_SIZE; a++) {
           foreach_in_list_safe(acp_entry, entry, &out_acp[block->num][a]) {
              assert(entry->dst.file == VGRF);
-            if (block->start_ip <= virtual_grf_start[entry->dst.nr] &&
-                virtual_grf_end[entry->dst.nr] <= block->end_ip)
+            if (block->start_ip <= live.vgrf_start[entry->dst.nr] &&
+                live.vgrf_end[entry->dst.nr] <= block->end_ip)
                 entry->remove();
           }
        }
     }
  
     /* Do dataflow analysis for those available copies. */
-   fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, live_intervals, out_acp);
+   fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, live, out_acp);
  
     /* Next, re-run local copy propagation, this time with the set of copies
      * provided by the dataflow analysis available at the start of a block.
@@ -990,7 +1071,8 @@ fs_visitor::opt_copy_propagation()
     ralloc_free(copy_prop_ctx);
  
     if (progress)
-      invalidate_live_intervals();
+      invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
+                          DEPENDENCY_INSTRUCTION_DETAIL);
  
     return progress;
  }