intel/fs: Add partial support for copy-propagating FIXED_GRFs.

author Francisco Jerez <currojerez@riseup.net>

Mon, 30 Dec 2019 08:38:08 +0000 (00:38 -0800)

committer Francisco Jerez <currojerez@riseup.net>

Fri, 17 Jan 2020 21:21:33 +0000 (13:21 -0800)
author Francisco Jerez <currojerez@riseup.net>
Mon, 30 Dec 2019 08:38:08 +0000 (00:38 -0800)
committer Francisco Jerez <currojerez@riseup.net>
Fri, 17 Jan 2020 21:21:33 +0000 (13:21 -0800)
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp

index e6eb174e0f2c4163a54d58b0a5ab17925b22084a..adf0f2fefff2c672ed2dab6c49e6bff2e8b68ace 100644 (file)
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -452,7 +452,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
     if (entry->src.file == IMM)
        return false;
     assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
-          entry->src.file == ATTR);
+          entry->src.file == ATTR || entry->src.file == FIXED_GRF);
  
     if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
         inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
@@ -469,6 +469,21 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
                              entry->dst, entry->size_written))
        return false;
  
+   /* Avoid propagating a FIXED_GRF register into an EOT instruction in order
+    * for any register allocation restrictions to be applied.
+    */
+   if (entry->src.file == FIXED_GRF && inst->eot)
+      return false;
+
+   /* Avoid propagating odd-numbered FIXED_GRF registers into the first source
+    * of a LINTERP instruction on platforms where the PLN instruction has
+    * register alignment restrictions.
+    */
+   if (devinfo->has_pln && devinfo->gen <= 6 &&
+       entry->src.file == FIXED_GRF && (entry->src.nr & 1) &&
+       inst->opcode == FS_OPCODE_LINTERP && arg == 0)
+      return false;
+
     /* we can't generally copy-propagate UD negations because we
      * can end up accessing the resulting values as signed integers
      * instead. See also resolve_ud_negate() and comment in
@@ -493,16 +508,30 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
      * derivatives, assume that their operands are packed so we can't
      * generally propagate strided regions to them.
      */
-   if (instruction_requires_packed_data(inst) && entry->src.stride > 1)
+   const unsigned entry_stride = (entry->src.file == FIXED_GRF ? 1 :
+                                  entry->src.stride);
+   if (instruction_requires_packed_data(inst) && entry_stride > 1)
        return false;
  
     /* Bail if the result of composing both strides would exceed the
      * hardware limit.
      */
-   if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride,
+   if (!can_take_stride(inst, arg, entry_stride * inst->src[arg].stride,
                          devinfo))
        return false;
  
+   /* Bail if the source FIXED_GRF region of the copy cannot be trivially
+    * composed with the source region of the instruction -- E.g. because the
+    * copy uses some extended stride greater than 4 not supported natively by
+    * the hardware as a horizontal stride, or because instruction compression
+    * could require us to use a vertical stride shorter than a GRF.
+    */
+   if (entry->src.file == FIXED_GRF &&
+       (inst->src[arg].stride > 4 ||
+        inst->dst.component_size(inst->exec_size) >
+        inst->src[arg].component_size(inst->exec_size)))
+      return false;
+
     /* Bail if the instruction type is larger than the execution type of the
      * copy, what implies that each channel is reading multiple channels of the
      * destination of the copy, and simply replacing the sources would give a
@@ -524,7 +553,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
      *
      * Which would have different semantics.
      */
-   if (entry->src.stride != 1 &&
+   if (entry_stride != 1 &&
         (inst->src[arg].stride *
          type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
        return false;
@@ -562,13 +591,42 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
        }
     }
  
+   /* Save the offset of inst->src[arg] relative to entry->dst for it to be
+    * applied later.
+    */
+   const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+
+   /* Fold the copy into the instruction consuming it. */
     inst->src[arg].file = entry->src.file;
     inst->src[arg].nr = entry->src.nr;
-   inst->src[arg].stride *= entry->src.stride;
-   inst->saturate = inst->saturate || entry->saturate;
+   inst->src[arg].subnr = entry->src.subnr;
+   inst->src[arg].offset = entry->src.offset;
+
+   /* Compose the strides of both regions. */
+   if (entry->src.file == FIXED_GRF) {
+      if (inst->src[arg].stride) {
+         const unsigned orig_width = 1 << entry->src.width;
+         const unsigned reg_width = REG_SIZE / (type_sz(inst->src[arg].type) *
+                                                inst->src[arg].stride);
+         inst->src[arg].width = cvt(MIN2(orig_width, reg_width)) - 1;
+         inst->src[arg].hstride = cvt(inst->src[arg].stride);
+         inst->src[arg].vstride = inst->src[arg].hstride + inst->src[arg].width;
+      } else {
+         inst->src[arg].vstride = inst->src[arg].hstride =
+            inst->src[arg].width = 0;
+      }
  
-   /* Compute the offset of inst->src[arg] relative to entry->dst */
-   const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+      inst->src[arg].stride = 1;
+
+      /* Hopefully no Align16 around here... */
+      assert(entry->src.swizzle == BRW_SWIZZLE_XYZW);
+      inst->src[arg].swizzle = entry->src.swizzle;
+   } else {
+      inst->src[arg].stride *= entry->src.stride;
+   }
+
+   /* Compose any saturate modifiers. */
+   inst->saturate = inst->saturate || entry->saturate;
  
     /* Compute the first component of the copy that the instruction is
      * reading, and the base byte offset within that component.
@@ -580,9 +638,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
     /* Calculate the byte offset at the origin of the copy of the given
      * component and suboffset.
      */
-   inst->src[arg].offset = suboffset +
-      component * entry->src.stride * type_sz(entry->src.type) +
-      entry->src.offset;
+   inst->src[arg] = byte_offset(inst->src[arg],
+      component * entry_stride * type_sz(entry->src.type) + suboffset);
  
     if (has_source_modifiers) {
        if (entry->dst.type != inst->src[arg].type) {
@@ -834,7 +891,9 @@ can_propagate_from(fs_inst *inst)
                                inst->src[0], inst->size_read(0))) ||
              inst->src[0].file == ATTR ||
              inst->src[0].file == UNIFORM ||
-            inst->src[0].file == IMM) &&
+            inst->src[0].file == IMM ||
+            (inst->src[0].file == FIXED_GRF &&
+             inst->src[0].is_contiguous())) &&
             inst->src[0].type == inst->dst.type &&
             !inst->is_partial_write());
  }
@@ -863,7 +922,7 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
        }
  
        /* kill the destination from the ACP */
-      if (inst->dst.file == VGRF) {
+      if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
           foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
              if (regions_overlap(entry->dst, entry->size_written,
                                  inst->dst, inst->size_written))
@@ -905,7 +964,9 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
              assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0);
              const unsigned size_written = effective_width *
                                            type_sz(inst->src[i].type);
-            if (inst->src[i].file == VGRF) {
+            if (inst->src[i].file == VGRF ||
+                (inst->src[i].file == FIXED_GRF &&
+                 inst->src[i].is_contiguous())) {
                 acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry);
                 entry->dst = byte_offset(inst->dst, offset);
                 entry->src = inst->src[i];
author	Francisco Jerez <currojerez@riseup.net>
	Mon, 30 Dec 2019 08:38:08 +0000 (00:38 -0800)
committer	Francisco Jerez <currojerez@riseup.net>
	Fri, 17 Jan 2020 21:21:33 +0000 (13:21 -0800)