i965/fs: Skip SIMD lowering destination zipping if possible.

author Francisco Jerez <currojerez@riseup.net>

Fri, 27 May 2016 07:45:04 +0000 (00:45 -0700)

committer Francisco Jerez <currojerez@riseup.net>

Thu, 2 Jun 2016 20:24:48 +0000 (13:24 -0700)
author Francisco Jerez <currojerez@riseup.net>
Fri, 27 May 2016 07:45:04 +0000 (00:45 -0700)
committer Francisco Jerez <currojerez@riseup.net>
Thu, 2 Jun 2016 20:24:48 +0000 (13:24 -0700)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index 9abe73acef2bcd9cbe18fc48652b798a7d2506f0..f8b0059abe95a9d025cb3662ca497c07e15a88ec 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5109,6 +5109,52 @@ emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst,
     }
  }
  
+/**
+ * Return true if splitting out the group of channels of instruction \p inst
+ * given by lbld.group() requires allocating a temporary for the destination
+ * of the lowered instruction and copying the data back to the original
+ * destination region.
+ */
+static inline bool
+needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
+{
+   /* If the instruction writes more than one component we'll have to shuffle
+    * the results of multiple lowered instructions in order to make sure that
+    * they end up arranged correctly in the original destination region.
+    */
+   if (inst->regs_written * REG_SIZE >
+       inst->dst.component_size(inst->exec_size))
+      return true;
+
+   /* If the lowered execution size is larger than the original the result of
+    * the instruction won't fit in the original destination, so we'll have to
+    * allocate a temporary in any case.
+    */
+   if (lbld.dispatch_width() > inst->exec_size)
+      return true;
+
+   for (unsigned i = 0; i < inst->sources; i++) {
+      /* If we already made a copy of the source for other reasons there won't
+       * be any overlap with the destination.
+       */
+      if (needs_src_copy(lbld, inst, i))
+         continue;
+
+      /* In order to keep the logic simple we emit a copy whenever the
+       * destination region doesn't exactly match an overlapping source, which
+       * may point at the source and destination not being aligned group by
+       * group which could cause one of the lowered instructions to overwrite
+       * the data read from the same source by other lowered instructions.
+       */
+      if (regions_overlap(inst->dst, inst->regs_written * REG_SIZE,
+                          inst->src[i], inst->regs_read(i) * REG_SIZE) &&
+          !inst->dst.equals(inst->src[i]))
+        return true;
+   }
+
+   return false;
+}
+
  /**
   * Insert data from a packed temporary into the channel group given by
   * lbld.group() of the destination region of instruction \p inst and return
@@ -5129,6 +5175,8 @@ emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
     const fs_reg dst = horiz_offset(inst->dst, lbld.group());
     const unsigned dst_size = inst->regs_written * REG_SIZE /
              inst->dst.component_size(inst->exec_size);
+
+   if (needs_dst_copy(lbld, inst)) {
     const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);
  
     if (inst->predicate) {
@@ -5146,6 +5194,13 @@ emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
            .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k));
  
     return tmp;
+
+   } else {
+      /* No need to allocate a temporary for the lowered instruction, just
+       * take the right group of channels from the original region.
+       */
+      return dst;
+   }
  }
  
  bool
author	Francisco Jerez <currojerez@riseup.net>
	Fri, 27 May 2016 07:45:04 +0000 (00:45 -0700)
committer	Francisco Jerez <currojerez@riseup.net>
	Thu, 2 Jun 2016 20:24:48 +0000 (13:24 -0700)