From 471af25fc57dc43a8277b4b17ec82547287621d0 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 30 Nov 2012 22:29:26 -0800
Subject: [PATCH] i965/vs: Extend opt_compute_to_mrf to handle limited
 "reswizzling"

The way our visitor works, scalar expression/swizzle results that get
stored in channels other than .x will have an intermediate MOV from
their result in the .x channel to the real .y (or whatever) channel, and
similarly for vec2/vec3 results.

By knowing how to adjust DP4-type instructions for optimizing out a
swizzled MOV, we can reduce instructions in common matrix multiplication
cases.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp        | 99 +++++++++++++++++--
 src/mesa/drivers/dri/i965/brw_vec4.h          |  2 +
 .../dri/i965/test_vec4_register_coalesce.cpp  | 21 ++++
 3 files changed, 113 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 436ba97e147..7ab37e7ca9f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -600,6 +600,85 @@ vec4_visitor::move_push_constants_to_pull_constants()
    pack_uniform_registers();
 }
 
+bool
+vec4_instruction::can_reswizzle_dst(int dst_writemask,
+                                    int swizzle,
+                                    int swizzle_mask)
+{
+   /* Returns whether this instruction's dst can be retargeted through the
+    * given swizzle.  A write to any channel the swizzle never references
+    * could not survive that, so refuse it up front.
+    */
+   const int stray_channels = dst.writemask & ~swizzle_mask;
+   if (stray_channels != 0)
+      return false;
+
+   /* The dot-product opcodes are accepted unconditionally. */
+   const bool is_dot_product = opcode == BRW_OPCODE_DP4 ||
+                               opcode == BRW_OPCODE_DP3 ||
+                               opcode == BRW_OPCODE_DP2;
+   if (is_dot_product)
+      return true;
+
+   /* Reswizzling other opcodes is not done yet, so they are accepted only
+    * when every component the dst uses already reads its own channel.
+    */
+   for (int chan = 0; chan < 4; chan++) {
+      const int chan_bit = 1 << chan;
+      const bool used_by_dst = (dst_writemask & chan_bit) != 0;
+
+      if (used_by_dst && (1 << BRW_GET_SWZ(swizzle, chan)) != chan_bit)
+         return false;
+   }
+
+   return true;
+}
+
+/**
+ * Moves this instruction's results to the channels a later swizzled MOV
+ * (source \p swizzle, dst \p dst_writemask) would have copied them to.  Only
+ * DP2/DP3/DP4 are rewritten, and only their writemask; anything else merely
+ * asserts that no reswizzling is needed.  Full reswizzling, e.g. for
+ * swizzle=yywx, MUL a.xy b c -> MUL a.xy_w b.yy_x c.yy_x, is not done yet.
+ */
+void
+vec4_instruction::reswizzle_dst(int dst_writemask, int swizzle)
+{
+   switch (opcode) {
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP2: {
+      int new_writemask = 0;
+
+      for (int c = 0; c < 4; c++) {
+         /* Skip components of the swizzle not used by the dst. */
+         if (!(dst_writemask & (1 << c)))
+            continue;
+         /* If we were populating the channel that component c reads, then
+          * populate component c of the new dst directly.
+          */
+         if (dst.writemask & (1 << BRW_GET_SWZ(swizzle, c)))
+            new_writemask |= (1 << c);
+      }
+      dst.writemask = new_writemask;
+      break;
+   }
+   default:
+      for (int c = 0; c < 4; c++) {
+         /* Skip components of the swizzle not used by the dst. */
+         if (!(dst_writemask & (1 << c)))
+            continue;
+
+         /* We don't do the reswizzling yet, so just sanity check that we
+          * don't have to.  The swizzle is read inside the assert itself so
+          * that NDEBUG builds are not left with an unused local.
+          */
+         assert((1 << BRW_GET_SWZ(swizzle, c)) == (1 << c));
+      }
+      break;
+   }
+}
+
 /*
  * Tries to reduce extra MOV instructions by taking GRFs that get just
  * written and then MOVed into an MRF and making the original write of
@@ -641,26 +720,20 @@ vec4_visitor::opt_compute_to_mrf()
        */
       bool chans_needed[4] = {false, false, false, false};
       int chans_remaining = 0;
+      int swizzle_mask = 0;
       for (int i = 0; i < 4; i++) {
 	 int chan = BRW_GET_SWZ(inst->src[0].swizzle, i);
 
 	 if (!(inst->dst.writemask & (1 << i)))
 	    continue;
 
-	 /* We don't handle compute-to-MRF across a swizzle.  We would
-	  * need to be able to rewrite instructions above to output
-	  * results to different channels.
-	  */
-	 if (chan != i)
-	    chans_remaining = 5;
+         swizzle_mask |= (1 << chan);
 
 	 if (!chans_needed[chan]) {
 	    chans_needed[chan] = true;
 	    chans_remaining++;
 	 }
       }
-      if (chans_remaining > 4)
-	 continue;
 
       /* Now walk up the instruction stream trying to see if we can
        * rewrite everything writing to the GRF into the MRF instead.
@@ -689,6 +762,13 @@ vec4_visitor::opt_compute_to_mrf()
 	       }
 	    }
 
+            /* If we can't handle the swizzle, bail. */
+            if (!scan_inst->can_reswizzle_dst(inst->dst.writemask,
+                                              inst->src[0].swizzle,
+                                              swizzle_mask)) {
+               break;
+            }
+
 	    /* Mark which channels we found unconditional writes for. */
 	    if (!scan_inst->predicate) {
 	       for (int i = 0; i < 4; i++) {
@@ -759,10 +839,11 @@ vec4_visitor::opt_compute_to_mrf()
 	    if (scan_inst->dst.file == GRF &&
 		scan_inst->dst.reg == inst->src[0].reg &&
 		scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
+               scan_inst->reswizzle_dst(inst->dst.writemask,
+                                        inst->src[0].swizzle);
 	       scan_inst->dst.file = MRF;
 	       scan_inst->dst.reg = mrf;
 	       scan_inst->dst.reg_offset = 0;
-	       scan_inst->dst.writemask &= inst->dst.writemask;
 	       scan_inst->saturate |= inst->saturate;
 	    }
 	    scan_inst = (vec4_instruction *)scan_inst->next;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index d7c1cce075d..6da44d4080a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -194,6 +194,8 @@ public:
 
    bool is_tex();
    bool is_math();
+   bool can_reswizzle_dst(int dst_writemask, int swizzle, int swizzle_mask);
+   void reswizzle_dst(int dst_writemask, int swizzle);
 };
 
 /**
diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
index c79b0fd1831..fa9c155655f 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
@@ -122,3 +122,24 @@ TEST_F(register_coalesce_test, test_multiple_use)
 
    EXPECT_NE(mul->dst.file, MRF);
 }
+
+/* DP4 into a float temp, then MOV to m0.y: expect the DP4 to write m0.y. */
+TEST_F(register_coalesce_test, test_dp4_mrf)
+{
+   src_reg some_src_1 = src_reg(v, glsl_type::vec4_type);
+   src_reg some_src_2 = src_reg(v, glsl_type::vec4_type);
+
+   dst_reg m0 = dst_reg(MRF, 0);
+   m0.writemask = WRITEMASK_Y;
+   m0.type = BRW_REGISTER_TYPE_F;
+
+   dst_reg temp = dst_reg(v, glsl_type::float_type);
+
+   vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2));
+   v->emit(v->MOV(m0, src_reg(temp)));
+
+   register_coalesce(v);
+
+   EXPECT_EQ(dp4->dst.file, MRF);
+   EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y);
+}
-- 
2.30.2