From f10f5e498008636b110d01b2613e552da7793708 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 6 Mar 2013 16:38:10 -0800
Subject: [PATCH] i965/fs: Fix register allocation for uniform pull constants
 in 16-wide.

We were allowing a compressed instruction to write a register that
contained the last use of a uniform pull constant (either UBO load or push
constant spillover), so it would get half its values smashed.

Since we need to see the actual instruction to decide this, move the
pre-gen6 pixel_x/y logic here, which should improve the performance of
register allocation since virtual_grf_interferes() is called more than
once per instruction.

NOTE: This is a candidate for the stable branches.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=61317
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 .../dri/i965/brw_fs_live_variables.cpp        | 54 +++++++++++--------
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
index db8f39732d2..4c7991dc54d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
@@ -190,6 +190,37 @@ fs_visitor::calculate_live_intervals()
 	    int reg = inst->src[i].reg;
 
 	    use[reg] = ip;
+
+            /* In most cases, a register can be written over safely by the
+             * same instruction that is its last use.  For a single
+             * instruction, the sources are dereferenced before writing of the
+             * destination starts (naturally).  This gets more complicated for
+             * simd16, because the instruction:
+             *
+             * mov(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
+             *
+             * is actually decoded in hardware as:
+             *
+             * mov(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
+             * mov(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
+             *
+             * Which is safe.  However, if we have uniform accesses
+             * happening, we get into trouble:
+             *
+             * mov(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
+             * mov(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
+             *
+             * Now our destination for the first instruction overwrote the
+             * second instruction's src0, and we get garbage for those 8
+             * pixels.  There's a similar issue for the pre-gen6
+             * pixel_x/pixel_y, which are registers of 16-bit values and thus
+             * would get stomped by the first decode as well.
+             */
+            if (dispatch_width == 16 && (inst->src[i].smear ||
+                                         (this->pixel_x.reg == reg ||
+                                          this->pixel_y.reg == reg))) {
+               use[reg]++;
+            }
 	 }
       }
 
@@ -264,28 +295,5 @@ fs_visitor::virtual_grf_interferes(int a, int b)
    int start = MAX2(a_def, b_def);
    int end = MIN2(a_use, b_use);
 
-   /* If the register is used to store 16 values of less than float
-    * size (only the case for pixel_[xy]), then we can't allocate
-    * another dword-sized thing to that register that would be used in
-    * the same instruction.  This is because when the GPU decodes (for
-    * example):
-    *
-    * (declare (in ) vec4 gl_FragCoord@0x97766a0)
-    * add(16)         g6<1>F          g6<8,8,1>UW     0.5F { align1 compr };
-    *
-    * it's actually processed as:
-    * add(8)         g6<1>F          g6<8,8,1>UW     0.5F { align1 };
-    * add(8)         g7<1>F          g6.8<8,8,1>UW   0.5F { align1 sechalf };
-    *
-    * so our second half values in g6 got overwritten in the first
-    * half.
-    */
-   if (dispatch_width == 16 && (this->pixel_x.reg == a ||
-				this->pixel_x.reg == b ||
-				this->pixel_y.reg == a ||
-				this->pixel_y.reg == b)) {
-      return start <= end;
-   }
-
    return start < end;
 }
-- 
2.30.2