From 4df56177edba492c3aada4386a4f271c7849deb5 Mon Sep 17 00:00:00 2001
From: Paul Berry <stereotype441@gmail.com>
Date: Tue, 22 Oct 2013 05:56:37 -0700
Subject: [PATCH] i965/fs: Only unroll high-accuracy dFdy() from SIMD16 to
 SIMD8 on gen4 and IVB.

In commit 800610f (i965/fs: Improve accuracy of dFdy() to match
dFdx()) I unrolled the high-accuracy dFdy() computation from a single
SIMD16 instruction to two SIMD8 instructions because of text I found
in the i965 (gen4) PRM saying that instruction compression could not
be used in align16 mode.  I couldn't find similar text in later
hardware docs, and I observed problems trying to use instruction
compression in align16 mode on Ivy Bridge, so I assumed that the
restriction still applied and the associated documentation had simply
been lost.

After consultation with the hardware engineers, it turns out this is
not the case.  In point of fact, the restriction was dropped in gen5,
re-introduced in Ivy Bridge, and dropped again in Haswell.  The reason
I didn't notice this is that in the Ivy Bridge documentation, the
restriction was in a different section, and described using different
language.

Now that we know that the restriction only applies to Gen4 and Ivy
Bridge, we can limit the unrolling to those platforms.

Tested on gen5, gen6, and gen7 (both Ivy Bridge and Haswell).

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 .../drivers/dri/i965/brw_fs_generator.cpp     | 37 ++++++++++++++-----
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 0e6e8533c73..df72b989066 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -640,6 +640,30 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src
                          bool negate_value)
 {
    if (c->key.high_quality_derivatives) {
+      /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
+       * Region Restrictions):
+       *
+       *     In Align16 access mode, SIMD16 is not allowed for DW operations
+       *     and SIMD8 is not allowed for DF operations.
+       *
+       * In this context, "DW operations" means "operations acting on 32-bit
+       * values", so it includes operations on floats.
+       *
+       * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
+       * (Instruction Compression -> Rules and Restrictions):
+       *
+       *     A compressed instruction must be in Align1 access mode. Align16
+       *     mode instructions cannot be compressed.
+       *
+       * Similar text exists in the g45 PRM.
+       *
+       * On these platforms, if we're building a SIMD16 shader, we need to
+       * manually unroll to a pair of SIMD8 instructions.
+       */
+      bool unroll_to_simd8 =
+         (dispatch_width == 16 &&
+          (brw->gen == 4 || (brw->gen == 7 && !brw->is_haswell)));
+
       /* produce accurate derivatives */
       struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                     BRW_REGISTER_TYPE_F,
@@ -655,20 +679,13 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src
                                     BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
       brw_push_insn_state(p);
       brw_set_access_mode(p, BRW_ALIGN_16);
-      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      if (unroll_to_simd8)
+         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       if (negate_value)
          brw_ADD(p, dst, src1, negate(src0));
       else
          brw_ADD(p, dst, src0, negate(src1));
-      if (dispatch_width == 16) {
-         /* From page 340 of the i965 PRM:
-          *
-          *     "A compressed instruction must be in Align1 access
-          *     mode. Align16 mode instructions cannot be compressed."
-          *
-          * Therefore, when doing a 16-wide dispatch, we need to manually
-          * unroll to two instructions.
-          */
+      if (unroll_to_simd8) {
          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
          src0 = sechalf(src0);
          src1 = sechalf(src1);
-- 
2.30.2