From 8d3b86e34a7b0f77613c7f5669891e54d76f0cbf Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 4 Jan 2020 16:16:24 -0800
Subject: [PATCH] intel/fs/gen7+: Implement discard/demote for SIMD32 programs.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

At this point this simply involves fixing the initialization of the
sample mask flag register to take the right dispatch mask from the
thread payload, and fixing sample_mask_reg() to return f1.1 for the
second half of a SIMD32 thread.  This improves Manhattan 3.1
performance by 2.4%Â±0.31% (N>40) on my ICL with SIMD32 enabled
relative to falling back to SIMD16 for the shaders that use discard.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/intel/compiler/brw_fs.cpp     | 18 +++++++++++-------
 src/intel/compiler/brw_fs_nir.cpp |  4 +++-
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 963d1c18155..fd9217b24b2 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -4285,8 +4285,8 @@ sample_mask_reg(const fs_builder &bld)
    if (v->stage != MESA_SHADER_FRAGMENT) {
       return brw_imm_ud(0xffffffff);
    } else if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {
-      assert(bld.group() < 16 && bld.dispatch_width() <= 16);
-      return brw_flag_subreg(sample_mask_flag_subreg(v));
+      assert(bld.dispatch_width() <= 16);
+      return brw_flag_subreg(sample_mask_flag_subreg(v) + bld.group() / 16);
    } else {
       assert(v->devinfo->gen >= 6 && bld.dispatch_width() <= 16);
       return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
@@ -8171,11 +8171,15 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
        * Initialize it with the dispatched pixels.
        */
       if (wm_prog_data->uses_kill) {
-         const fs_reg dispatch_mask =
-            devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0);
-         bld.exec_all().group(1, 0)
-            .MOV(sample_mask_reg(bld),
-                 retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
+         const unsigned lower_width = MIN2(dispatch_width, 16);
+         for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
+            const fs_reg dispatch_mask =
+               devinfo->gen >= 6 ? brw_vec1_grf((i ? 2 : 1), 7) :
+               brw_vec1_grf(0, 0);
+            bld.exec_all().group(1, 0)
+               .MOV(sample_mask_reg(bld.group(lower_width, i)),
+                    retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
+         }
       }
 
       emit_nir_code();
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 5d66ead4a24..3b34c407f51 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3562,7 +3562,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
          emit_discard_jump();
       }
 
-      limit_dispatch_width(16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
+      if (devinfo->gen < 7)
+         limit_dispatch_width(
+            16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
       break;
    }
 
-- 
2.30.2