intel/fs/gen7+: Implement discard/demote for SIMD32 programs.
author Francisco Jerez <currojerez@riseup.net>
Sun, 5 Jan 2020 00:16:24 +0000 (16:16 -0800)
committer Francisco Jerez <currojerez@riseup.net>
Fri, 14 Feb 2020 22:31:49 +0000 (14:31 -0800)
At this point this simply involves fixing the initialization of the
sample mask flag register to take the right dispatch mask from the
thread payload, and fixing sample_mask_reg() to return f1.1 for the
second half of a SIMD32 thread.  This improves Manhattan 3.1
performance by 2.4%±0.31% (N>40) on my ICL with SIMD32 enabled
relative to falling back to SIMD16 for the shaders that use discard.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
src/intel/compiler/brw_fs.cpp
src/intel/compiler/brw_fs_nir.cpp

index 963d1c18155d433317ce7d94e39d07ba1289197d..fd9217b24b2609b23b3ef85dfb55dbb3b821d755 100644 (file)
@@ -4285,8 +4285,8 @@ sample_mask_reg(const fs_builder &bld)
    if (v->stage != MESA_SHADER_FRAGMENT) {
       return brw_imm_ud(0xffffffff);
    } else if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {
-      assert(bld.group() < 16 && bld.dispatch_width() <= 16);
-      return brw_flag_subreg(sample_mask_flag_subreg(v));
+      assert(bld.dispatch_width() <= 16);
+      return brw_flag_subreg(sample_mask_flag_subreg(v) + bld.group() / 16);
    } else {
       assert(v->devinfo->gen >= 6 && bld.dispatch_width() <= 16);
       return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
@@ -8171,11 +8171,15 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
        * Initialize it with the dispatched pixels.
        */
       if (wm_prog_data->uses_kill) {
-         const fs_reg dispatch_mask =
-            devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0);
-         bld.exec_all().group(1, 0)
-            .MOV(sample_mask_reg(bld),
-                 retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
+         const unsigned lower_width = MIN2(dispatch_width, 16);
+         for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
+            const fs_reg dispatch_mask =
+               devinfo->gen >= 6 ? brw_vec1_grf((i ? 2 : 1), 7) :
+               brw_vec1_grf(0, 0);
+            bld.exec_all().group(1, 0)
+               .MOV(sample_mask_reg(bld.group(lower_width, i)),
+                    retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
+         }
       }
 
       emit_nir_code();
index 5d66ead4a2413d5e8ead088df2792d293441da9b..3b34c407f51e5d9d832733b2788213b823c05d0f 100644 (file)
@@ -3562,7 +3562,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
          emit_discard_jump();
       }
 
-      limit_dispatch_width(16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
+      if (devinfo->gen < 7)
+         limit_dispatch_width(
+            16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
       break;
    }