intel/ir/gen12+: Work around FS performance regressions due to SIMD32 discard divergence.

author Francisco Jerez <currojerez@riseup.net>

Sun, 31 May 2020 21:56:40 +0000 (14:56 -0700)

committer Marge Bot <eric+marge@anholt.net>

Thu, 23 Jul 2020 01:40:06 +0000 (01:40 +0000)
author Francisco Jerez <currojerez@riseup.net>
Sun, 31 May 2020 21:56:40 +0000 (14:56 -0700)
committer Marge Bot <eric+marge@anholt.net>
Thu, 23 Jul 2020 01:40:06 +0000 (01:40 +0000)
diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp

index 8d02db932ba5d2724babc93bfbfac35c6e59f689..3c39594d1212dc0bd9d3fc8e4338236e4c4ea0e6 100644 (file)
--- a/src/intel/compiler/brw_ir_performance.cpp
+++ b/src/intel/compiler/brw_ir_performance.cpp
@@ -1522,9 +1522,19 @@ namespace {
         *       difference is the worst-case scenario branch_weight used for
         *       SIMD32 which accounts for the possibility of a dynamically
         *       uniform branch becoming divergent in SIMD32.
+       *
+       *       Note that on Gen12+ we weight code after a discard jump more
+       *       pessimistically for SIMD32 than for narrower modes.  The
+       *       effective warp size there is 2x the SIMD width due to EU fusion,
+       *       which makes divergent control flow more likely than on previous
+       *       generations and gives narrower SIMD modes a performance
+       *       advantage in several test cases with non-uniform discard jumps.
         */
        const float branch_weight = (dispatch_width > 16 ? 1.0 : 0.5);
+      const float discard_weight = (dispatch_width > 16 || s->devinfo->gen < 12 ?
+                                    1.0 : 0.5);
        const float loop_weight = 10;
+      unsigned discard_count = 0;
        unsigned elapsed = 0;
        state st;
  
@@ -1538,6 +1548,8 @@ namespace {
  
              if (inst->opcode == BRW_OPCODE_ENDIF)
                 st.weight /= branch_weight;
+            else if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT && discard_count)
+               st.weight /= discard_weight;
  
              elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;
  
@@ -1547,6 +1559,8 @@ namespace {
                 st.weight *= loop_weight;
              else if (inst->opcode == BRW_OPCODE_WHILE)
                 st.weight /= loop_weight;
+            else if (inst->opcode == FS_OPCODE_DISCARD_JUMP && !discard_count++)
+               st.weight *= discard_weight;
           }
  
           p.block_latency[block->num] = elapsed - elapsed0;
author	Francisco Jerez <currojerez@riseup.net>
	Sun, 31 May 2020 21:56:40 +0000 (14:56 -0700)
committer	Marge Bot <eric+marge@anholt.net>
	Thu, 23 Jul 2020 01:40:06 +0000 (01:40 +0000)