From 9406b3a5c16fd034d2f2db828f2f833c336683f9 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 17 Jul 2019 14:56:18 -0700
Subject: [PATCH] i965/gen9: Optimize slice and subslice load balancing
 behavior.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The default pixel hashing mode settings used for slice and subslice
load balancing are far from optimal under certain conditions (see the
comments below for the gory details).  The top-of-the-line GT4 parts
suffer from a particularly severe performance problem currently due to
a subslice load balancing issue.  Fixing this seems to improve
graphics performance across the board for most of the benchmarks in my
test set, up to ~20% in some cases, e.g. from SKL GT4:

unigine/valley:                3.44% Â±0.11%
gfxbench/gl_manhattan31:       3.99% Â±0.13%
gputest/pixmark_piano:         7.95% Â±0.33%
synmark/OglTexFilterAniso:    15.22% Â±0.07%
synmark/OglTexMem128:         22.26% Â±0.06%

Lower-end platforms are also affected by some subslice load imbalance
to a lesser degree, especially during CCS resolve and fast clear
operations, which are handled specially here due to rasterization
ocurring in reduced CCS coordinates, which changes the semantics of
the pixel hashing mode settings.

No regressions seen during my tests on some SKL, KBL and BXT
configurations.  Additional benchmark reports welcome on any Gen9
platforms (that includes anything with Skylake, Broxton, Kabylake,
Geminilake, Coffeelake, Whiskey Lake, Comet Lake or Amber Lake in your
renderer string).

P.S.: A similar problem is likely to be present on other non-Gen9
      platforms, especially for CCS resolve and fast clear operations.
      Will follow-up with additional patches fixing the hashing mode
      for those once I have enough performance data to justify it.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_context.h      |  5 ++
 src/mesa/drivers/dri/i965/brw_defines.h      |  5 ++
 src/mesa/drivers/dri/i965/brw_misc_state.c   | 90 ++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_state_upload.c |  9 +-
 src/mesa/drivers/dri/i965/genX_blorp_exec.c  |  6 ++
 5 files changed, 109 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 2ac443bf032..17639bf5995 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1219,6 +1219,9 @@ struct brw_context
 
    enum gen9_astc5x5_wa_tex_type gen9_astc5x5_wa_tex_mask;
 
+   /** Last rendering scale argument provided to brw_emit_hashing_mode(). */
+   unsigned current_hash_scale;
+
    __DRIcontext *driContext;
    struct intel_screen *screen;
 };
@@ -1265,6 +1268,8 @@ GLboolean brwCreateContext(gl_api api,
  */
 void brw_workaround_depthstencil_alignment(struct brw_context *brw,
                                            GLbitfield clear_mask);
+void brw_emit_hashing_mode(struct brw_context *brw, unsigned width,
+                           unsigned height, unsigned scale);
 
 /* brw_object_purgeable.c */
 void brw_init_object_purgeable_functions(struct dd_function_table *functions);
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 425f5534110..33d042be869 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1570,6 +1570,11 @@ enum brw_pixel_shader_coverage_mask_mode {
 # define GEN9_SUBSLICE_HASHING_8x4      (2 << 8)
 # define GEN9_SUBSLICE_HASHING_16x16    (3 << 8)
 # define GEN9_SUBSLICE_HASHING_MASK_BITS REG_MASK(3 << 8)
+# define GEN9_SLICE_HASHING_NORMAL      (0 << 11)
+# define GEN9_SLICE_HASHING_DISABLED    (1 << 11)
+# define GEN9_SLICE_HASHING_32x16       (2 << 11)
+# define GEN9_SLICE_HASHING_32x32       (3 << 11)
+# define GEN9_SLICE_HASHING_MASK_BITS REG_MASK(3 << 11)
 
 /* Predicate registers */
 #define MI_PREDICATE_SRC0               0x2400
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index e73cadc5d3e..1291470d479 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -601,6 +601,96 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
    }
 }
 
+/**
+ * Update the pixel hashing modes that determine the balancing of PS threads
+ * across subslices and slices.
+ *
+ * \param width Width bound of the rendering area (already scaled down if \p
+ *              scale is greater than 1).
+ * \param height Height bound of the rendering area (already scaled down if \p
+ *               scale is greater than 1).
+ * \param scale The number of framebuffer samples that could potentially be
+ *              affected by an individual channel of the PS thread.  This is
+ *              typically one for single-sampled rendering, but for operations
+ *              like CCS resolves and fast clears a single PS invocation may
+ *              update a huge number of pixels, in which case a finer
+ *              balancing is desirable in order to maximally utilize the
+ *              bandwidth available.  UINT_MAX can be used as shorthand for
+ *              "finest hashing mode available".
+ */
+void
+brw_emit_hashing_mode(struct brw_context *brw, unsigned width,
+                      unsigned height, unsigned scale)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   if (devinfo->gen == 9) {
+      const uint32_t slice_hashing[] = {
+         /* Because all Gen9 platforms with more than one slice require
+          * three-way subslice hashing, a single "normal" 16x16 slice hashing
+          * block is guaranteed to suffer from substantial imbalance, with one
+          * subslice receiving twice as much work as the other two in the
+          * slice.
+          *
+          * The performance impact of that would be particularly severe when
+          * three-way hashing is also in use for slice balancing (which is the
+          * case for all Gen9 GT4 platforms), because one of the slices
+          * receives one every three 16x16 blocks in either direction, which
+          * is roughly the periodicity of the underlying subslice imbalance
+          * pattern ("roughly" because in reality the hardware's
+          * implementation of three-way hashing doesn't do exact modulo 3
+          * arithmetic, which somewhat decreases the magnitude of this effect
+          * in practice).  This leads to a systematic subslice imbalance
+          * within that slice regardless of the size of the primitive.  The
+          * 32x32 hashing mode guarantees that the subslice imbalance within a
+          * single slice hashing block is minimal, largely eliminating this
+          * effect.
+          */
+         GEN9_SLICE_HASHING_32x32,
+         /* Finest slice hashing mode available. */
+         GEN9_SLICE_HASHING_NORMAL
+      };
+      const uint32_t subslice_hashing[] = {
+         /* The 16x16 subslice hashing mode is used on non-LLC platforms to
+          * match the performance of previous Mesa versions.  16x16 has a
+          * slight cache locality benefit especially visible in the sampler L1
+          * cache efficiency of low-bandwidth platforms, but it comes at the
+          * cost of greater subslice imbalance for primitives of dimensions
+          * approximately intermediate between 16x4 and 16x16.
+          */
+         (devinfo->has_llc ? GEN9_SUBSLICE_HASHING_16x4 :
+                             GEN9_SUBSLICE_HASHING_16x16),
+         /* Finest subslice hashing mode available. */
+         GEN9_SUBSLICE_HASHING_8x4
+      };
+      /* Dimensions of the smallest hashing block of a given hashing mode.  If
+       * the rendering area is smaller than this there can't possibly be any
+       * benefit from switching to this mode, so we optimize out the
+       * transition.
+       */
+      const unsigned min_size[][2] = {
+         { 16, 4 },
+         { 8, 4 }
+      };
+      const unsigned idx = scale > 1;
+
+      if (width > min_size[idx][0] || height > min_size[idx][1]) {
+         const uint32_t gt_mode =
+            (devinfo->num_slices == 1 ? 0 :
+             GEN9_SLICE_HASHING_MASK_BITS | slice_hashing[idx]) |
+            GEN9_SUBSLICE_HASHING_MASK_BITS | subslice_hashing[idx];
+
+         brw_emit_pipe_control_flush(brw,
+                                     PIPE_CONTROL_STALL_AT_SCOREBOARD |
+                                     PIPE_CONTROL_CS_STALL);
+
+         brw_load_register_imm32(brw, GEN7_GT_MODE, gt_mode);
+
+         brw->current_hash_scale = scale;
+      }
+   }
+}
+
 /**
  * Misc invariant state packets
  */
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index ac9ee2dabf1..7a2daf4a533 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -133,12 +133,6 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
                               REG_MASK(GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC) |
                               GEN9_FLOAT_BLEND_OPTIMIZATION_ENABLE |
                               GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC);
-
-      if (gen_device_info_is_9lp(devinfo)) {
-         brw_load_register_imm32(brw, GEN7_GT_MODE,
-                                 GEN9_SUBSLICE_HASHING_MASK_BITS |
-                                 GEN9_SUBSLICE_HASHING_16x16);
-      }
    }
 
    if (devinfo->gen >= 8) {
@@ -543,6 +537,9 @@ brw_upload_pipeline_state(struct brw_context *brw,
 
    brw_select_pipeline(brw, pipeline);
 
+   if (pipeline == BRW_RENDER_PIPELINE && brw->current_hash_scale != 1)
+      brw_emit_hashing_mode(brw, UINT_MAX, UINT_MAX, 1);
+
    if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) {
       /* Always re-emit all state. */
       brw->NewGLState = ~0;
diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
index 8f06f5e9ef2..62a8310f68b 100644
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -343,6 +343,12 @@ retry:
    gen8_write_pma_stall_bits(brw, 0);
 #endif
 
+   const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
+   if (brw->current_hash_scale != scale) {
+      brw_emit_hashing_mode(brw, params->x1 - params->x0,
+                            params->y1 - params->y0, scale);
+   }
+
    blorp_emit(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
       rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1;
       rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1;
-- 
2.30.2