i965/gen9: Optimize slice and subslice load balancing behavior.

author Francisco Jerez <currojerez@riseup.net>

Wed, 17 Jul 2019 21:56:18 +0000 (14:56 -0700)

committer Francisco Jerez <currojerez@riseup.net>

Mon, 12 Aug 2019 20:17:58 +0000 (13:17 -0700)
author Francisco Jerez <currojerez@riseup.net>
Wed, 17 Jul 2019 21:56:18 +0000 (14:56 -0700)
committer Francisco Jerez <currojerez@riseup.net>
Mon, 12 Aug 2019 20:17:58 +0000 (13:17 -0700)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h

index 2ac443bf032849a18a9a89b7f71e56dc1e7f1aee..17639bf59959a9880b3800512c89122dcbd269e8 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1219,6 +1219,9 @@ struct brw_context
  
     enum gen9_astc5x5_wa_tex_type gen9_astc5x5_wa_tex_mask;
  
+   /** Last rendering scale argument provided to brw_emit_hashing_mode(). */
+   unsigned current_hash_scale;
+
     __DRIcontext *driContext;
     struct intel_screen *screen;
  };
@@ -1265,6 +1268,8 @@ GLboolean brwCreateContext(gl_api api,
   */
  void brw_workaround_depthstencil_alignment(struct brw_context *brw,
                                             GLbitfield clear_mask);
+void brw_emit_hashing_mode(struct brw_context *brw, unsigned width,
+                           unsigned height, unsigned scale);
  
  /* brw_object_purgeable.c */
  void brw_init_object_purgeable_functions(struct dd_function_table *functions);
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h

index 425f55341102c1cc8cd4cbfe526b816bdf463275..33d042be86939d5647ff3ed0cef653579605fdad 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1570,6 +1570,11 @@ enum brw_pixel_shader_coverage_mask_mode {
  # define GEN9_SUBSLICE_HASHING_8x4      (2 << 8)
  # define GEN9_SUBSLICE_HASHING_16x16    (3 << 8)
  # define GEN9_SUBSLICE_HASHING_MASK_BITS REG_MASK(3 << 8)
+# define GEN9_SLICE_HASHING_NORMAL      (0 << 11)
+# define GEN9_SLICE_HASHING_DISABLED    (1 << 11)
+# define GEN9_SLICE_HASHING_32x16       (2 << 11)
+# define GEN9_SLICE_HASHING_32x32       (3 << 11)
+# define GEN9_SLICE_HASHING_MASK_BITS REG_MASK(3 << 11)
  
  /* Predicate registers */
  #define MI_PREDICATE_SRC0               0x2400
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c

index e73cadc5d3e6f8a8cfd99c9b456dbcefa016898e..1291470d4795a79b21686b97f36c608e1d311762 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -601,6 +601,96 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
     }
  }
  
+/**
+ * Update the pixel hashing modes that determine the balancing of PS threads
+ * across subslices and slices.
+ *
+ * \param width Width bound of the rendering area (already scaled down if \p
+ *              scale is greater than 1).
+ * \param height Height bound of the rendering area (already scaled down if \p
+ *               scale is greater than 1).
+ * \param scale The number of framebuffer samples that could potentially be
+ *              affected by an individual channel of the PS thread.  This is
+ *              typically one for single-sampled rendering, but for operations
+ *              like CCS resolves and fast clears a single PS invocation may
+ *              update a huge number of pixels, in which case a finer
+ *              balancing is desirable in order to maximally utilize the
+ *              bandwidth available.  UINT_MAX can be used as shorthand for
+ *              "finest hashing mode available".
+ */
+void
+brw_emit_hashing_mode(struct brw_context *brw, unsigned width,
+                      unsigned height, unsigned scale)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   if (devinfo->gen == 9) {
+      const uint32_t slice_hashing[] = {
+         /* Because all Gen9 platforms with more than one slice require
+          * three-way subslice hashing, a single "normal" 16x16 slice hashing
+          * block is guaranteed to suffer from substantial imbalance, with one
+          * subslice receiving twice as much work as the other two in the
+          * slice.
+          *
+          * The performance impact of that would be particularly severe when
+          * three-way hashing is also in use for slice balancing (which is the
+          * case for all Gen9 GT4 platforms), because one of the slices
+          * receives one every three 16x16 blocks in either direction, which
+          * is roughly the periodicity of the underlying subslice imbalance
+          * pattern ("roughly" because in reality the hardware's
+          * implementation of three-way hashing doesn't do exact modulo 3
+          * arithmetic, which somewhat decreases the magnitude of this effect
+          * in practice).  This leads to a systematic subslice imbalance
+          * within that slice regardless of the size of the primitive.  The
+          * 32x32 hashing mode guarantees that the subslice imbalance within a
+          * single slice hashing block is minimal, largely eliminating this
+          * effect.
+          */
+         GEN9_SLICE_HASHING_32x32,
+         /* Finest slice hashing mode available. */
+         GEN9_SLICE_HASHING_NORMAL
+      };
+      const uint32_t subslice_hashing[] = {
+         /* The 16x16 subslice hashing mode is used on non-LLC platforms to
+          * match the performance of previous Mesa versions.  16x16 has a
+          * slight cache locality benefit especially visible in the sampler L1
+          * cache efficiency of low-bandwidth platforms, but it comes at the
+          * cost of greater subslice imbalance for primitives of dimensions
+          * approximately intermediate between 16x4 and 16x16.
+          */
+         (devinfo->has_llc ? GEN9_SUBSLICE_HASHING_16x4 :
+                             GEN9_SUBSLICE_HASHING_16x16),
+         /* Finest subslice hashing mode available. */
+         GEN9_SUBSLICE_HASHING_8x4
+      };
+      /* Dimensions of the smallest hashing block of a given hashing mode.  If
+       * the rendering area is smaller than this there can't possibly be any
+       * benefit from switching to this mode, so we optimize out the
+       * transition.
+       */
+      const unsigned min_size[][2] = {
+         { 16, 4 },
+         { 8, 4 }
+      };
+      const unsigned idx = scale > 1;
+
+      if (width > min_size[idx][0] || height > min_size[idx][1]) {
+         const uint32_t gt_mode =
+            (devinfo->num_slices == 1 ? 0 :
+             GEN9_SLICE_HASHING_MASK_BITS | slice_hashing[idx]) |
+            GEN9_SUBSLICE_HASHING_MASK_BITS | subslice_hashing[idx];
+
+         brw_emit_pipe_control_flush(brw,
+                                     PIPE_CONTROL_STALL_AT_SCOREBOARD |
+                                     PIPE_CONTROL_CS_STALL);
+
+         brw_load_register_imm32(brw, GEN7_GT_MODE, gt_mode);
+
+         brw->current_hash_scale = scale;
+      }
+   }
+}
+
  /**
   * Misc invariant state packets
   */
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c

index ac9ee2dabf10e2d62ed4a89d1ce5c60a00f0cf8b..7a2daf4a5335c23e7152788dcabe78f8915a1d6c 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -133,12 +133,6 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
                                REG_MASK(GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC) |
                                GEN9_FLOAT_BLEND_OPTIMIZATION_ENABLE |
                                GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC);
-
-      if (gen_device_info_is_9lp(devinfo)) {
-         brw_load_register_imm32(brw, GEN7_GT_MODE,
-                                 GEN9_SUBSLICE_HASHING_MASK_BITS |
-                                 GEN9_SUBSLICE_HASHING_16x16);
-      }
     }
  
     if (devinfo->gen >= 8) {
@@ -543,6 +537,9 @@ brw_upload_pipeline_state(struct brw_context *brw,
  
     brw_select_pipeline(brw, pipeline);
  
+   if (pipeline == BRW_RENDER_PIPELINE && brw->current_hash_scale != 1)
+      brw_emit_hashing_mode(brw, UINT_MAX, UINT_MAX, 1);
+
     if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) {
        /* Always re-emit all state. */
        brw->NewGLState = ~0;
diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c

index 8f06f5e9ef2525295def9bc7f2caab628ed20539..62a8310f68bc766740338620cb2e2f3b3a456470 100644 (file)
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -343,6 +343,12 @@ retry:
     gen8_write_pma_stall_bits(brw, 0);
  #endif
  
+   const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
+   if (brw->current_hash_scale != scale) {
+      brw_emit_hashing_mode(brw, params->x1 - params->x0,
+                            params->y1 - params->y0, scale);
+   }
+
     blorp_emit(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
        rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1;
        rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1;
author	Francisco Jerez <currojerez@riseup.net>
	Wed, 17 Jul 2019 21:56:18 +0000 (14:56 -0700)
committer	Francisco Jerez <currojerez@riseup.net>
	Mon, 12 Aug 2019 20:17:58 +0000 (13:17 -0700)
src/mesa/drivers/dri/i965/brw_context.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_defines.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_misc_state.c		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_state_upload.c		patch \| blob \| history
src/mesa/drivers/dri/i965/genX_blorp_exec.c		patch \| blob \| history