From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 10 Aug 2019 19:45:46 +0000 (-0700)
Subject: iris/gen9: Optimize slice and subslice load balancing behavior.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=026773397b1e64e95ad04e271a2de70453672424;p=mesa.git

iris/gen9: Optimize slice and subslice load balancing behavior.

See "i965/gen9: Optimize slice and subslice load balancing behavior."
for the rationale.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---

diff --git a/src/gallium/drivers/iris/iris_blorp.c b/src/gallium/drivers/iris/iris_blorp.c
index 7298e23d23c..7aae5ea7002 100644
--- a/src/gallium/drivers/iris/iris_blorp.c
+++ b/src/gallium/drivers/iris/iris_blorp.c
@@ -307,6 +307,12 @@ iris_blorp_exec(struct blorp_batch *blorp_batch,
 
    iris_require_command_space(batch, 1400);
 
+   const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
+   if (ice->state.current_hash_scale != scale) {
+      genX(emit_hashing_mode)(ice, batch, params->x1 - params->x0,
+                              params->y1 - params->y0, scale);
+   }
+
    blorp_exec(blorp_batch, params);
 
    /* We've smashed all state compared to what the normal 3D pipeline
diff --git a/src/gallium/drivers/iris/iris_context.c b/src/gallium/drivers/iris/iris_context.c
index 8710f010ebf..02b74d39619 100644
--- a/src/gallium/drivers/iris/iris_context.c
+++ b/src/gallium/drivers/iris/iris_context.c
@@ -98,6 +98,7 @@ iris_lost_context_state(struct iris_batch *batch)
    }
 
    ice->state.dirty = ~0ull;
+   ice->state.current_hash_scale = 0;
    memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
    batch->last_surface_base_address = ~0ull;
    ice->vtbl.lost_genx_state(ice, batch);
diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h
index 2ca496d1d03..8b7c5736c85 100644
--- a/src/gallium/drivers/iris/iris_context.h
+++ b/src/gallium/drivers/iris/iris_context.h
@@ -733,6 +733,9 @@ struct iris_context {
 
       /** Records the size of variable-length state for INTEL_DEBUG=bat */
       struct hash_table_u64 *sizes;
+
+      /** Last rendering scale argument provided to genX(emit_hashing_mode). */
+      unsigned current_hash_scale;
    } state;
 };
 
diff --git a/src/gallium/drivers/iris/iris_genx_protos.h b/src/gallium/drivers/iris/iris_genx_protos.h
index 623eb6b4802..16da78d7e9f 100644
--- a/src/gallium/drivers/iris/iris_genx_protos.h
+++ b/src/gallium/drivers/iris/iris_genx_protos.h
@@ -33,6 +33,10 @@ void genX(emit_urb_setup)(struct iris_context *ice,
                           struct iris_batch *batch,
                           const unsigned size[4],
                           bool tess_present, bool gs_present);
+void genX(emit_hashing_mode)(struct iris_context *ice,
+                             struct iris_batch *batch,
+                             unsigned width, unsigned height,
+                             unsigned scale);
 
 /* iris_blorp.c */
 void genX(init_blorp)(struct iris_context *ice);
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 921078b28a2..98ed1331164 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -5192,6 +5192,9 @@ iris_upload_dirty_render_state(struct iris_context *ice,
       }
    }
 
+   if (ice->state.current_hash_scale != 1)
+      genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1);
+
    /* TODO: Gen8 PMA fix */
 }
 
@@ -6462,6 +6465,99 @@ iris_emit_mi_report_perf_count(struct iris_batch *batch,
    }
 }
 
+/**
+ * Update the pixel hashing modes that determine the balancing of PS threads
+ * across subslices and slices.
+ *
+ * \param width Width bound of the rendering area (already scaled down if \p
+ *              scale is greater than 1).
+ * \param height Height bound of the rendering area (already scaled down if \p
+ *               scale is greater than 1).
+ * \param scale The number of framebuffer samples that could potentially be
+ *              affected by an individual channel of the PS thread.  This is
+ *              typically one for single-sampled rendering, but for operations
+ *              like CCS resolves and fast clears a single PS invocation may
+ *              update a huge number of pixels, in which case a finer
+ *              balancing is desirable in order to maximally utilize the
+ *              bandwidth available.  UINT_MAX can be used as shorthand for
+ *              "finest hashing mode available".
+ */
+void
+genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch,
+                        unsigned width, unsigned height, unsigned scale)
+{
+#if GEN_GEN == 9
+   const struct gen_device_info *devinfo = &batch->screen->devinfo;
+   const unsigned slice_hashing[] = {
+      /* Because all Gen9 platforms with more than one slice require
+       * three-way subslice hashing, a single "normal" 16x16 slice hashing
+       * block is guaranteed to suffer from substantial imbalance, with one
+       * subslice receiving twice as much work as the other two in the
+       * slice.
+       *
+       * The performance impact of that would be particularly severe when
+       * three-way hashing is also in use for slice balancing (which is the
+       * case for all Gen9 GT4 platforms), because one of the slices
+       * receives one every three 16x16 blocks in either direction, which
+       * is roughly the periodicity of the underlying subslice imbalance
+       * pattern ("roughly" because in reality the hardware's
+       * implementation of three-way hashing doesn't do exact modulo 3
+       * arithmetic, which somewhat decreases the magnitude of this effect
+       * in practice).  This leads to a systematic subslice imbalance
+       * within that slice regardless of the size of the primitive.  The
+       * 32x32 hashing mode guarantees that the subslice imbalance within a
+       * single slice hashing block is minimal, largely eliminating this
+       * effect.
+       */
+      _32x32,
+      /* Finest slice hashing mode available. */
+      NORMAL
+   };
+   const unsigned subslice_hashing[] = {
+      /* 16x16 would provide a slight cache locality benefit especially
+       * visible in the sampler L1 cache efficiency of low-bandwidth
+       * non-LLC platforms, but it comes at the cost of greater subslice
+       * imbalance for primitives of dimensions approximately intermediate
+       * between 16x4 and 16x16.
+       */
+      _16x4,
+      /* Finest subslice hashing mode available. */
+      _8x4
+   };
+   /* Dimensions of the smallest hashing block of a given hashing mode.  If
+    * the rendering area is smaller than this there can't possibly be any
+    * benefit from switching to this mode, so we optimize out the
+    * transition.
+    */
+   const unsigned min_size[][2] = {
+      { 16, 4 },
+      { 8, 4 }
+   };
+   const unsigned idx = scale > 1;
+
+   if (width > min_size[idx][0] || height > min_size[idx][1]) {
+      uint32_t gt_mode;
+
+      iris_pack_state(GENX(GT_MODE), &gt_mode, reg) {
+         reg.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
+         reg.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
+         reg.SubsliceHashing = subslice_hashing[idx];
+         reg.SubsliceHashingMask = -1;
+      };
+
+      iris_emit_raw_pipe_control(batch,
+                                 "workaround: CS stall before GT_MODE LRI",
+                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
+                                 PIPE_CONTROL_CS_STALL,
+                                 NULL, 0, 0);
+
+      iris_emit_lri(batch, GT_MODE, gt_mode);
+
+      ice->state.current_hash_scale = scale;
+   }
+#endif
+}
+
 void
 genX(init_state)(struct iris_context *ice)
 {