From 026773397b1e64e95ad04e271a2de70453672424 Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Sat, 10 Aug 2019 12:45:46 -0700 Subject: [PATCH] iris/gen9: Optimize slice and subslice load balancing behavior. See "i965/gen9: Optimize slice and subslice load balancing behavior." for the rationale. Reviewed-by: Kenneth Graunke --- src/gallium/drivers/iris/iris_blorp.c | 6 ++ src/gallium/drivers/iris/iris_context.c | 1 + src/gallium/drivers/iris/iris_context.h | 3 + src/gallium/drivers/iris/iris_genx_protos.h | 4 + src/gallium/drivers/iris/iris_state.c | 96 +++++++++++++++++++++ 5 files changed, 110 insertions(+) diff --git a/src/gallium/drivers/iris/iris_blorp.c b/src/gallium/drivers/iris/iris_blorp.c index 7298e23d23c..7aae5ea7002 100644 --- a/src/gallium/drivers/iris/iris_blorp.c +++ b/src/gallium/drivers/iris/iris_blorp.c @@ -307,6 +307,12 @@ iris_blorp_exec(struct blorp_batch *blorp_batch, iris_require_command_space(batch, 1400); + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; + if (ice->state.current_hash_scale != scale) { + genX(emit_hashing_mode)(ice, batch, params->x1 - params->x0, + params->y1 - params->y0, scale); + } + blorp_exec(blorp_batch, params); /* We've smashed all state compared to what the normal 3D pipeline diff --git a/src/gallium/drivers/iris/iris_context.c b/src/gallium/drivers/iris/iris_context.c index 8710f010ebf..02b74d39619 100644 --- a/src/gallium/drivers/iris/iris_context.c +++ b/src/gallium/drivers/iris/iris_context.c @@ -98,6 +98,7 @@ iris_lost_context_state(struct iris_batch *batch) } ice->state.dirty = ~0ull; + ice->state.current_hash_scale = 0; memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid)); batch->last_surface_base_address = ~0ull; ice->vtbl.lost_genx_state(ice, batch); diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h index 2ca496d1d03..8b7c5736c85 100644 --- a/src/gallium/drivers/iris/iris_context.h +++ b/src/gallium/drivers/iris/iris_context.h @@ 
-733,6 +733,9 @@ struct iris_context { /** Records the size of variable-length state for INTEL_DEBUG=bat */ struct hash_table_u64 *sizes; + + /** Last rendering scale argument provided to genX(emit_hashing_mode). */ + unsigned current_hash_scale; } state; }; diff --git a/src/gallium/drivers/iris/iris_genx_protos.h b/src/gallium/drivers/iris/iris_genx_protos.h index 623eb6b4802..16da78d7e9f 100644 --- a/src/gallium/drivers/iris/iris_genx_protos.h +++ b/src/gallium/drivers/iris/iris_genx_protos.h @@ -33,6 +33,10 @@ void genX(emit_urb_setup)(struct iris_context *ice, struct iris_batch *batch, const unsigned size[4], bool tess_present, bool gs_present); +void genX(emit_hashing_mode)(struct iris_context *ice, + struct iris_batch *batch, + unsigned width, unsigned height, + unsigned scale); /* iris_blorp.c */ void genX(init_blorp)(struct iris_context *ice); diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 921078b28a2..98ed1331164 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -5192,6 +5192,9 @@ iris_upload_dirty_render_state(struct iris_context *ice, } } + if (ice->state.current_hash_scale != 1) + genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1); + /* TODO: Gen8 PMA fix */ } @@ -6462,6 +6465,99 @@ iris_emit_mi_report_perf_count(struct iris_batch *batch, } } +/** + * Update the pixel hashing modes that determine the balancing of PS threads + * across subslices and slices. + * + * \param width Width bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param height Height bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param scale The number of framebuffer samples that could potentially be + * affected by an individual channel of the PS thread. 
This is + * typically one for single-sampled rendering, but for operations + * like CCS resolves and fast clears a single PS invocation may + * update a huge number of pixels, in which case a finer + * balancing is desirable in order to maximally utilize the + * bandwidth available. UINT_MAX can be used as shorthand for + * "finest hashing mode available". + */ +void +genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch, + unsigned width, unsigned height, unsigned scale) +{ +#if GEN_GEN == 9 + const struct gen_device_info *devinfo = &batch->screen->devinfo; + const unsigned slice_hashing[] = { + /* Because all Gen9 platforms with more than one slice require + * three-way subslice hashing, a single "normal" 16x16 slice hashing + * block is guaranteed to suffer from substantial imbalance, with one + * subslice receiving twice as much work as the other two in the + * slice. + * + * The performance impact of that would be particularly severe when + * three-way hashing is also in use for slice balancing (which is the + * case for all Gen9 GT4 platforms), because one of the slices + * receives one every three 16x16 blocks in either direction, which + * is roughly the periodicity of the underlying subslice imbalance + * pattern ("roughly" because in reality the hardware's + * implementation of three-way hashing doesn't do exact modulo 3 + * arithmetic, which somewhat decreases the magnitude of this effect + * in practice). This leads to a systematic subslice imbalance + * within that slice regardless of the size of the primitive. The + * 32x32 hashing mode guarantees that the subslice imbalance within a + * single slice hashing block is minimal, largely eliminating this + * effect. + */ + _32x32, + /* Finest slice hashing mode available. 
*/ + NORMAL + }; + const unsigned subslice_hashing[] = { + /* 16x16 would provide a slight cache locality benefit especially + * visible in the sampler L1 cache efficiency of low-bandwidth + * non-LLC platforms, but it comes at the cost of greater subslice + * imbalance for primitives of dimensions approximately intermediate + * between 16x4 and 16x16. + */ + _16x4, + /* Finest subslice hashing mode available. */ + _8x4 + }; + /* Dimensions of the smallest hashing block of a given hashing mode. If + * the rendering area is smaller than this there can't possibly be any + * benefit from switching to this mode, so we optimize out the + * transition. + */ + const unsigned min_size[][2] = { + { 16, 4 }, + { 8, 4 } + }; + const unsigned idx = scale > 1; + + if (width > min_size[idx][0] || height > min_size[idx][1]) { + uint32_t gt_mode; + + iris_pack_state(GENX(GT_MODE), &gt_mode, reg) { + reg.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0); + reg.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0); + reg.SubsliceHashing = subslice_hashing[idx]; + reg.SubsliceHashingMask = -1; + }; + + iris_emit_raw_pipe_control(batch, + "workaround: CS stall before GT_MODE LRI", + PIPE_CONTROL_STALL_AT_SCOREBOARD | + PIPE_CONTROL_CS_STALL, + NULL, 0, 0); + + iris_emit_lri(batch, GT_MODE, gt_mode); + + ice->state.current_hash_scale = scale; + } +#endif +} + void genX(init_state)(struct iris_context *ice) { -- 2.30.2