From c2fe7a0fb818578d87f12300523d796205edffd5 Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Wed, 17 Jul 2019 18:41:38 -0700 Subject: [PATCH] anv/gen9: Optimize slice and subslice load balancing behavior. See "i965/gen9: Optimize slice and subslice load balancing behavior." for the rationale. According to Jason, improves Aztec Ruins performance by 2.7%. Reviewed-by: Kenneth Graunke (v1) v2: Undo CPU performance micro-optimization done in i965 and iris due to lack of data justifying it on anv. Use cmd_buffer_apply_pipe_flushes wrapper instead of emitting pipe control command directly. (Jason) Reviewed-by: Jason Ekstrand --- src/intel/vulkan/anv_genX.h | 4 ++ src/intel/vulkan/anv_private.h | 6 ++ src/intel/vulkan/genX_blorp_exec.c | 4 ++ src/intel/vulkan/genX_cmd_buffer.c | 95 ++++++++++++++++++++++++++++++ 4 files changed, 109 insertions(+) diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index a5435e566a3..06c6b467acf 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -44,6 +44,10 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer); void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer); +void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, + unsigned width, unsigned height, + unsigned scale); + void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer); void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 2465f264354..b381386a716 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2421,6 +2421,12 @@ struct anv_cmd_state { bool conditional_render_enabled; + /** + * Last rendering scale argument provided to + * genX(cmd_buffer_emit_hashing_mode)(). + */ + unsigned current_hash_scale; + /** * Array length is anv_cmd_state::pass::attachment_count. Array content is * valid only when recording a render pass instance. diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index 1592e7f7e3d..239bfb47433 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -223,6 +223,10 @@ genX(blorp_exec)(struct blorp_batch *batch, genX(cmd_buffer_config_l3)(cmd_buffer, cfg); } + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0, + params->y1 - params->y0, scale); + #if GEN_GEN >= 11 /* The PIPE_CONTROL command description says: * diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 86ef1663ac4..10da132115c 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1595,6 +1595,7 @@ genX(CmdExecuteCommands)( */ primary->state.current_pipeline = UINT32_MAX; primary->state.current_l3_config = NULL; + primary->state.current_hash_scale = 0; /* Each of the secondary command buffers will use its own state base * address. We need to re-emit state base address for the primary after @@ -2663,6 +2664,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1); + genX(flush_pipeline_select_3d)(cmd_buffer); if (vb_emit) { @@ -3925,6 +3928,98 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) } } +/** + * Update the pixel hashing modes that determine the balancing of PS threads + * across subslices and slices. + * + * \param width Width bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param height Height bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param scale The number of framebuffer samples that could potentially be + * affected by an individual channel of the PS thread. This is + * typically one for single-sampled rendering, but for operations + * like CCS resolves and fast clears a single PS invocation may + * update a huge number of pixels, in which case a finer + * balancing is desirable in order to maximally utilize the + * bandwidth available. UINT_MAX can be used as shorthand for + * "finest hashing mode available". + */ +void +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, + unsigned width, unsigned height, + unsigned scale) +{ +#if GEN_GEN == 9 + const struct gen_device_info *devinfo = &cmd_buffer->device->info; + const unsigned slice_hashing[] = { + /* Because all Gen9 platforms with more than one slice require + * three-way subslice hashing, a single "normal" 16x16 slice hashing + * block is guaranteed to suffer from substantial imbalance, with one + * subslice receiving twice as much work as the other two in the + * slice. + * + * The performance impact of that would be particularly severe when + * three-way hashing is also in use for slice balancing (which is the + * case for all Gen9 GT4 platforms), because one of the slices + * receives one every three 16x16 blocks in either direction, which + * is roughly the periodicity of the underlying subslice imbalance + * pattern ("roughly" because in reality the hardware's + * implementation of three-way hashing doesn't do exact modulo 3 + * arithmetic, which somewhat decreases the magnitude of this effect + * in practice). This leads to a systematic subslice imbalance + * within that slice regardless of the size of the primitive. The + * 32x32 hashing mode guarantees that the subslice imbalance within a + * single slice hashing block is minimal, largely eliminating this + * effect. + */ + _32x32, + /* Finest slice hashing mode available. */ + NORMAL + }; + const unsigned subslice_hashing[] = { + /* 16x16 would provide a slight cache locality benefit especially + * visible in the sampler L1 cache efficiency of low-bandwidth + * non-LLC platforms, but it comes at the cost of greater subslice + * imbalance for primitives of dimensions approximately intermediate + * between 16x4 and 16x16. + */ + _16x4, + /* Finest subslice hashing mode available. */ + _8x4 + }; + /* Dimensions of the smallest hashing block of a given hashing mode. If + * the rendering area is smaller than this there can't possibly be any + * benefit from switching to this mode, so we optimize out the + * transition. + */ + const unsigned min_size[][2] = { + { 16, 4 }, + { 8, 4 } + }; + const unsigned idx = scale > 1; + + if (cmd_buffer->state.current_hash_scale != scale && + (width > min_size[idx][0] || height > min_size[idx][1])) { + uint32_t gt_mode; + + anv_pack_struct(>_mode, GENX(GT_MODE), + .SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0), + .SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0), + .SubsliceHashing = subslice_hashing[idx], + .SubsliceHashingMask = -1); + + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + emit_lri(&cmd_buffer->batch, GENX(GT_MODE_num), gt_mode); + + cmd_buffer->state.current_hash_scale = scale; + } +#endif +} + static void cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) { -- 2.30.2