From 0b7ecfdda5900135c26e8b94285b8addc1055f09 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Wed, 25 Sep 2019 00:31:07 -0700 Subject: [PATCH] iris: Implement the Broadwell NP Z PMA Stall Fix This should help avoid stalls in the pixel mask array in certain non-promoted depth cases. It especially helps for Z16, as each bit in the PMA corresponds to two pixels when using Z16, as opposed to the usual one pixel. Improves performance in GFXBench5 TRex by 22% (n=1). --- src/gallium/drivers/iris/iris_blorp.c | 4 + src/gallium/drivers/iris/iris_context.h | 1 + src/gallium/drivers/iris/iris_genx_protos.h | 3 + src/gallium/drivers/iris/iris_program.c | 5 + src/gallium/drivers/iris/iris_state.c | 198 +++++++++++++++++++- 5 files changed, 209 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/iris/iris_blorp.c b/src/gallium/drivers/iris/iris_blorp.c index fa46dafcda2..d6b99c59ca7 100644 --- a/src/gallium/drivers/iris/iris_blorp.c +++ b/src/gallium/drivers/iris/iris_blorp.c @@ -307,6 +307,10 @@ iris_blorp_exec(struct blorp_batch *blorp_batch, iris_require_command_space(batch, 1400); +#if GEN_GEN == 8 + genX(update_pma_fix)(ice, batch, false); +#endif + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; if (ice->state.current_hash_scale != scale) { genX(emit_hashing_mode)(ice, batch, params->x1 - params->x0, diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h index 85ffd1fece4..74a66f4a5cf 100644 --- a/src/gallium/drivers/iris/iris_context.h +++ b/src/gallium/drivers/iris/iris_context.h @@ -134,6 +134,7 @@ enum { #define IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES (1ull << 55) #define IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES (1ull << 56) #define IRIS_DIRTY_VF_STATISTICS (1ull << 57) +#define IRIS_DIRTY_PMA_FIX (1ull << 58) #define IRIS_ALL_DIRTY_FOR_COMPUTE (IRIS_DIRTY_CS | \ IRIS_DIRTY_SAMPLER_STATES_CS | \ diff --git a/src/gallium/drivers/iris/iris_genx_protos.h b/src/gallium/drivers/iris/iris_genx_protos.h index 16da78d7e9f..84d4b4b324c 100644 --- a/src/gallium/drivers/iris/iris_genx_protos.h +++ b/src/gallium/drivers/iris/iris_genx_protos.h @@ -37,6 +37,9 @@ void genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch, unsigned width, unsigned height, unsigned scale); +void genX(update_pma_fix)(struct iris_context *ice, + struct iris_batch *batch, + bool enable); /* iris_blorp.c */ void genX(init_blorp)(struct iris_context *ice); diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c index ae701ec984d..886cdff56b6 100644 --- a/src/gallium/drivers/iris/iris_program.c +++ b/src/gallium/drivers/iris/iris_program.c @@ -2387,6 +2387,8 @@ static void iris_bind_fs_state(struct pipe_context *ctx, void *state) { struct iris_context *ice = (struct iris_context *) ctx; + struct iris_screen *screen = (struct iris_screen *) ctx->screen; + const struct gen_device_info *devinfo = &screen->devinfo; struct iris_uncompiled_shader *old_ish = ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]; struct iris_uncompiled_shader *new_ish = state; @@ -2401,6 +2403,9 @@ iris_bind_fs_state(struct pipe_context *ctx, void *state) (new_ish->nir->info.outputs_written & color_bits)) ice->state.dirty |= IRIS_DIRTY_PS_BLEND; + if (devinfo->gen == 8) + ice->state.dirty |= IRIS_DIRTY_PMA_FIX; + bind_shader_state((void *) ctx, state, MESA_SHADER_FRAGMENT); } diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 6f0aa182339..ac6a5dd5fd1 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -1055,6 +1055,10 @@ struct iris_genx_state { uint32_t so_buffers[4 * GENX(3DSTATE_SO_BUFFER_length)]; +#if GEN_GEN == 8 + bool pma_fix_enabled; +#endif + #if GEN_GEN == 9 /* Is object level preemption enabled? */ bool object_preemption; @@ -1242,6 +1246,9 @@ iris_bind_blend_state(struct pipe_context *ctx, void *state) ice->state.dirty |= IRIS_DIRTY_BLEND_STATE; ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_BLEND]; + + if (GEN_GEN == 8) + ice->state.dirty |= IRIS_DIRTY_PMA_FIX; } /** @@ -1276,6 +1283,9 @@ struct iris_depth_stencil_alpha_state { /** Outbound to resolve and cache set tracking. */ bool depth_writes_enabled; bool stencil_writes_enabled; + + /** Outbound to Gen8-9 PMA stall equations */ + bool depth_test_enabled; }; /** @@ -1295,6 +1305,7 @@ iris_create_zsa_state(struct pipe_context *ctx, cso->alpha = state->alpha; cso->depth_writes_enabled = state->depth.writemask; + cso->depth_test_enabled = state->depth.enabled; cso->stencil_writes_enabled = state->stencil[0].writemask != 0 || (two_sided_stencil && state->stencil[1].writemask != 0); @@ -1364,6 +1375,181 @@ iris_bind_zsa_state(struct pipe_context *ctx, void *state) ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT; ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL; ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA]; + + if (GEN_GEN == 8) + ice->state.dirty |= IRIS_DIRTY_PMA_FIX; +} + +#if GEN_GEN == 8 +static bool +want_pma_fix(struct iris_context *ice) +{ + UNUSED struct iris_screen *screen = (void *) ice->ctx.screen; + UNUSED const struct gen_device_info *devinfo = &screen->devinfo; + const struct brw_wm_prog_data *wm_prog_data = (void *) + ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data; + const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + const struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa; + const struct iris_blend_state *cso_blend = ice->state.cso_blend; + + /* In very specific combinations of state, we can instruct Gen8-9 hardware + * to avoid stalling at the pixel mask array. The state equations are + * documented in these places: + * + * - Gen8 Depth PMA Fix: CACHE_MODE_1::NP_PMA_FIX_ENABLE + * - Gen9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable + * + * Both equations share some common elements: + * + * no_hiz_op = + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) && + * + * killpixels = + * 3DSTATE_WM::ForceKillPix != ForceOff && + * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) + * + * (Technically the stencil PMA treats ForceKillPix differently, + * but I think this is a documentation oversight, and we don't + * ever use it in this way, so it doesn't matter). + * + * common_pma_fix = + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 && + * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable && + * 3DSTATE_WM::EDSC_Mode != EDSC_PREPS && + * 3DSTATE_PS_EXTRA::PixelShaderValid && + * no_hiz_op + * + * These are always true: + * + * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 + * 3DSTATE_PS_EXTRA::PixelShaderValid + * + * Also, we never use the normal drawing path for HiZ ops; these are true: + * + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) + * + * This happens sometimes: + * + * 3DSTATE_WM::ForceThreadDispatch != 1 + * + * However, we choose to ignore it as it either agrees with the signal + * (dispatch was already enabled, so nothing out of the ordinary), or + * there are no framebuffer attachments (so no depth or HiZ anyway, + * meaning the PMA signal will already be disabled). + */ + + if (!cso_fb->zsbuf) + return false; + + struct iris_resource *zres, *sres; + iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, &zres, &sres); + + /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable && + */ + if (!zres || !iris_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level)) + return false; + + /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */ + if (wm_prog_data->early_fragment_tests) + return false; + + /* 3DSTATE_WM::ForceKillPix != ForceOff && + * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) + */ + bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask || + cso_blend->alpha_to_coverage || cso_zsa->alpha.enabled; + + /* The Gen8 depth PMA equation becomes: + * + * depth_writes = + * 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable && + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE + * + * stencil_writes = + * 3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE && + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE + * + * Z_PMA_OPT = + * common_pma_fix && + * 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable && + * ((killpixels && (depth_writes || stencil_writes)) || + * 3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF) + * + */ + if (!cso_zsa->depth_test_enabled) + return false; + + return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF || + (killpixels && (cso_zsa->depth_writes_enabled || + (sres && cso_zsa->stencil_writes_enabled))); +} +#endif + +void +genX(update_pma_fix)(struct iris_context *ice, + struct iris_batch *batch, + bool enable) +{ +#if GEN_GEN == 8 + struct iris_genx_state *genx = ice->state.genx; + + if (genx->pma_fix_enabled == enable) + return; + + genx->pma_fix_enabled = enable; + + /* According to the Broadwell PIPE_CONTROL documentation, software should + * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set + * prior to the LRI. If stencil buffer writes are enabled, then a Render * Cache Flush is also necessary. + * + * The Gen9 docs say to use a depth stall rather than a command streamer + * stall. However, the hardware seems to violently disagree. A full + * command streamer stall seems to be needed in both cases. + */ + iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_RENDER_TARGET_FLUSH); + + uint32_t reg_val; + iris_pack_state(GENX(CACHE_MODE_1), ®_val, reg) { + reg.NPPMAFixEnable = enable; + reg.NPEarlyZFailsDisable = enable; + reg.NPPMAFixEnableMask = true; + reg.NPEarlyZFailsDisableMask = true; + } + iris_emit_lri(batch, CACHE_MODE_1, reg_val); + + /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache + * Flush bits is often necessary. We do it regardless because it's easier. + * The render cache flush is also necessary if stencil writes are enabled. + * + * Again, the Gen9 docs give a different set of flushes but the Broadwell + * flushes seem to work just as well. + */ + iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)", + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_RENDER_TARGET_FLUSH); +#endif } /** @@ -2816,6 +3002,9 @@ iris_set_framebuffer_state(struct pipe_context *ctx, ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_FRAMEBUFFER]; + if (GEN_GEN == 8) + ice->state.dirty |= IRIS_DIRTY_PMA_FIX; + #if GEN_GEN == 11 // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?) // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6 @@ -5642,10 +5831,15 @@ iris_upload_dirty_render_state(struct iris_context *ice, } } +#if GEN_GEN == 8 + if (dirty & IRIS_DIRTY_PMA_FIX) { + bool enable = want_pma_fix(ice); + genX(update_pma_fix)(ice, batch, enable); + } +#endif + if (ice->state.current_hash_scale != 1) genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1); - - /* TODO: Gen8 PMA fix */ } static void -- 2.30.2