From: Kenneth Graunke Date: Wed, 22 Oct 2014 15:58:58 +0000 (-0700) Subject: i965: Implement the PMA stall fix. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=7423cc891b4d6fcc63bfeb79cc1d711ce81122bd;p=mesa.git i965: Implement the PMA stall fix. Certain non-promoted depth cases typically incur stalls. In very specific cases, we can enable a workaround which improves performance. Improves performance in GLBenchmark 2.7 TRex by 1.17762% +/- 0.448765% (n=75) at 1280x720 on Broadwell GT3. Haswell has this feature as well, but we can't currently write registers from userspace batches (and we'd incur additional software batch scanning overhead as well), so we haven't enabled it. Broadwell allows us to write CACHE_MODE_1. Backporters beware: the formula and flushing incantation differs between Haswell and Broadwell. v2: Move pma_stall_bits from brw->state to brw itself (requested by Kristian Høgsberg). Signed-off-by: Kenneth Graunke --- diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index eb37e750f6d..656cbe8ce84 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1092,6 +1092,9 @@ struct brw_context /* Whether the last depth/stencil packets were both NULL. */ bool no_depth_or_stencil; + /* The last PMA stall bits programmed. */ + uint32_t pma_stall_bits; + struct { /** Does the current draw use the index buffer? */ bool indexed; diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index 2efe56e4c73..209fab10c85 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -137,6 +137,7 @@ extern const struct brw_tracked_state gen8_disable_stages; extern const struct brw_tracked_state gen8_gs_state; extern const struct brw_tracked_state gen8_index_buffer; extern const struct brw_tracked_state gen8_multisample_state; +extern const struct brw_tracked_state gen8_pma_fix; extern const struct brw_tracked_state gen8_ps_blend; extern const struct brw_tracked_state gen8_ps_extra; extern const struct brw_tracked_state gen8_ps_state; diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index a6913195848..1c53e5b7d7d 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -333,6 +333,7 @@ static const struct brw_tracked_state *gen8_atoms[] = &gen8_vertices, &haswell_cut_index, + &gen8_pma_fix, }; static void @@ -390,6 +391,11 @@ void brw_init_state( struct brw_context *brw ) brw->state.dirty.mesa = ~0; brw->state.dirty.brw = ~0ull; + /* ~0 is a nonsensical value which won't match anything we program, so + * the programming will take effect on the first time around. + */ + brw->pma_stall_bits = ~0; + /* Make sure that brw->state.dirty.brw has enough bits to hold all possible * dirty flags. */ diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c index a24032c343c..e716141e62b 100644 --- a/src/mesa/drivers/dri/i965/gen8_depth_state.c +++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c @@ -28,6 +28,7 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" +#include "brw_wm.h" /** * Helper function to emit depth related command packets. @@ -211,6 +212,172 @@ gen8_emit_depth_stencil_hiz(struct brw_context *brw, hiz, width, height, depth, lod, min_array_element); } +/** + * Should we set the PMA FIX ENABLE bit? + * + * To avoid unnecessary depth related stalls, we need to set this bit. + * However, there is a very complicated formula which governs when it + * is legal to do so. This function computes that. + * + * See the documenation for the CACHE_MODE_1 register, bit 11. + */ +static bool +pma_fix_enable(const struct brw_context *brw) +{ + const struct gl_context *ctx = &brw->ctx; + /* BRW_NEW_FRAGMENT_PROGRAM */ + const struct gl_fragment_program *fp = brw->fragment_program; + /* _NEW_BUFFERS */ + struct intel_renderbuffer *depth_irb = + intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH); + + /* 3DSTATE_WM::ForceThreadDispatch is never used. */ + const bool wm_force_thread_dispatch = false; + + /* 3DSTATE_RASTER::ForceSampleCount is never used. */ + const bool raster_force_sample_count_nonzero = false; + + /* _NEW_BUFFERS: + * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable + */ + const bool hiz_enabled = depth_irb && intel_renderbuffer_has_hiz(depth_irb); + + /* 3DSTATE_WM::Early Depth/Stencil Control != EDSC_PREPS (2). + * We always leave this set to EDSC_NORMAL (0). + */ + const bool edsc_not_preps = true; + + /* 3DSTATE_PS_EXTRA::PixelShaderValid is always true. */ + const bool pixel_shader_valid = true; + + /* !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) + * + * HiZ operations are done outside of the normal state upload, so they're + * definitely not happening now. + */ + const bool in_hiz_op = false; + + /* _NEW_DEPTH: + * DEPTH_STENCIL_STATE::DepthTestEnable + */ + const bool depth_test_enabled = depth_irb && ctx->Depth.Test; + + /* _NEW_DEPTH: + * 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable && + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE. + */ + const bool depth_writes_enabled = ctx->Depth.Mask; + + /* _NEW_STENCIL: + * !DEPTH_STENCIL_STATE::Stencil Buffer Write Enable || + * !3DSTATE_DEPTH_BUFFER::Stencil Buffer Enable || + * !3DSTATE_STENCIL_BUFFER::Stencil Buffer Enable + */ + const bool stencil_writes_enabled = ctx->Stencil._WriteEnabled; + + /* BRW_NEW_FRAGMENT_PROGRAM: + * 3DSTATE_PS_EXTRA::Pixel Shader Computed Depth Mode == PSCDEPTH_OFF + */ + const bool ps_computes_depth = + (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) && + fp->FragDepthLayout != FRAG_DEPTH_LAYOUT_UNCHANGED; + + /* BRW_NEW_FRAGMENT_PROGRAM: 3DSTATE_PS_EXTRA::PixelShaderKillsPixels + * CACHE_NEW_WM_PROG: 3DSTATE_PS_EXTRA::oMask Present to RenderTarget + * _NEW_MULTISAMPLE: 3DSTATE_PS_BLEND::AlphaToCoverageEnable + * _NEW_COLOR: 3DSTATE_PS_BLEND::AlphaTestEnable + * + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false. + * 3DSTATE_WM::ForceKillPix != ForceOff is always true. + */ + const bool kill_pixel = + fp->UsesKill || + brw->wm.prog_data->uses_omask || + (ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage) || + ctx->Color.AlphaEnabled; + + /* The big formula in CACHE_MODE_1::NP PMA FIX ENABLE. */ + return !wm_force_thread_dispatch && + !raster_force_sample_count_nonzero && + hiz_enabled && + edsc_not_preps && + pixel_shader_valid && + !in_hiz_op && + depth_test_enabled && + (ps_computes_depth || + (kill_pixel && (depth_writes_enabled || stencil_writes_enabled))); +} + +static void +write_pma_stall_bits(struct brw_context *brw, uint32_t pma_stall_bits) +{ + struct gl_context *ctx = &brw->ctx; + + /* If we haven't actually changed the value, bail now to avoid unnecessary + * pipeline stalls and register writes. + */ + if (brw->pma_stall_bits == pma_stall_bits) + return; + + brw->pma_stall_bits = pma_stall_bits; + + /* According to the PIPE_CONTROL documentation, software should emit a + * PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set prior + * to the LRI. If stencil buffer writes are enabled, then a Render Cache + * Flush is also necessary. + */ + const uint32_t render_cache_flush = + ctx->Stencil._WriteEnabled ? PIPE_CONTROL_WRITE_FLUSH : 0; + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + render_cache_flush); + + /* CACHE_MODE_1 is a non-privileged register. */ + BEGIN_BATCH(3); + OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); + OUT_BATCH(GEN7_CACHE_MODE_1); + OUT_BATCH(GEN8_HIZ_PMA_MASK_BITS | pma_stall_bits); + ADVANCE_BATCH(); + + /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache + * Flush bits is often necessary. We do it regardless because it's easier. + * The render cache flush is also necessary if stencil writes are enabled. + */ + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + render_cache_flush); + +} + +static void +gen8_emit_pma_stall_workaround(struct brw_context *brw) +{ + uint32_t bits = 0; + if (pma_fix_enable(brw)) + bits |= GEN8_HIZ_NP_PMA_FIX_ENABLE | GEN8_HIZ_NP_EARLY_Z_FAILS_DISABLE; + + write_pma_stall_bits(brw, bits); +} + +const struct brw_tracked_state gen8_pma_fix = { + .dirty = { + .mesa = _NEW_BUFFERS | + _NEW_COLOR | + _NEW_DEPTH | + _NEW_MULTISAMPLE | + _NEW_STENCIL, + .brw = BRW_NEW_FRAGMENT_PROGRAM, + .cache = CACHE_NEW_WM_PROG, + }, + .emit = gen8_emit_pma_stall_workaround +}; + /** * Emit packets to perform a depth/HiZ resolve or fast depth/stencil clear. * @@ -224,6 +391,9 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt, if (op == GEN6_HIZ_OP_NONE) return; + /* Disable the PMA stall fix since we're about to do a HiZ operation. */ + write_pma_stall_bits(brw, 0); + assert(mt->first_level == 0); assert(mt->logical_depth0 >= 1);