From e8d52dab482045ced3f5b9154aedbb8641fa961a Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 6 Dec 2016 17:52:14 -0800 Subject: [PATCH] anv: Add support for the PMA fix on Broadwell This helps Dota 2 on Broadwell by 8-9%. I also hacked up the driver and used the Sascha "shadowmapping" demo to get some results. Setting uses_kill to true dropped the framerate on the demo by 25-30%. Enabling the PMA fix brought it back up to around 90% of the original framerate. Reviewed-by: Lionel Landwerlin Reviewed-by: Nanley Chery --- src/intel/vulkan/TODO | 1 - src/intel/vulkan/anv_cmd_buffer.c | 2 + src/intel/vulkan/anv_genX.h | 3 + src/intel/vulkan/anv_private.h | 17 ++++ src/intel/vulkan/gen7_cmd_buffer.c | 7 ++ src/intel/vulkan/gen8_cmd_buffer.c | 131 +++++++++++++++++++++++++++++ src/intel/vulkan/genX_blorp_exec.c | 5 ++ src/intel/vulkan/genX_cmd_buffer.c | 15 +++- src/intel/vulkan/genX_pipeline.c | 42 +++++++++ 9 files changed, 221 insertions(+), 2 deletions(-) diff --git a/src/intel/vulkan/TODO b/src/intel/vulkan/TODO index 38acc0dd5b6..f8b73a11315 100644 --- a/src/intel/vulkan/TODO +++ b/src/intel/vulkan/TODO @@ -12,5 +12,4 @@ Performance: - Compressed multisample support - Pushing pieces of UBOs? - Enable guardband clipping - - pma stall workaround - Use soft-pin to avoid relocations diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 5886fa63e69..8c08f8d48e6 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -135,6 +135,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer) state->restart_index = UINT32_MAX; state->dynamic = default_dynamic_state; state->need_query_wa = true; + state->pma_fix_enabled = false; + state->hiz_enabled = false; if (state->attachments != NULL) { vk_free(&cmd_buffer->pool->alloc, state->attachments); diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index d04fe38a517..67147b0e92b 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -55,6 +55,9 @@ void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer); void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer); +void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, + bool enable); + void genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, const struct gen_l3_config *l3_config, diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 89ae8183799..449aa0f4ab6 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1168,6 +1168,20 @@ struct anv_cmd_state { struct anv_dynamic_state dynamic; bool need_query_wa; + /** + * Whether or not the gen8 PMA fix is enabled. We ensure that, at the top + * of any command buffer it is disabled by disabling it in EndCommandBuffer + * and before invoking the secondary in ExecuteCommands. + */ + bool pma_fix_enabled; + + /** + * Whether or not we know for certain that HiZ is enabled for the current + * subpass. If, for whatever reason, we are unsure as to whether HiZ is + * enabled or not, this will be false. + */ + bool hiz_enabled; + /** * Array length is anv_cmd_state::pass::attachment_count. Array content is * valid only when recording a render pass instance. @@ -1471,8 +1485,11 @@ struct anv_pipeline { uint32_t cs_right_mask; + bool writes_depth; + bool depth_test_enable; bool writes_stencil; bool depth_clamp_enable; + bool kill_pixel; struct { uint32_t sf[7]; diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c index 013ed8718a4..c1a25e8ce26 100644 --- a/src/intel/vulkan/gen7_cmd_buffer.c +++ b/src/intel/vulkan/gen7_cmd_buffer.c @@ -260,6 +260,13 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.dirty = 0; } +void +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, + bool enable) +{ + /* The NP PMA fix doesn't exist on gen7 */ +} + void genX(CmdSetEvent)( VkCommandBuffer commandBuffer, VkEvent event, diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c index 8c8de622eb8..0628f3a0dd1 100644 --- a/src/intel/vulkan/gen8_cmd_buffer.c +++ b/src/intel/vulkan/gen8_cmd_buffer.c @@ -154,6 +154,133 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer) #endif +void +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable) +{ +#if GEN_GEN == 8 + if (cmd_buffer->state.pma_fix_enabled == enable) + return; + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DepthCacheFlushEnable = true; + pc.CommandStreamerStallEnable = true; + pc.RenderTargetCacheFlushEnable = true; + } + + uint32_t cache_mode; + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1), + .NPPMAFixEnable = enable, + .NPEarlyZFailsDisable = enable, + .NPPMAFixEnableMask = true, + .NPEarlyZFailsDisableMask = true); + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(CACHE_MODE_1_num); + lri.DataDWord = cache_mode; + } + + /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache + * Flush bits is often necessary. We do it regardless because it's easier. + * The render cache flush is also necessary if stencil writes are enabled. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DepthStallEnable = true; + pc.DepthCacheFlushEnable = true; + pc.RenderTargetCacheFlushEnable = true; + } + + cmd_buffer->state.pma_fix_enabled = enable; +#endif /* GEN_GEN == 8 */ +} + +static inline bool +want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer) +{ + assert(GEN_GEN == 8); + + /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE: + * + * SW must set this bit in order to enable this fix when following + * expression is TRUE. + * + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) && + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) && + * (3DSTATE_DEPTH_BUFFER::HIZ Enable) && + * !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) && + * (3DSTATE_PS_EXTRA::PixelShaderValid) && + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) && + * (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) && + * (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) && + * 3DSTATE_WM::ForceKillPix != ForceOff && + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable && + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) || + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE && + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) || + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF)) + */ + + /* These are always true: + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) + */ + + /* We only enable the PMA fix if we know for certain that HiZ is enabled. + * If we don't know whether HiZ is enabled or not, we disable the PMA fix + * and there is no harm. + * + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable + */ + if (!cmd_buffer->state.hiz_enabled) + return false; + + /* 3DSTATE_PS_EXTRA::PixelShaderValid */ + struct anv_pipeline *pipeline = cmd_buffer->state.pipeline; + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) + return false; + + /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + if (wm_prog_data->early_fragment_tests) + return false; + + /* We never use anv_pipeline for HiZ ops so this is trivially true: + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) + */ + + /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */ + if (!pipeline->depth_test_enable) + return false; + + /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) && + * 3DSTATE_WM::ForceKillPix != ForceOff && + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable && + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) || + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE && + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) || + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF)) + */ + return (pipeline->kill_pixel && (pipeline->writes_depth || + pipeline->writes_stencil)) || + wm_prog_data->computed_depth_mode != PSCDEPTH_OFF; +} + void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer) { @@ -211,6 +338,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer) } if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_RENDER_TARGETS | ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK | ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK)) { uint32_t wm_depth_stencil_dw[GENX(3DSTATE_WM_DEPTH_STENCIL_length)]; @@ -234,6 +362,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer) anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw, pipeline->gen8.wm_depth_stencil); + + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, + want_depth_pma_fix(cmd_buffer)); } #else if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) { diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index 663e6c92819..6f0b0638973 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -154,6 +154,11 @@ genX(blorp_exec)(struct blorp_batch *batch, genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer); + /* BLORP doesn't do anything fancy with depth such as discards, so we want + * the PMA fix off. Also, off is always the safe option. + */ + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false); + blorp_exec(batch, params); cmd_buffer->state.vb_dirty = ~0; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 14338b22ece..40a72f4d141 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -637,6 +637,11 @@ genX(EndCommandBuffer)( { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + /* We want every command buffer to start with the PMA fix in a known state, + * so we disable it at the end of the command buffer. + */ + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); anv_cmd_buffer_end_batch_buffer(cmd_buffer); @@ -654,6 +659,11 @@ genX(CmdExecuteCommands)( assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); + /* The secondary command buffers will assume that the PMA fix is disabled + * when they begin executing. Make sure this is true. + */ + genX(cmd_buffer_enable_pma_fix)(primary, false); + for (uint32_t i = 0; i < commandBufferCount; i++) { ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]); @@ -2227,7 +2237,8 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) const bool has_stencil = image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT); - /* FIXME: Implement the PMA stall W/A */ + cmd_buffer->state.hiz_enabled = has_hiz; + /* FIXME: Width and Height are wrong */ genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer); @@ -2465,6 +2476,8 @@ void genX(CmdEndRenderPass)( anv_cmd_buffer_resolve_subpass(cmd_buffer); + cmd_buffer->state.hiz_enabled = false; + #ifndef NDEBUG anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer); #endif diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index f641661db1b..d0dbe13f393 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -455,6 +455,10 @@ emit_rs_state(struct anv_pipeline *pipeline, */ #if GEN_GEN >= 8 raster.DXMultisampleRasterizationEnable = true; + /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix + * computations. If we ever set this bit to a different value, they will + * need to be updated accordingly. + */ raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0; raster.ForceMultisampling = false; #else @@ -664,6 +668,8 @@ emit_ds_state(struct anv_pipeline *pipeline, * to make sure it's initialized to something useful. */ pipeline->writes_stencil = false; + pipeline->writes_depth = false; + pipeline->depth_test_enable = false; memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw)); return; } @@ -722,6 +728,9 @@ emit_ds_state(struct anv_pipeline *pipeline, if (info->depthTestEnable && info->depthCompareOp == VK_COMPARE_OP_EQUAL) depth_stencil.DepthBufferWriteEnable = false; + pipeline->writes_depth = depth_stencil.DepthBufferWriteEnable; + pipeline->depth_test_enable = depth_stencil.DepthTestEnable; + #if GEN_GEN <= 7 GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil); #else @@ -1443,6 +1452,38 @@ emit_3dstate_vf_topology(struct anv_pipeline *pipeline) } #endif +static void +compute_kill_pixel(struct anv_pipeline *pipeline, + const VkPipelineMultisampleStateCreateInfo *ms_info, + const struct anv_subpass *subpass) +{ + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + pipeline->kill_pixel = false; + return; + } + + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + + /* This computes the KillPixel portion of the computation for whether or + * not we want to enable the PMA fix on gen8. It's given by this chunk of + * the giant formula: + * + * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) + * + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is + * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept + * of an alpha test. + */ + pipeline->kill_pixel = + subpass->has_ds_self_dep || wm_prog_data->uses_kill || + wm_prog_data->uses_omask || + (ms_info && ms_info->alphaToCoverageEnable); +} + static VkResult genX(graphics_pipeline_create)( VkDevice _device, @@ -1480,6 +1521,7 @@ genX(graphics_pipeline_create)( emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass); emit_cb_state(pipeline, pCreateInfo->pColorBlendState, pCreateInfo->pMultisampleState); + compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState, subpass); emit_urb_setup(pipeline); -- 2.30.2