From: Connor Abbott
Date: Mon, 20 Jul 2020 10:14:41 +0000 (+0200)
Subject: tu: Implement VK_EXT_conditional_rendering
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=ee2c58dde46a7c673cd19cdd433af5c13b5e2ae1;p=mesa.git

tu: Implement VK_EXT_conditional_rendering

Part-of:
---

diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c
index fe045c581cd..4f27035df43 100644
--- a/src/freedreno/vulkan/tu_clear_blit.c
+++ b/src/freedreno/vulkan/tu_clear_blit.c
@@ -327,6 +327,13 @@ r2d_setup(struct tu_cmd_buffer *cmd,
    r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, ubwc, false);
 }
 
+static void
+r2d_teardown(struct tu_cmd_buffer *cmd,
+             struct tu_cs *cs)
+{
+   /* nothing to do here */
+}
+
 static void
 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
@@ -803,6 +810,11 @@ r3d_setup(struct tu_cmd_buffer *cmd,
                      .component_enable = aspect_write_mask(vk_format, aspect_mask)));
    tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
    tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
+
+   if (cmd->state.predication_active) {
+      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
+      tu_cs_emit(cs, 0);
+   }
 }
 
 static void
@@ -816,6 +828,15 @@ r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_emit(cs, 2); /* vertex count */
 }
 
+static void
+r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+   if (cmd->state.predication_active) {
+      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
+      tu_cs_emit(cs, 1);
+   }
+}
+
 /* blit ops - common interface for 2d/shader paths */
 
 struct blit_ops {
@@ -844,6 +865,8 @@ struct blit_ops {
                  bool clear,
                  bool ubwc);
    void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
+   void (*teardown)(struct tu_cmd_buffer *cmd,
+                    struct tu_cs *cs);
 };
 
 static const struct blit_ops r2d_ops = {
@@ -855,6 +878,7 @@ static const struct blit_ops r2d_ops = {
    .dst_buffer = r2d_dst_buffer,
    .setup = r2d_setup,
    .run = r2d_run,
+   .teardown = r2d_teardown,
 };
 
 static const struct blit_ops r3d_ops = {
@@ -866,6 +890,7 @@ static const struct blit_ops r3d_ops = {
    .dst_buffer = r3d_dst_buffer,
    .setup = r3d_setup,
    .run = r3d_run,
+   .teardown = r3d_teardown,
 };
 
 /* passthrough set coords from 3D extents */
@@ -1061,6 +1086,8 @@ tu6_blit_image(struct tu_cmd_buffer *cmd,
       ops->src(cmd, cs, &src, i, filter);
       ops->run(cmd, cs);
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1170,6 +1197,8 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
          ops->run(cmd, cs);
       }
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1243,6 +1272,8 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
          ops->run(cmd, cs);
       }
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1464,6 +1495,8 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
          ops->run(cmd, cs);
       }
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1514,6 +1547,8 @@ copy_buffer(struct tu_cmd_buffer *cmd,
       dst_va += width * block_size;
       blocks -= width;
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1595,6 +1630,8 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
       dst_va += width * 4;
      blocks -= width;
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1637,6 +1674,8 @@ tu_CmdResolveImage(VkCommandBuffer commandBuffer,
         ops->run(cmd, cs);
      }
   }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1663,6 +1702,8 @@ tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
       ops->dst(cs, dst, i);
       ops->run(cmd, cs);
    }
+
+   ops->teardown(cmd, cs);
 }
 
 static void
@@ -1714,6 +1755,8 @@ clear_image(struct tu_cmd_buffer *cmd,
         ops->run(cmd, cs);
      }
   }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -2050,6 +2093,22 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
     */
    tu_emit_cache_flush_renderpass(cmd, cs);
 
+   /* vkCmdClearAttachments is supposed to respect the predicate if active.
+    * The easiest way to do this is to always use the 3d path, which always
+    * works even with GMEM because it's just a simple draw using the existing
+    * attachment state. However it seems that IGNORE_VISIBILITY draws must be
+    * skipped in the binning pass, since otherwise they produce binning data
+    * which isn't consumed and leads to the wrong binning data being read, so
+    * condition on GMEM | SYSMEM.
+    */
+   if (cmd->state.predication_active) {
+      tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
+                             CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
+      tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
+      tu_cond_exec_end(cs);
+      return;
+   }
+
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
    tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
    tu_cond_exec_end(cs);
@@ -2089,6 +2148,8 @@ clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
       }
       ops->run(cmd, cs);
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
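The teardown hook above exists because the r3d path implements copies, fills and resolves with internal draws, while the extension exempts transfer commands from the predicate; r3d_setup therefore pauses local predication and r3d_teardown resumes it. A minimal application-side sketch of the semantics this bracketing preserves (all handles and the 4-byte copy region are illustrative assumptions, not taken from this patch):

    #include <vulkan/vulkan.h>

    /* Sketch only: handles are assumed valid and created elsewhere. */
    static void
    record_copy_under_predicate(VkCommandBuffer cmd, VkBuffer predicate,
                                VkBuffer src, VkBuffer dst)
    {
       const VkConditionalRenderingBeginInfoEXT info = {
          .sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT,
          .buffer = predicate,   /* 32-bit predicate value lives here */
          .offset = 0,
       };
       const VkBufferCopy region = { .size = 4 };

       vkCmdBeginConditionalRenderingEXT(cmd, &info);
       /* Transfer commands are not predicated, so this copy must execute
        * even when *predicate == 0, which is why the driver pauses
        * CP_DRAW_PRED_ENABLE_LOCAL around its internal 3d blits above. */
       vkCmdCopyBuffer(cmd, src, dst, 1, &region);
       vkCmdEndConditionalRenderingEXT(cmd);
    }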
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 24b0de0085a..6d76f00f9e0 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -31,6 +31,7 @@
 #include "adreno_common.xml.h"
 
 #include "vk_format.h"
+#include "vk_util.h"
 
 #include "tu_cs.h"
 
@@ -568,6 +569,29 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
    if (cmd->state.xfb_used)
       return true;
 
+   /* Some devices have a newer a630_sqe.fw in which, only in CP_DRAW_INDX and
+    * CP_DRAW_INDX_OFFSET, visibility-based skipping happens *before*
+    * predication-based skipping. It seems this breaks predication, because
+    * draws skipped by predication will not be executed in the binning phase,
+    * and therefore won't have an entry in the draw stream, but the
+    * visibility-based skipping will expect it to have an entry. The result is
+    * a GPU hang when actually executing the first non-predicated draw.
+    * However, it seems that things still work if the whole renderpass is
+    * predicated. Affected tests are
+    * dEQP-VK.conditional_rendering.draw_clear.draw.case_2 as well as a few
+    * other case_N.
+    *
+    * Broken FW version: 016ee181
+    * linux-firmware (working) FW version: 016ee176
+    *
+    * All known a650_sqe.fw versions don't have this bug.
+    *
+    * TODO: we should do version detection of the FW so that devices using the
+    * linux-firmware version of a630_sqe.fw don't need this workaround.
+    */
+   if (cmd->state.has_subpass_predication &&
+       cmd->device->physical_device->gpu_id != 650)
+      return false;
+
    if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
       return false;
 
@@ -583,6 +607,13 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd)
    if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
       return true;
 
+   /* If hw binning is required because of XFB but doesn't work because of the
+    * conditional rendering bug, fall back to sysmem.
+    */
+   if (cmd->state.xfb_used && cmd->state.has_subpass_predication &&
+       cmd->device->physical_device->gpu_id != 650)
+      return true;
+
    /* can't fit attachments into gmem */
    if (!cmd->state.pass->gmem_pixels)
       return true;
 
@@ -1591,8 +1622,21 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
          break;
       }
    } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+      assert(pBeginInfo->pInheritanceInfo);
+
+      vk_foreach_struct(ext, pBeginInfo->pInheritanceInfo) {
+         switch (ext->sType) {
+         case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
+            const VkCommandBufferInheritanceConditionalRenderingInfoEXT *cond_rend = (void *) ext;
+            cmd_buffer->state.predication_active = cond_rend->conditionalRenderingEnable;
+            break;
+         default:
+            break;
+         }
+         }
+      }
+
       if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
-         assert(pBeginInfo->pInheritanceInfo);
          cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
          cmd_buffer->state.subpass =
             &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
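The inheritance-info walk in the hunk above is what makes inheritedConditionalRendering work: a secondary command buffer opts in by chaining VkCommandBufferInheritanceConditionalRenderingInfoEXT. A hedged fragment of the application side that loop is looking for (the secondary handle is an assumption):

    #include <vulkan/vulkan.h>

    /* Sketch only: opting a secondary command buffer into an inherited
     * predicate; this is the struct the vk_foreach_struct loop parses. */
    static VkResult
    begin_secondary_with_inherited_predicate(VkCommandBuffer secondary)
    {
       const VkCommandBufferInheritanceConditionalRenderingInfoEXT cond = {
          .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT,
          .conditionalRenderingEnable = VK_TRUE,
       };
       const VkCommandBufferInheritanceInfo inherit = {
          .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
          .pNext = &cond,
       };
       const VkCommandBufferBeginInfo begin = {
          .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
          .pInheritanceInfo = &inherit,
       };
       return vkBeginCommandBuffer(secondary, &begin);
    }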
@@ -2356,10 +2400,19 @@ vk2tu_access(VkAccessFlags flags, bool gmem)
     *
     * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
     * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.
+    *
+    * Currently we read the draw predicate using CP_MEM_TO_MEM, which
+    * also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not*
+    * implicitly do CP_WAIT_FOR_ME; it seems to only wait for counters to
+    * complete since it's written for DX11 where you can only predicate on the
+    * result of a query object. So if we implement 64-bit comparisons in the
+    * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit
+    * comparisons, then this will have to be dealt with.
     */
    if (flags &
        (VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
         VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT |
+        VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT |
         VK_ACCESS_MEMORY_READ_BIT)) {
       mask |= TU_ACCESS_WFI_READ;
    }
@@ -2531,6 +2584,8 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
 
          if (secondary->state.has_tess)
             cmd->state.has_tess = true;
+         if (secondary->state.has_subpass_predication)
+            cmd->state.has_subpass_predication = true;
       } else {
          assert(tu_cs_is_empty(&secondary->draw_cs));
          assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
@@ -3671,6 +3726,7 @@ tu_CmdEndRenderPass(VkCommandBuffer commandBuffer)
    cmd_buffer->state.subpass = NULL;
    cmd_buffer->state.framebuffer = NULL;
    cmd_buffer->state.has_tess = false;
+   cmd_buffer->state.has_subpass_predication = false;
 }
 
 void
@@ -3870,3 +3926,64 @@ tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
 {
    /* No-op */
 }
+
+
+void
+tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
+                                   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
+{
+   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+
+   cmd->state.predication_active = true;
+   if (cmd->state.pass)
+      cmd->state.has_subpass_predication = true;
+
+   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
+
+   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
+   tu_cs_emit(cs, 1);
+
+   /* Wait for any writes to the predicate to land */
+   if (cmd->state.pass)
+      tu_emit_cache_flush_renderpass(cmd, cs);
+   else
+      tu_emit_cache_flush(cmd, cs);
+
+   TU_FROM_HANDLE(tu_buffer, buf, pConditionalRenderingBegin->buffer);
+   uint64_t iova = tu_buffer_iova(buf) + pConditionalRenderingBegin->offset;
+
+   /* qcom doesn't support 32-bit reference values, only 64-bit, but Vulkan
+    * mandates 32-bit comparisons. Our workaround is to copy the reference
+    * value to the low 32 bits of a location where the high 32 bits are known
+    * to be 0 and then compare that.
+    */
+   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
+   tu_cs_emit(cs, 0);
+   tu_cs_emit_qw(cs, global_iova(cmd, predicate));
+   tu_cs_emit_qw(cs, iova);
+
+   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+
+   bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
+   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
+   tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
+                  CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
+   tu_cs_emit_qw(cs, global_iova(cmd, predicate));
+
+   tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
+}
+
+void
+tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
+{
+   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+
+   cmd->state.predication_active = false;
+
+   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
+
+   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
+   tu_cs_emit(cs, 0);
+}
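tu_CmdBeginConditionalRenderingEXT above works around the 64-bit-only CP_DRAW_PRED_SET comparison by copying the application's 32-bit value into the low half of the zero-initialized 64-bit predicate slot in the global BO. A tiny standalone C program, purely illustrative, showing why the resulting 64-bit zero test agrees with the 32-bit semantics Vulkan mandates:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       uint32_t app_value = 7;  /* application's 32-bit predicate value */
       uint64_t slot = 0;       /* global-BO slot, high 32 bits known zero */

       /* What the CP_MEM_TO_MEM above effectively does: a 32-bit copy into
        * the low half of the slot. */
       slot = (uint64_t) app_value;

       /* CP_DRAW_PRED_SET's 64-bit NE_0/EQ_0 test on the slot now matches
        * the required 32-bit test on app_value. */
       printf("draw passes: %s\n", slot != 0 ? "yes" : "no");
       return 0;
    }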
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index 14e24078316..8540f8dad6b 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -793,8 +793,8 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: {
          VkPhysicalDeviceConditionalRenderingFeaturesEXT *features =
             (VkPhysicalDeviceConditionalRenderingFeaturesEXT *) ext;
-         features->conditionalRendering = false;
-         features->inheritedConditionalRendering = false;
+         features->conditionalRendering = true;
+         features->inheritedConditionalRendering = true;
          break;
       }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: {
@@ -1354,8 +1354,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    if (result != VK_SUCCESS)
       goto fail_global_bo_map;
 
-   memcpy(device->global_bo.map + gb_offset(border_color), border_color, sizeof(border_color));
-   tu_init_clear_blit_shaders(device->global_bo.map);
+   struct tu6_global *global = device->global_bo.map;
+   memcpy(global->border_color, border_color, sizeof(border_color));
+   global->predicate = 0;
+   tu_init_clear_blit_shaders(global);
 
    VkPipelineCacheCreateInfo ci;
    ci.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
diff --git a/src/freedreno/vulkan/tu_extensions.py b/src/freedreno/vulkan/tu_extensions.py
index 9b84e14fc00..50afba18920 100644
--- a/src/freedreno/vulkan/tu_extensions.py
+++ b/src/freedreno/vulkan/tu_extensions.py
@@ -90,6 +90,7 @@ EXTENSIONS = [
     Extension('VK_EXT_depth_clip_enable', 1, True),
     Extension('VK_KHR_draw_indirect_count', 1, True),
     Extension('VK_EXT_4444_formats', 1, True),
+    Extension('VK_EXT_conditional_rendering', 1, True),
 ]
 
 MAX_API_VERSION = VkVersion(MAX_API_VERSION)
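With the feature bits flipped to true and the extension added to the table above, an application discovers it through the usual VkPhysicalDeviceFeatures2 chain. A hedged fragment (device-creation boilerplate elided; the physical-device handle is an assumption):

    #include <vulkan/vulkan.h>

    /* Sketch only: after this patch, both feature bits come back VK_TRUE
     * on turnip. */
    static void
    query_conditional_rendering(VkPhysicalDevice pdev)
    {
       VkPhysicalDeviceConditionalRenderingFeaturesEXT cond = {
          .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT,
       };
       VkPhysicalDeviceFeatures2 features = {
          .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
          .pNext = &cond,
       };
       vkGetPhysicalDeviceFeatures2(pdev, &features);
       /* The same cond struct can then be chained into
        * VkDeviceCreateInfo::pNext to enable the features. */
    }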
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index b5f76e7319d..9cd15f780cd 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -368,7 +368,8 @@ struct tu6_global
    volatile uint32_t vsc_draw_overflow;
    uint32_t _pad1;
    volatile uint32_t vsc_prim_overflow;
-   uint32_t _pad2[3];
+   uint32_t _pad2;
+   uint64_t predicate;
 
    /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
    struct {
@@ -923,6 +924,8 @@ struct tu_cmd_state
 
    bool xfb_used;
    bool has_tess;
+   bool has_subpass_predication;
+   bool predication_active;
 };
 
 struct tu_cmd_pool
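The tu6_global change carves the predicate slot out of existing padding, so the footprint and the 32-byte alignment of the following VPC_SO scratch space are unchanged. A standalone sketch of just that slice (offsets illustrative, with the slice assumed to start on an 8-byte boundary):

    #include <stdint.h>

    /* Sketch only: mirrors the members changed above to show the size is
     * preserved, since one pad word plus a uint64_t replaces three pad
     * words. */
    struct global_slice {
       volatile uint32_t vsc_draw_overflow;
       uint32_t _pad1;
       volatile uint32_t vsc_prim_overflow;
       uint32_t _pad2;
       uint64_t predicate;  /* low 32 bits receive the copied reference value */
    };

    _Static_assert(sizeof(struct global_slice) == 24,
                   "same footprint as the old three pad words");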