From: Connor Abbott Date: Thu, 2 Apr 2020 15:48:19 +0000 (+0200) Subject: tu: Rewrite flushing to use barriers X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=487aa807bd1b70602fcb6fbdabd101d4cff7c07b;p=mesa.git tu: Rewrite flushing to use barriers Replace the various ad-hoc flushes that we've inserted, copied from freedreno, etc. with a unified system that uses the user-supplied information via vkCmdPipelineBarrier() and subpass dependencies. There are a few notable differences in behavior: - We now move setting RB_CCU_CNTL up a little in the gmem case, but hopefully that won't matter too much. This matches what the Vulkan blob does. - We properly implement delayed setting of events, completing our implementaton of events. - Finally, of course, we should be a lot less flush-happy. We won't emit useless CCU/cache flushes with multiple copies, renderpasses, etc. that don't depend on each other, and also won't flush/invalidate the cache around renderpasses unless we actually need to. Part-of: --- diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c index e6f23b07ed4..373dbe13ef1 100644 --- a/src/freedreno/vulkan/tu_clear_blit.c +++ b/src/freedreno/vulkan/tu_clear_blit.c @@ -468,18 +468,7 @@ r2d_setup(struct tu_cmd_buffer *cmd, bool clear, uint8_t mask) { - const struct tu_physical_device *phys_dev = cmd->device->physical_device; - - /* TODO: flushing with barriers instead of blindly always flushing */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH); - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); - - tu_cs_emit_wfi(cs); - tu_cs_emit_regs(cs, - A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass)); + tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false); } @@ -489,11 +478,6 @@ r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { tu_cs_emit_pkt7(cs, CP_BLIT, 1); tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); - - /* TODO: flushing with barriers instead of blindly always flushing */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); } /* r3d_ = shader path operations */ @@ -912,21 +896,11 @@ r3d_setup(struct tu_cmd_buffer *cmd, bool clear, uint8_t mask) { - const struct tu_physical_device *phys_dev = cmd->device->physical_device; - if (!cmd->state.pass) { - /* TODO: flushing with barriers instead of blindly always flushing */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH); - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); - - tu_cs_emit_regs(cs, - A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass)); - + tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff); } + tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000)); tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000)); @@ -979,13 +953,6 @@ r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY)); tu_cs_emit(cs, 1); /* instance count */ tu_cs_emit(cs, 2); /* vertex count */ - - if (!cmd->state.pass) { - /* TODO: flushing with barriers instead of blindly 
always flushing */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); - } } /* blit ops - common interface for 2d/shader paths */ @@ -1941,15 +1908,38 @@ tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd, struct tu_cs *cs = &cmd->draw_cs; for (uint32_t j = 0; j < attachment_count; j++) { + /* The vulkan spec, section 17.2 "Clearing Images Inside a Render + * Pass Instance" says that: + * + * Unlike other clear commands, vkCmdClearAttachments executes as + * a drawing command, rather than a transfer command, with writes + * performed by it executing in rasterization order. Clears to + * color attachments are executed as color attachment writes, by + * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage. + * Clears to depth/stencil attachments are executed as depth + * writes and writes by the + * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and + * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages. + * + * However, the 2d path here is executed the same way as a + * transfer command, using the CCU color cache exclusively with + * a special depth-as-color format for depth clears. This means that + * we can't rely on the normal pipeline barrier mechanism here, and + * have to manually flush whenever using a different cache domain + * from what the 3d path would've used. This happens when we clear + * depth/stencil, since normally depth attachments use CCU depth, but + * we clear it using a special depth-as-color format. Since the clear + * potentially uses a different attachment state we also need to + * invalidate color beforehand and flush it afterwards. + */ + uint32_t a; if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { a = subpass->color_attachments[attachments[j].colorAttachment].attachment; + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); } else { a = subpass->depth_stencil_attachment.attachment; - - /* sync depth into color */ tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - /* also flush color to avoid losing contents from invalidate */ tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); } @@ -1971,6 +1961,9 @@ tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd, ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask); ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue); + /* Wait for the flushes we triggered manually to complete */ + tu_cs_emit_wfi(cs); + for (uint32_t i = 0; i < rect_count; i++) { ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent); for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) { @@ -1980,11 +1973,8 @@ tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd, } if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { - /* does not use CCU - flush - * note: cache invalidate might be needed to, and just not covered by test cases - */ - if (attachments[j].colorAttachment > 0) - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); + tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); } else { /* sync color into depth */ tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); @@ -2313,10 +2303,30 @@ tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent); ops->clear_value(cs, attachment->format, &info->pClearValues[a]); + /* Wait for any flushes at the beginning of the 
renderpass to complete */ + tu_cs_emit_wfi(cs); + for (uint32_t i = 0; i < fb->layers; i++) { ops->dst(cs, iview, i); ops->run(cmd, cs); } + + /* The spec doesn't explicitly say, but presumably the initial renderpass + * clear is considered part of the renderpass, and therefore barriers + * aren't required inside the subpass/renderpass. Therefore we need to + * flush CCU color into CCU depth here, just like with + * vkCmdClearAttachments(). Note that because this only happens at the + * beginning of a renderpass, and renderpass writes are considered + * "incoherent", we shouldn't have to worry about syncing depth into color + * beforehand as depth should already be flushed. + */ + if (vk_format_is_depth_or_stencil(attachment->format)) { + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); + tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH); + } else { + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); + tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); + } } void @@ -2488,14 +2498,18 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, A6XX_SP_PS_2D_SRC_HI(), A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp)); - /* sync GMEM writes with CACHE */ + /* sync GMEM writes with CACHE. */ tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + /* Wait for CACHE_INVALIDATE to land */ + tu_cs_emit_wfi(cs); + tu_cs_emit_pkt7(cs, CP_BLIT, 1); tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); - /* TODO: flushing with barriers instead of blindly always flushing */ + /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to + * sysmem, and we generally assume that GMEM renderpasses leave their + * results in sysmem, so we need to flush manually here. + */ tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); } diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 2bc625f5929..02cae98e3e9 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -344,11 +344,102 @@ tu6_emit_event_write(struct tu_cmd_buffer *cmd, } static void -tu6_emit_wfi(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer, + struct tu_cs *cs, + enum tu_cmd_flush_bits flushes) { - if (cmd->wait_for_idle) { + /* Experiments show that invalidating CCU while it still has data in it + * doesn't work, so make sure to always flush before invalidating in case + * any data remains that hasn't yet been made available through a barrier. + * However it does seem to work for UCHE. 
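+    * Hence the checks below emit a CCU flush whenever the corresponding CCU
+    * invalidate bit is set, even if no flush was explicitly requested, while
+    * CACHE_INVALIDATE is emitted on its own.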
+ */ + if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR | + TU_CMD_FLAG_CCU_INVALIDATE_COLOR)) + tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS); + if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH | + TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)) + tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS); + if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR) + tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR); + if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH) + tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH); + if (flushes & TU_CMD_FLAG_CACHE_FLUSH) + tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS); + if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE) + tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE); + if (flushes & TU_CMD_FLAG_WFI) tu_cs_emit_wfi(cs); - cmd->wait_for_idle = false; +} + +/* "Normal" cache flushes, that don't require any special handling */ + +static void +tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer, + struct tu_cs *cs) +{ + tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits); + cmd_buffer->state.cache.flush_bits = 0; +} + +/* Renderpass cache flushes */ + +static void +tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer, + struct tu_cs *cs) +{ + tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits); + cmd_buffer->state.renderpass_cache.flush_bits = 0; +} + +/* Cache flushes for things that use the color/depth read/write path (i.e. + * blits and draws). This deals with changing CCU state as well as the usual + * cache flushing. + */ + +void +tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, + struct tu_cs *cs, + enum tu_cmd_ccu_state ccu_state) +{ + enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits; + + assert(ccu_state != TU_CMD_CCU_UNKNOWN); + + /* Changing CCU state must involve invalidating the CCU. In sysmem mode, + * the CCU may also contain data that we haven't flushed out yet, so we + * also need to flush. Also, in order to program RB_CCU_CNTL, we need to + * emit a WFI as it isn't pipelined. + */ + if (ccu_state != cmd_buffer->state.ccu_state) { + if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) { + flushes |= + TU_CMD_FLAG_CCU_FLUSH_COLOR | + TU_CMD_FLAG_CCU_FLUSH_DEPTH; + cmd_buffer->state.cache.pending_flush_bits &= ~( + TU_CMD_FLAG_CCU_FLUSH_COLOR | + TU_CMD_FLAG_CCU_FLUSH_DEPTH); + } + flushes |= + TU_CMD_FLAG_CCU_INVALIDATE_COLOR | + TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | + TU_CMD_FLAG_WFI; + cmd_buffer->state.cache.pending_flush_bits &= ~( + TU_CMD_FLAG_CCU_INVALIDATE_COLOR | + TU_CMD_FLAG_CCU_INVALIDATE_DEPTH); + } + + tu6_emit_flushes(cmd_buffer, cs, flushes); + cmd_buffer->state.cache.flush_bits = 0; + + if (ccu_state != cmd_buffer->state.ccu_state) { + struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device; + tu_cs_emit_regs(cs, + A6XX_RB_CCU_CNTL(.offset = + ccu_state == TU_CMD_CCU_GMEM ? 
+ phys_dev->ccu_offset_gmem : + phys_dev->ccu_offset_bypass, + .gmem = ccu_state == TU_CMD_CCU_GMEM)); + cmd_buffer->state.ccu_state = ccu_state; } } @@ -704,6 +795,49 @@ tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd, tu_resolve_sysmem(cmd, cs, src, dst, fb->layers, &cmd->state.tiling_config.render_area); } +static void +tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + const struct tu_subpass *subpass) +{ + if (subpass->resolve_attachments) { + /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass + * Commands": + * + * End-of-subpass multisample resolves are treated as color + * attachment writes for the purposes of synchronization. That is, + * they are considered to execute in the + * VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT pipeline stage and + * their writes are synchronized with + * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between + * rendering within a subpass and any resolve operations at the end + * of the subpass occurs automatically, without need for explicit + * dependencies or pipeline barriers. However, if the resolve + * attachment is also used in a different subpass, an explicit + * dependency is needed. + * + * We use the CP_BLIT path for sysmem resolves, which is really a + * transfer command, so we have to manually flush similar to the gmem + * resolve case. However, a flush afterwards isn't needed because of the + * last sentence and the fact that we're in sysmem mode. + */ + tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + + /* Wait for the flushes to land before using the 2D engine */ + tu_cs_emit_wfi(cs); + + for (unsigned i = 0; i < subpass->color_count; i++) { + uint32_t a = subpass->resolve_attachments[i].attachment; + if (a == VK_ATTACHMENT_UNUSED) + continue; + + tu6_emit_sysmem_resolve(cmd, cs, a, + subpass->color_attachments[i].attachment); + } + } +} + static void tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { @@ -758,6 +892,7 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_regs(cs, A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass)); + cmd->state.ccu_state = TU_CMD_CCU_SYSMEM; tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000); tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE04, 0x8); tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0); @@ -1073,8 +1208,6 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); tu_cs_emit(cs, 0x0); - - cmd->wait_for_idle = false; } static void @@ -1109,7 +1242,6 @@ static void tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct VkRect2D *renderArea) { - const struct tu_physical_device *phys_dev = cmd->device->physical_device; const struct tu_framebuffer *fb = cmd->state.framebuffer; assert(fb->width > 0 && fb->height > 0); @@ -1126,13 +1258,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0x0); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH); - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); - - tu6_emit_wfi(cmd, cs); - tu_cs_emit_regs(cs, - A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass)); + tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); /* enable stream-out, with sysmem there is only one pass: */ tu_cs_emit_regs(cs, @@ -1153,15 +1279,7 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct 
tu_cs *cs) /* Do any resolves of the last subpass. These are handled in the * tile_store_ib in the gmem path. */ - const struct tu_subpass *subpass = cmd->state.subpass; - if (subpass->resolve_attachments) { - for (unsigned i = 0; i < subpass->color_count; i++) { - uint32_t a = subpass->resolve_attachments[i].attachment; - if (a != VK_ATTACHMENT_UNUSED) - tu6_emit_sysmem_resolve(cmd, cs, a, - subpass->color_attachments[i].attachment); - } - } + tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass); tu_cs_emit_call(cs, &cmd->draw_epilogue_cs); @@ -1170,9 +1288,6 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu6_emit_event_write(cmd, cs, LRZ_FLUSH); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - tu_cs_sanity_check(cs); } @@ -1186,20 +1301,10 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) /* lrz clear? */ - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); - tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0x0); - /* TODO: flushing with barriers instead of blindly always flushing */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH); - - tu_cs_emit_wfi(cs); - tu_cs_emit_regs(cs, - A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_gmem, .gmem = 1)); + tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM); const struct tu_tiling_config *tiling = &cmd->state.tiling_config; if (use_hw_binning(cmd)) { @@ -1253,7 +1358,6 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, tu6_emit_tile_select(cmd, cs, tile); tu_cs_emit_call(cs, &cmd->draw_cs); - cmd->wait_for_idle = true; if (use_hw_binning(cmd)) { tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); @@ -1318,7 +1422,6 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd) tu6_sysmem_render_begin(cmd, &cmd->cs, &tiling->render_area); tu_cs_emit_call(&cmd->cs, &cmd->draw_cs); - cmd->wait_for_idle = true; tu6_sysmem_render_end(cmd, &cmd->cs); } @@ -1576,8 +1679,6 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer) static VkResult tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer) { - cmd_buffer->wait_for_idle = true; - cmd_buffer->record_result = VK_SUCCESS; tu_bo_list_reset(&cmd_buffer->bo_list); @@ -1677,6 +1778,16 @@ tu_ResetCommandBuffer(VkCommandBuffer commandBuffer, return tu_reset_cmd_buffer(cmd_buffer); } +/* Initialize the cache, assuming all necessary flushes have happened but *not* + * invalidations. 
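+ * Concretely: flush_bits starts out empty while pending_flush_bits has all
+ * invalidate bits set, so the next barrier that reads or writes through a
+ * given cache domain will invalidate that cache before it is used.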
+ */ +static void +tu_cache_init(struct tu_cache_state *cache) +{ + cache->flush_bits = 0; + cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE; +} + VkResult tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo) @@ -1694,6 +1805,8 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, } memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); + tu_cache_init(&cmd_buffer->state.cache); + tu_cache_init(&cmd_buffer->state.renderpass_cache); cmd_buffer->usage_flags = pBeginInfo->flags; tu_cs_begin(&cmd_buffer->cs); @@ -1709,11 +1822,18 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, default: break; } - } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && - (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) { - assert(pBeginInfo->pInheritanceInfo); - cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); - cmd_buffer->state.subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; + } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { + if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { + assert(pBeginInfo->pInheritanceInfo); + cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); + cmd_buffer->state.subpass = + &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; + } else { + /* When executing in the middle of another command buffer, the CCU + * state is unknown. + */ + cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN; + } } cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING; @@ -1921,11 +2041,44 @@ tu_CmdPushConstants(VkCommandBuffer commandBuffer, cmd->state.dirty |= TU_CMD_DIRTY_PUSH_CONSTANTS; } +/* Flush everything which has been made available but we haven't actually + * flushed yet. + */ +static void +tu_flush_all_pending(struct tu_cache_state *cache) +{ + cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH; + cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH; +} + VkResult tu_EndCommandBuffer(VkCommandBuffer commandBuffer) { TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + /* We currently flush CCU at the end of the command buffer, like + * what the blob does. There's implicit synchronization around every + * vkQueueSubmit, but the kernel only flushes the UCHE, and we don't + * know yet if this command buffer will be the last in the submit so we + * have to defensively flush everything else. + * + * TODO: We could definitely do better than this, since these flushes + * aren't required by Vulkan, but we'd need kernel support to do that. + * Ideally, we'd like the kernel to flush everything afterwards, so that we + * wouldn't have to do any flushes here, and when submitting multiple + * command buffers there wouldn't be any unnecessary flushes in between. 
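+    * Note that if we're still inside a render pass here (i.e. a secondary
+    * command buffer recorded with RENDER_PASS_CONTINUE), we only flush the
+    * pending renderpass bits into draw_cs and leave the CCU flush to the
+    * primary command buffer that ends outside the render pass.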
+ */ + if (cmd_buffer->state.pass) { + tu_flush_all_pending(&cmd_buffer->state.renderpass_cache); + tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs); + } else { + tu_flush_all_pending(&cmd_buffer->state.cache); + cmd_buffer->state.cache.flush_bits |= + TU_CMD_FLAG_CCU_FLUSH_COLOR | + TU_CMD_FLAG_CCU_FLUSH_DEPTH; + tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs); + } + tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo, MSM_SUBMIT_BO_WRITE); @@ -2128,6 +2281,206 @@ tu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer, tu6_emit_sample_locations(&cmd->draw_cs, pSampleLocationsInfo); } +static void +tu_flush_for_access(struct tu_cache_state *cache, + enum tu_cmd_access_mask src_mask, + enum tu_cmd_access_mask dst_mask) +{ + enum tu_cmd_flush_bits flush_bits = 0; + + if (src_mask & TU_ACCESS_SYSMEM_WRITE) { + cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE; + } + +#define SRC_FLUSH(domain, flush, invalidate) \ + if (src_mask & TU_ACCESS_##domain##_WRITE) { \ + cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \ + (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \ + } + + SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE) + SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) + SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) + +#undef SRC_FLUSH + +#define SRC_INCOHERENT_FLUSH(domain, flush, invalidate) \ + if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) { \ + flush_bits |= TU_CMD_FLAG_##flush; \ + cache->pending_flush_bits |= \ + (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \ + } + + SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) + SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) + +#undef SRC_INCOHERENT_FLUSH + + if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) { + flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH; + } + +#define DST_FLUSH(domain, flush, invalidate) \ + if (dst_mask & (TU_ACCESS_##domain##_READ | \ + TU_ACCESS_##domain##_WRITE)) { \ + flush_bits |= cache->pending_flush_bits & \ + (TU_CMD_FLAG_##invalidate | \ + (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \ + } + + DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE) + DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) + DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) + +#undef DST_FLUSH + +#define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \ + if (dst_mask & (TU_ACCESS_##domain##_READ | \ + TU_ACCESS_##domain##_WRITE)) { \ + flush_bits |= TU_CMD_FLAG_##invalidate | \ + (cache->pending_flush_bits & \ + (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \ + } + + DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) + DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) + +#undef DST_INCOHERENT_FLUSH + + if (dst_mask & TU_ACCESS_WFI_READ) { + flush_bits |= TU_CMD_FLAG_WFI; + } + + cache->flush_bits |= flush_bits; + cache->pending_flush_bits &= ~flush_bits; +} + +static enum tu_cmd_access_mask +vk2tu_access(VkAccessFlags flags, bool gmem) +{ + enum tu_cmd_access_mask mask = 0; + + /* If the GPU writes a buffer that is then read by an indirect draw + * command, we theoretically need a WFI + WAIT_FOR_ME combination to + * wait for the writes to complete. The WAIT_FOR_ME is performed as part + * of the draw by the firmware, so we just need to execute a WFI. 
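+    * VK_ACCESS_MEMORY_READ_BIT is checked here (and in every read case
+    * below) because the spec defines it as equivalent to setting all read
+    * access bits.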
+ */ + if (flags & + (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | + VK_ACCESS_MEMORY_READ_BIT)) { + mask |= TU_ACCESS_WFI_READ; + } + + if (flags & + (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */ + VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP, I think */ + VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */ + VK_ACCESS_HOST_READ_BIT | /* sysmem by definition */ + VK_ACCESS_MEMORY_READ_BIT)) { + mask |= TU_ACCESS_SYSMEM_READ; + } + + if (flags & + (VK_ACCESS_HOST_WRITE_BIT | + VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT | /* Write performed by CP, I think */ + VK_ACCESS_MEMORY_WRITE_BIT)) { + mask |= TU_ACCESS_SYSMEM_WRITE; + } + + if (flags & + (VK_ACCESS_INDEX_READ_BIT | /* Read performed by PC, I think */ + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | /* Read performed by VFD */ + VK_ACCESS_UNIFORM_READ_BIT | /* Read performed by SP */ + /* TODO: Is there a no-cache bit for textures so that we can ignore + * these? + */ + VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | /* Read performed by TP */ + VK_ACCESS_SHADER_READ_BIT | /* Read perfomed by SP/TP */ + VK_ACCESS_MEMORY_READ_BIT)) { + mask |= TU_ACCESS_UCHE_READ; + } + + if (flags & + (VK_ACCESS_SHADER_WRITE_BIT | /* Write performed by SP */ + VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | /* Write performed by VPC */ + VK_ACCESS_MEMORY_WRITE_BIT)) { + mask |= TU_ACCESS_UCHE_WRITE; + } + + /* When using GMEM, the CCU is always flushed automatically to GMEM, and + * then GMEM is flushed to sysmem. Furthermore, we already had to flush any + * previous writes in sysmem mode when transitioning to GMEM. Therefore we + * can ignore CCU and pretend that color attachments and transfers use + * sysmem directly. + */ + + if (flags & + (VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT | + VK_ACCESS_MEMORY_READ_BIT)) { + if (gmem) + mask |= TU_ACCESS_SYSMEM_READ; + else + mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ; + } + + if (flags & + (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_MEMORY_READ_BIT)) { + if (gmem) + mask |= TU_ACCESS_SYSMEM_READ; + else + mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ; + } + + if (flags & + (VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_MEMORY_WRITE_BIT)) { + if (gmem) { + mask |= TU_ACCESS_SYSMEM_WRITE; + } else { + mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE; + } + } + + if (flags & + (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_MEMORY_WRITE_BIT)) { + if (gmem) { + mask |= TU_ACCESS_SYSMEM_WRITE; + } else { + mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE; + } + } + + /* When the dst access is a transfer read/write, it seems we sometimes need + * to insert a WFI after any flushes, to guarantee that the flushes finish + * before the 2D engine starts. However the opposite (i.e. a WFI after + * CP_BLIT and before any subsequent flush) does not seem to be needed, and + * the blob doesn't emit such a WFI. 
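+    * Hence TU_ACCESS_WFI_READ is added below for transfer reads and writes,
+    * on top of the usual cache domain bits.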
+ */ + + if (flags & + (VK_ACCESS_TRANSFER_WRITE_BIT | + VK_ACCESS_MEMORY_WRITE_BIT)) { + if (gmem) { + mask |= TU_ACCESS_SYSMEM_WRITE; + } else { + mask |= TU_ACCESS_CCU_COLOR_WRITE; + } + mask |= TU_ACCESS_WFI_READ; + } + + if (flags & + (VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */ + VK_ACCESS_MEMORY_READ_BIT)) { + mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_WFI_READ; + } + + return mask; +} + + void tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount, @@ -2138,6 +2491,15 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, assert(commandBufferCount > 0); + /* Emit any pending flushes. */ + if (cmd->state.pass) { + tu_flush_all_pending(&cmd->state.renderpass_cache); + tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs); + } else { + tu_flush_all_pending(&cmd->state.cache); + tu_emit_cache_flush(cmd, &cmd->cs); + } + for (uint32_t i = 0; i < commandBufferCount; i++) { TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]); @@ -2176,6 +2538,17 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, } } cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */ + + /* After executing secondary command buffers, there may have been arbitrary + * flushes executed, so when we encounter a pipeline barrier with a + * srcMask, we have to assume that we need to invalidate. Therefore we need + * to re-initialize the cache with all pending invalidate bits set. + */ + if (cmd->state.pass) { + tu_cache_init(&cmd->state.renderpass_cache); + } else { + tu_cache_init(&cmd->state.cache); + } } VkResult @@ -2269,6 +2642,29 @@ tu_TrimCommandPool(VkDevice device, } } +static void +tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, + const struct tu_subpass_barrier *barrier, + bool external) +{ + /* Note: we don't know until the end of the subpass whether we'll use + * sysmem, so assume sysmem here to be safe. + */ + struct tu_cache_state *cache = + external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache; + enum tu_cmd_access_mask src_flags = + vk2tu_access(barrier->src_access_mask, false); + enum tu_cmd_access_mask dst_flags = + vk2tu_access(barrier->dst_access_mask, false); + + if (barrier->incoherent_ccu_color) + src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE; + if (barrier->incoherent_ccu_depth) + src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE; + + tu_flush_for_access(cache, src_flags, dst_flags); +} + void tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo *pRenderPassBegin, @@ -2285,6 +2681,15 @@ tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer, tu_cmd_update_tiling_config(cmd, &pRenderPassBegin->renderArea); tu_cmd_prepare_tile_store_ib(cmd); + /* Note: because this is external, any flushes will happen before draw_cs + * gets called. However deferred flushes could have to happen later as part + * of the subpass. + */ + tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true); + cmd->state.renderpass_cache.pending_flush_bits = + cmd->state.cache.pending_flush_bits; + cmd->state.renderpass_cache.flush_bits = 0; + tu_emit_load_clear(cmd, pRenderPassBegin); tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs); @@ -2353,32 +2758,12 @@ tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents) tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); - /* Emit flushes so that input attachments will read the correct value. 
- * TODO: use subpass dependencies to flush or not - */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - - if (subpass->resolve_attachments) { - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); - - for (unsigned i = 0; i < subpass->color_count; i++) { - uint32_t a = subpass->resolve_attachments[i].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - - tu6_emit_sysmem_resolve(cmd, cs, a, - subpass->color_attachments[i].attachment); - } - - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - } + tu6_emit_sysmem_resolves(cmd, cs, subpass); tu_cond_exec_end(cs); - /* subpass->input_count > 0 then texture cache invalidate is likely to be needed */ - if (cmd->state.subpass->input_count) - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + /* Handle dependencies for the next subpass */ + tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false); /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */ tu6_emit_zs(cmd, cmd->state.subpass, cs); @@ -3276,6 +3661,8 @@ tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw) struct tu_cs *cs = &cmd->draw_cs; VkResult result; + tu_emit_cache_flush_renderpass(cmd, cs); + result = tu6_bind_draw_states(cmd, cs, draw); if (result != VK_SUCCESS) { cmd->record_result = result; @@ -3294,8 +3681,6 @@ tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw) } } - cmd->wait_for_idle = true; - tu_cs_sanity_check(cs); } @@ -3475,6 +3860,11 @@ tu_dispatch(struct tu_cmd_buffer *cmd, &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE]; VkResult result; + /* TODO: We could probably flush less if we add a compute_flush_bits + * bitfield. + */ + tu_emit_cache_flush(cmd, cs); + if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE) tu_cs_emit_ib(cs, &pipeline->program.state_ib); @@ -3571,8 +3961,6 @@ tu_dispatch(struct tu_cmd_buffer *cmd, } tu_cs_emit_wfi(cs); - - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); } void @@ -3641,6 +4029,10 @@ tu_CmdEndRenderPass(VkCommandBuffer commandBuffer) tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs); tu_cs_begin(&cmd_buffer->draw_epilogue_cs); + cmd_buffer->state.cache.pending_flush_bits |= + cmd_buffer->state.renderpass_cache.pending_flush_bits; + tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true); + cmd_buffer->state.pass = NULL; cmd_buffer->state.subpass = NULL; cmd_buffer->state.framebuffer = NULL; @@ -3670,16 +4062,67 @@ tu_barrier(struct tu_cmd_buffer *cmd, const VkImageMemoryBarrier *pImageMemoryBarriers, const struct tu_barrier_info *info) { - /* renderpass case is only for subpass self-dependencies - * which means syncing the render output with texture cache - * note: only the CACHE_INVALIDATE is needed in GMEM mode - * and in sysmem mode we might not need either color/depth flush + struct tu_cs *cs = cmd->state.pass ? 
&cmd->draw_cs : &cmd->cs; + VkAccessFlags srcAccessMask = 0; + VkAccessFlags dstAccessMask = 0; + + for (uint32_t i = 0; i < memoryBarrierCount; i++) { + srcAccessMask |= pMemoryBarriers[i].srcAccessMask; + dstAccessMask |= pMemoryBarriers[i].dstAccessMask; + } + + for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) { + srcAccessMask |= pBufferMemoryBarriers[i].srcAccessMask; + dstAccessMask |= pBufferMemoryBarriers[i].dstAccessMask; + } + + enum tu_cmd_access_mask src_flags = 0; + enum tu_cmd_access_mask dst_flags = 0; + + for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) { + TU_FROM_HANDLE(tu_image, image, pImageMemoryBarriers[i].image); + VkImageLayout old_layout = pImageMemoryBarriers[i].oldLayout; + /* For non-linear images, PREINITIALIZED is the same as UNDEFINED */ + if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED || + (image->tiling != VK_IMAGE_TILING_LINEAR && + old_layout == VK_IMAGE_LAYOUT_PREINITIALIZED)) { + /* The underlying memory for this image may have been used earlier + * within the same queue submission for a different image, which + * means that there may be old, stale cache entries which are in the + * "wrong" location, which could cause problems later after writing + * to the image. We don't want these entries being flushed later and + * overwriting the actual image, so we need to flush the CCU. + */ + src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE; + } + srcAccessMask |= pImageMemoryBarriers[i].srcAccessMask; + dstAccessMask |= pImageMemoryBarriers[i].dstAccessMask; + } + + /* Inside a renderpass, we don't know yet whether we'll be using sysmem + * so we have to use the sysmem flushes. */ - if (cmd->state.pass) { - tu6_emit_event_write(cmd, &cmd->draw_cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, &cmd->draw_cs, PC_CCU_FLUSH_DEPTH_TS); - tu6_emit_event_write(cmd, &cmd->draw_cs, CACHE_INVALIDATE); - return; + bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM && + !cmd->state.pass; + src_flags |= vk2tu_access(srcAccessMask, gmem); + dst_flags |= vk2tu_access(dstAccessMask, gmem); + + struct tu_cache_state *cache = + cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache; + tu_flush_for_access(cache, src_flags, dst_flags); + + for (uint32_t i = 0; i < info->eventCount; i++) { + TU_FROM_HANDLE(tu_event, event, info->pEvents[i]); + + tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ); + + tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); + tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | + CP_WAIT_REG_MEM_0_POLL_MEMORY); + tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */ + tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20)); } } @@ -3708,17 +4151,36 @@ tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer, } static void -write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, unsigned value) +write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, + VkPipelineStageFlags stageMask, unsigned value) { struct tu_cs *cs = &cmd->cs; - tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE); + /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */ + assert(!cmd->state.pass); - /* TODO: any flush required before/after ? */ + tu_emit_cache_flush(cmd, cs); - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); - tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */ - tu_cs_emit(cs, value); + tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE); + + /* Flags that only require a top-of-pipe event. 
DrawIndirect parameters are + * read by the CP, so the draw indirect stage counts as top-of-pipe too. + */ + VkPipelineStageFlags top_of_pipe_flags = + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT | + VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT; + + if (!(stageMask & ~top_of_pipe_flags)) { + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); + tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */ + tu_cs_emit(cs, value); + } else { + /* Use a RB_DONE_TS event to wait for everything to complete. */ + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4); + tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS)); + tu_cs_emit_qw(cs, event->bo.iova); + tu_cs_emit(cs, value); + } } void @@ -3729,7 +4191,7 @@ tu_CmdSetEvent(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); TU_FROM_HANDLE(tu_event, event, _event); - write_event(cmd, event, 1); + write_event(cmd, event, stageMask, 1); } void @@ -3740,7 +4202,7 @@ tu_CmdResetEvent(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); TU_FROM_HANDLE(tu_event, event, _event); - write_event(cmd, event, 0); + write_event(cmd, event, stageMask, 0); } void @@ -3757,23 +4219,15 @@ tu_CmdWaitEvents(VkCommandBuffer commandBuffer, const VkImageMemoryBarrier *pImageMemoryBarriers) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs *cs = &cmd->cs; - - /* TODO: any flush required before/after? (CP_WAIT_FOR_ME?) */ - - for (uint32_t i = 0; i < eventCount; i++) { - TU_FROM_HANDLE(tu_event, event, pEvents[i]); + struct tu_barrier_info info; - tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ); + info.eventCount = eventCount; + info.pEvents = pEvents; + info.srcStageMask = 0; - tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); - tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | - CP_WAIT_REG_MEM_0_POLL_MEMORY); - tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */ - tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1)); - tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u)); - tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20)); - } + tu_barrier(cmd, memoryBarrierCount, pMemoryBarriers, + bufferMemoryBarrierCount, pBufferMemoryBarriers, + imageMemoryBarrierCount, pImageMemoryBarriers, &info); } void diff --git a/src/freedreno/vulkan/tu_pass.c b/src/freedreno/vulkan/tu_pass.c index 842a918bbaa..c3ca8eb939f 100644 --- a/src/freedreno/vulkan/tu_pass.c +++ b/src/freedreno/vulkan/tu_pass.c @@ -29,6 +29,257 @@ #include "vk_util.h" #include "vk_format.h" +static void +tu_render_pass_add_subpass_dep(struct tu_render_pass *pass, + const VkSubpassDependency2 *dep) +{ + uint32_t src = dep->srcSubpass; + uint32_t dst = dep->dstSubpass; + + /* Ignore subpass self-dependencies as they allow the app to call + * vkCmdPipelineBarrier() inside the render pass and the driver should only + * do the barrier when called, not when starting the render pass. 
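+    * When that happens the barrier is handled at vkCmdPipelineBarrier()
+    * time through the renderpass_cache in tu_barrier(), not through these
+    * subpass barriers.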
+ */ + if (src == dst) + return; + + struct tu_subpass_barrier *src_barrier; + if (src == VK_SUBPASS_EXTERNAL) { + src_barrier = &pass->subpasses[0].start_barrier; + } else if (src == pass->subpass_count - 1) { + src_barrier = &pass->end_barrier; + } else { + src_barrier = &pass->subpasses[src + 1].start_barrier; + } + + struct tu_subpass_barrier *dst_barrier; + if (dst == VK_SUBPASS_EXTERNAL) { + dst_barrier = &pass->end_barrier; + } else { + dst_barrier = &pass->subpasses[dst].start_barrier; + } + + if (dep->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT) + src_barrier->src_stage_mask |= dep->srcStageMask; + src_barrier->src_access_mask |= dep->srcAccessMask; + dst_barrier->dst_access_mask |= dep->dstAccessMask; + if (src == VK_SUBPASS_EXTERNAL) + pass->subpasses[dst].has_external_src = true; + if (dst == VK_SUBPASS_EXTERNAL) + pass->subpasses[src].has_external_dst = true; +} + +/* We currently only care about undefined layouts, because we have to + * flush/invalidate CCU for those. PREINITIALIZED is the same thing as + * UNDEFINED for anything not linear tiled, but we don't know yet whether the + * images used are tiled, so just assume they are. + */ + +static bool +layout_undefined(VkImageLayout layout) +{ + return layout == VK_IMAGE_LAYOUT_UNDEFINED || + layout == VK_IMAGE_LAYOUT_PREINITIALIZED; +} + +/* This implements the following bit of spec text: + * + * If there is no subpass dependency from VK_SUBPASS_EXTERNAL to the + * first subpass that uses an attachment, then an implicit subpass + * dependency exists from VK_SUBPASS_EXTERNAL to the first subpass it is + * used in. The implicit subpass dependency only exists if there + * exists an automatic layout transition away from initialLayout. + * The subpass dependency operates as if defined with the + * following parameters: + * + * VkSubpassDependency implicitDependency = { + * .srcSubpass = VK_SUBPASS_EXTERNAL; + * .dstSubpass = firstSubpass; // First subpass attachment is used in + * .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + * .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; + * .srcAccessMask = 0; + * .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | + * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + * .dependencyFlags = 0; + * }; + * + * Similarly, if there is no subpass dependency from the last subpass + * that uses an attachment to VK_SUBPASS_EXTERNAL, then an implicit + * subpass dependency exists from the last subpass it is used in to + * VK_SUBPASS_EXTERNAL. The implicit subpass dependency only exists + * if there exists an automatic layout transition into finalLayout. 
+ * The subpass dependency operates as if defined with the following + * parameters: + * + * VkSubpassDependency implicitDependency = { + * .srcSubpass = lastSubpass; // Last subpass attachment is used in + * .dstSubpass = VK_SUBPASS_EXTERNAL; + * .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; + * .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + * .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | + * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + * .dstAccessMask = 0; + * .dependencyFlags = 0; + * }; + * + * Note: currently this is the only use we have for layout transitions, + * besides needing to invalidate CCU at the beginning, so we also flag + * transitions from UNDEFINED here. + */ +static void +tu_render_pass_add_implicit_deps(struct tu_render_pass *pass) +{ + bool att_used[pass->attachment_count]; + + memset(att_used, 0, sizeof(att_used)); + + for (unsigned i = 0; i < pass->subpass_count; i++) { + struct tu_subpass *subpass = &pass->subpasses[i]; + if (!subpass->has_external_src) + continue; + + bool src_implicit_dep = false; + + for (unsigned j = 0; j < subpass->input_count; j++) { + unsigned att_idx = subpass->input_attachments[j].attachment; + if (att_idx == VK_ATTACHMENT_UNUSED) + continue; + struct tu_render_pass_attachment *att = &pass->attachments[att_idx]; + if (att->initial_layout != subpass->input_attachments[j].layout && + !att_used[att_idx]) { + src_implicit_dep = true; + } + att_used[att_idx] = true; + } + + for (unsigned j = 0; j < subpass->color_count; j++) { + unsigned att_idx = subpass->color_attachments[j].attachment; + if (att_idx == VK_ATTACHMENT_UNUSED) + continue; + struct tu_render_pass_attachment *att = &pass->attachments[att_idx]; + if (att->initial_layout != subpass->color_attachments[j].layout && + !att_used[att_idx]) { + src_implicit_dep = true; + } + att_used[att_idx] = true; + } + + if (subpass->resolve_attachments) { + for (unsigned j = 0; j < subpass->color_count; j++) { + unsigned att_idx = subpass->resolve_attachments[j].attachment; + if (att_idx == VK_ATTACHMENT_UNUSED) + continue; + struct tu_render_pass_attachment *att = &pass->attachments[att_idx]; + if (att->initial_layout != subpass->resolve_attachments[j].layout && + !att_used[att_idx]) { + src_implicit_dep = true; + } + att_used[att_idx] = true; + } + } + + if (src_implicit_dep) { + tu_render_pass_add_subpass_dep(pass, &(VkSubpassDependency2KHR) { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = i, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + .dependencyFlags = 0, + }); + } + } + + memset(att_used, 0, sizeof(att_used)); + + for (int i = pass->subpass_count - 1; i >= 0; i--) { + struct tu_subpass *subpass = &pass->subpasses[i]; + if (!subpass->has_external_dst) + continue; + + bool dst_implicit_dep = false; + + for (unsigned j = 0; j < subpass->input_count; j++) { + unsigned att_idx = subpass->input_attachments[j].attachment; + if (att_idx == VK_ATTACHMENT_UNUSED) + continue; + struct tu_render_pass_attachment *att = &pass->attachments[att_idx]; + if (att->final_layout != subpass->input_attachments[j].layout && + 
!att_used[att_idx]) { + dst_implicit_dep = true; + } + att_used[att_idx] = true; + } + + for (unsigned j = 0; j < subpass->color_count; j++) { + unsigned att_idx = subpass->color_attachments[j].attachment; + if (att_idx == VK_ATTACHMENT_UNUSED) + continue; + struct tu_render_pass_attachment *att = &pass->attachments[att_idx]; + if (att->final_layout != subpass->color_attachments[j].layout && + !att_used[att_idx]) { + dst_implicit_dep = true; + } + att_used[att_idx] = true; + } + + if (subpass->resolve_attachments) { + for (unsigned j = 0; j < subpass->color_count; j++) { + unsigned att_idx = subpass->resolve_attachments[j].attachment; + if (att_idx == VK_ATTACHMENT_UNUSED) + continue; + struct tu_render_pass_attachment *att = &pass->attachments[att_idx]; + if (att->final_layout != subpass->resolve_attachments[j].layout && + !att_used[att_idx]) { + dst_implicit_dep = true; + } + att_used[att_idx] = true; + } + } + + if (dst_implicit_dep) { + tu_render_pass_add_subpass_dep(pass, &(VkSubpassDependency2KHR) { + .srcSubpass = i, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + .dstAccessMask = 0, + .dependencyFlags = 0, + }); + } + } + + /* Handle UNDEFINED transitions, similar to the handling in tu_barrier(). + * Assume that if an attachment has an initial layout of UNDEFINED, it gets + * transitioned eventually. + */ + for (unsigned i = 0; i < pass->attachment_count; i++) { + if (layout_undefined(pass->attachments[i].initial_layout)) { + if (vk_format_is_depth_or_stencil(pass->attachments[i].format)) { + pass->subpasses[0].start_barrier.incoherent_ccu_depth = true; + } else { + pass->subpasses[0].start_barrier.incoherent_ccu_color = true; + } + } + } +} + static void update_samples(struct tu_subpass *subpass, VkSampleCountFlagBits samples) { @@ -119,6 +370,8 @@ create_render_pass_common(struct tu_render_pass *pass, att->load = false; } } + + tu_render_pass_add_implicit_deps(pass); } static void @@ -193,6 +446,8 @@ tu_CreateRenderPass(VkDevice _device, att->format = pCreateInfo->pAttachments[i].format; att->samples = pCreateInfo->pAttachments[i].samples; att->cpp = vk_format_get_blocksize(att->format) * att->samples; + att->initial_layout = pCreateInfo->pAttachments[i].initialLayout; + att->final_layout = pCreateInfo->pAttachments[i].finalLayout; att->gmem_offset = -1; attachment_set_ops(att, @@ -240,6 +495,8 @@ tu_CreateRenderPass(VkDevice _device, for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) { uint32_t a = desc->pInputAttachments[j].attachment; subpass->input_attachments[j].attachment = a; + subpass->input_attachments[j].layout = + desc->pInputAttachments[j].layout; if (a != VK_ATTACHMENT_UNUSED) pass->attachments[a].gmem_offset = 0; } @@ -252,6 +509,8 @@ tu_CreateRenderPass(VkDevice _device, for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { uint32_t a = desc->pColorAttachments[j].attachment; subpass->color_attachments[j].attachment = a; + subpass->color_attachments[j].layout = + desc->pColorAttachments[j].layout; if (a != VK_ATTACHMENT_UNUSED) { pass->attachments[a].gmem_offset = 0; @@ -266,6 +525,8 @@ tu_CreateRenderPass(VkDevice _device, for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { subpass->resolve_attachments[j].attachment 
= desc->pResolveAttachments[j].attachment; + subpass->resolve_attachments[j].layout = + desc->pResolveAttachments[j].layout; } } @@ -274,12 +535,28 @@ tu_CreateRenderPass(VkDevice _device, subpass->depth_stencil_attachment.attachment = a; if (a != VK_ATTACHMENT_UNUSED) { pass->attachments[a].gmem_offset = 0; + subpass->depth_stencil_attachment.layout = + desc->pDepthStencilAttachment->layout; update_samples(subpass, pCreateInfo->pAttachments[a].samples); } subpass->samples = subpass->samples ?: 1; } + for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) { + /* Convert to a Dependency2 */ + struct VkSubpassDependency2 dep2 = { + .srcSubpass = pCreateInfo->pDependencies[i].srcSubpass, + .dstSubpass = pCreateInfo->pDependencies[i].dstSubpass, + .srcStageMask = pCreateInfo->pDependencies[i].srcStageMask, + .dstStageMask = pCreateInfo->pDependencies[i].dstStageMask, + .srcAccessMask = pCreateInfo->pDependencies[i].srcAccessMask, + .dstAccessMask = pCreateInfo->pDependencies[i].dstAccessMask, + .dependencyFlags = pCreateInfo->pDependencies[i].dependencyFlags, + }; + tu_render_pass_add_subpass_dep(pass, &dep2); + } + *pRenderPass = tu_render_pass_to_handle(pass); create_render_pass_common(pass, device->physical_device); @@ -321,6 +598,8 @@ tu_CreateRenderPass2(VkDevice _device, att->format = pCreateInfo->pAttachments[i].format; att->samples = pCreateInfo->pAttachments[i].samples; att->cpp = vk_format_get_blocksize(att->format) * att->samples; + att->initial_layout = pCreateInfo->pAttachments[i].initialLayout; + att->final_layout = pCreateInfo->pAttachments[i].finalLayout; att->gmem_offset = -1; attachment_set_ops(att, @@ -367,6 +646,8 @@ tu_CreateRenderPass2(VkDevice _device, for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) { uint32_t a = desc->pInputAttachments[j].attachment; subpass->input_attachments[j].attachment = a; + subpass->input_attachments[j].layout = + desc->pInputAttachments[j].layout; if (a != VK_ATTACHMENT_UNUSED) pass->attachments[a].gmem_offset = 0; } @@ -379,6 +660,8 @@ tu_CreateRenderPass2(VkDevice _device, for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { uint32_t a = desc->pColorAttachments[j].attachment; subpass->color_attachments[j].attachment = a; + subpass->color_attachments[j].layout = + desc->pColorAttachments[j].layout; if (a != VK_ATTACHMENT_UNUSED) { pass->attachments[a].gmem_offset = 0; @@ -393,6 +676,8 @@ tu_CreateRenderPass2(VkDevice _device, for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { subpass->resolve_attachments[j].attachment = desc->pResolveAttachments[j].attachment; + subpass->resolve_attachments[j].layout = + desc->pResolveAttachments[j].layout; } } @@ -402,12 +687,18 @@ tu_CreateRenderPass2(VkDevice _device, subpass->depth_stencil_attachment.attachment = a; if (a != VK_ATTACHMENT_UNUSED) { pass->attachments[a].gmem_offset = 0; + subpass->depth_stencil_attachment.layout = + desc->pDepthStencilAttachment->layout; update_samples(subpass, pCreateInfo->pAttachments[a].samples); } subpass->samples = subpass->samples ?: 1; } + for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) { + tu_render_pass_add_subpass_dep(pass, &pCreateInfo->pDependencies[i]); + } + *pRenderPass = tu_render_pass_to_handle(pass); create_render_pass_common(pass, device->physical_device); diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 372aff4225d..da890674485 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -901,6 +901,116 @@ struct tu_streamout_state { uint32_t 
vpc_so_buf_cntl; }; +/* There are only three cache domains we have to care about: the CCU, or + * color cache unit, which is used for color and depth/stencil attachments + * and copy/blit destinations, and is split conceptually into color and depth, + * and the universal cache or UCHE which is used for pretty much everything + * else, except for the CP (uncached) and host. We need to flush whenever data + * crosses these boundaries. + */ + +enum tu_cmd_access_mask { + TU_ACCESS_UCHE_READ = 1 << 0, + TU_ACCESS_UCHE_WRITE = 1 << 1, + TU_ACCESS_CCU_COLOR_READ = 1 << 2, + TU_ACCESS_CCU_COLOR_WRITE = 1 << 3, + TU_ACCESS_CCU_DEPTH_READ = 1 << 4, + TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5, + + /* Experiments have shown that while it's safe to avoid flushing the CCU + * after each blit/renderpass, it's not safe to assume that subsequent + * lookups with a different attachment state will hit unflushed cache + * entries. That is, the CCU needs to be flushed and possibly invalidated + * when accessing memory with a different attachment state. Writing to an + * attachment under the following conditions after clearing using the + * normal 2d engine path is known to have issues: + * + * - It isn't the 0'th layer. + * - There are more than one attachment, and this isn't the 0'th attachment + * (this seems to also depend on the cpp of the attachments). + * + * Our best guess is that the layer/MRT state is used when computing + * the location of a cache entry in CCU, to avoid conflicts. We assume that + * any access in a renderpass after or before an access by a transfer needs + * a flush/invalidate, and use the _INCOHERENT variants to represent access + * by a transfer. + */ + TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6, + TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7, + TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8, + TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9, + + TU_ACCESS_SYSMEM_READ = 1 << 10, + TU_ACCESS_SYSMEM_WRITE = 1 << 11, + + /* Set if a WFI is required due to data being read by the CP or the 2D + * engine. + */ + TU_ACCESS_WFI_READ = 1 << 12, + + TU_ACCESS_READ = + TU_ACCESS_UCHE_READ | + TU_ACCESS_CCU_COLOR_READ | + TU_ACCESS_CCU_DEPTH_READ | + TU_ACCESS_CCU_COLOR_INCOHERENT_READ | + TU_ACCESS_CCU_DEPTH_INCOHERENT_READ | + TU_ACCESS_SYSMEM_READ, + + TU_ACCESS_WRITE = + TU_ACCESS_UCHE_WRITE | + TU_ACCESS_CCU_COLOR_WRITE | + TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE | + TU_ACCESS_CCU_DEPTH_WRITE | + TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE | + TU_ACCESS_SYSMEM_WRITE, + + TU_ACCESS_ALL = + TU_ACCESS_READ | + TU_ACCESS_WRITE, +}; + +enum tu_cmd_flush_bits { + TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0, + TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1, + TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2, + TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3, + TU_CMD_FLAG_CACHE_FLUSH = 1 << 4, + TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5, + + TU_CMD_FLAG_ALL_FLUSH = + TU_CMD_FLAG_CCU_FLUSH_DEPTH | + TU_CMD_FLAG_CCU_FLUSH_COLOR | + TU_CMD_FLAG_CACHE_FLUSH, + + TU_CMD_FLAG_ALL_INVALIDATE = + TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | + TU_CMD_FLAG_CCU_INVALIDATE_COLOR | + TU_CMD_FLAG_CACHE_INVALIDATE, + + TU_CMD_FLAG_WFI = 1 << 6, +}; + +/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty + * heavy, involving a CCU cache flush/invalidate and a WFI in order to change + * which part of the gmem is used by the CCU. Here we keep track of what the + * state of the CCU. 
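+ * tu_emit_cache_flush_ccu() uses this to skip the flush/invalidate and the
+ * RB_CCU_CNTL write when the CCU is already in the requested state.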
+ */ +enum tu_cmd_ccu_state { + TU_CMD_CCU_SYSMEM, + TU_CMD_CCU_GMEM, + TU_CMD_CCU_UNKNOWN, +}; + +struct tu_cache_state { + /* Caches which must be made available (flushed) eventually if there are + * any users outside that cache domain, and caches which must be + * invalidated eventually if there are any reads. + */ + enum tu_cmd_flush_bits pending_flush_bits; + /* Pending flushes */ + enum tu_cmd_flush_bits flush_bits; +}; + struct tu_cmd_state { uint32_t dirty; @@ -935,6 +1045,17 @@ struct tu_cmd_state uint32_t max_index_count; uint64_t index_va; + /* Renderpasses are tricky, because we may need to flush differently if + * using sysmem vs. gmem and therefore we have to delay any flushing that + * happens before a renderpass. So we have to have two copies of the flush + * state, one for intra-renderpass flushes (i.e. renderpass dependencies) + * and one for outside a renderpass. + */ + struct tu_cache_state cache; + struct tu_cache_state renderpass_cache; + + enum tu_cmd_ccu_state ccu_state; + const struct tu_render_pass *pass; const struct tu_subpass *subpass; const struct tu_framebuffer *framebuffer; @@ -1054,8 +1175,6 @@ struct tu_cmd_buffer uint32_t vsc_draw_strm_pitch; uint32_t vsc_prim_strm_pitch; bool use_vsc_data; - - bool wait_for_idle; }; /* Temporary struct for tracking a register state to be written, used by @@ -1071,6 +1190,10 @@ struct tu_reg_value { uint32_t bo_shift; }; +void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, + struct tu_cs *cs, + enum tu_cmd_ccu_state ccu_state); + void tu6_emit_event_write(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -1602,9 +1725,17 @@ struct tu_framebuffer struct tu_attachment_info attachments[0]; }; +struct tu_subpass_barrier { + VkPipelineStageFlags src_stage_mask; + VkAccessFlags src_access_mask; + VkAccessFlags dst_access_mask; + bool incoherent_ccu_color, incoherent_ccu_depth; +}; + struct tu_subpass_attachment { uint32_t attachment; + VkImageLayout layout; }; struct tu_subpass @@ -1617,8 +1748,11 @@ struct tu_subpass struct tu_subpass_attachment depth_stencil_attachment; VkSampleCountFlagBits samples; + bool has_external_src, has_external_dst; uint32_t srgb_cntl; + + struct tu_subpass_barrier start_barrier; }; struct tu_render_pass_attachment @@ -1629,6 +1763,7 @@ struct tu_render_pass_attachment VkImageAspectFlags clear_mask; bool load; bool store; + VkImageLayout initial_layout, final_layout; int32_t gmem_offset; }; @@ -1640,6 +1775,7 @@ struct tu_render_pass uint32_t tile_align_w; struct tu_subpass_attachment *subpass_attachments; struct tu_render_pass_attachment *attachments; + struct tu_subpass_barrier end_barrier; struct tu_subpass subpasses[0]; };