From a0ca688a674d9b85d7efecc8a0199be80c26b501 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Mon, 20 Jul 2020 13:12:32 +0200 Subject: [PATCH] tu: Integrate WFI/WAIT_FOR_ME/WAIT_MEM_WRITES with cache tracking Track them via pending_flush_bits. Previously WFI was only tracked in flush_bits and WAIT_FOR_ME was emitted directly. This means that we don't emit WAIT_FOR_ME or WAIT_FOR_IDLE if there wasn't a cache flush or other write by the GPU. Also split up host writes from sysmem writes, as only the latter require WFI/WAIT_FOR_ME. Along the way, I also realized that we were missing proper handling of transform feedback counter writes which require WAIT_MEM_WRITES. Plumb that through as well. And CmdDrawIndirectByteCountEXT needs a WAIT_FOR_ME as it does not wait for WFI internally. As an example of what this does, a typical barrier for transform feedback with srcAccess = VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT and dstAccess = VK_ACCESS_INDIRECT_COMMAND_READ_BIT used to emit on A650: - WAIT_FOR_IDLE and now we emit: - WAIT_MEM_WRITES - WAIT_FOR_ME So we've eliminated a useless WFI and added some necessary waits. 
Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.c | 120 +++++++++++++++++++++------ src/freedreno/vulkan/tu_private.h | 59 ++++++++++--- 2 files changed, 142 insertions(+), 37 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index e31f127f322..43e3c78386b 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -159,8 +159,12 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer, tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS); if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE) tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE); - if (flushes & TU_CMD_FLAG_WFI) + if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES) + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + if (flushes & TU_CMD_FLAG_WAIT_FOR_IDLE) tu_cs_emit_wfi(cs); + if (flushes & TU_CMD_FLAG_WAIT_FOR_ME) + tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); } /* "Normal" cache flushes, that don't require any special handling */ @@ -214,10 +218,11 @@ tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, flushes |= TU_CMD_FLAG_CCU_INVALIDATE_COLOR | TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | - TU_CMD_FLAG_WFI; + TU_CMD_FLAG_WAIT_FOR_IDLE; cmd_buffer->state.cache.pending_flush_bits &= ~( TU_CMD_FLAG_CCU_INVALIDATE_COLOR | - TU_CMD_FLAG_CCU_INVALIDATE_DEPTH); + TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | + TU_CMD_FLAG_WAIT_FOR_IDLE); } tu6_emit_flushes(cmd_buffer, cs, flushes); @@ -2192,10 +2197,28 @@ tu_flush_for_access(struct tu_cache_state *cache, { enum tu_cmd_flush_bits flush_bits = 0; + if (src_mask & TU_ACCESS_HOST_WRITE) { + /* Host writes are always visible to CP, so only invalidate GPU caches */ + cache->pending_flush_bits |= TU_CMD_FLAG_GPU_INVALIDATE; + } + if (src_mask & TU_ACCESS_SYSMEM_WRITE) { + /* Invalidate CP and 2D engine (make it do WFI + WFM if necessary) as + * well. + */ cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE; } + if (src_mask & TU_ACCESS_CP_WRITE) { + /* Flush the CP write queue. 
However a WFI shouldn't be necessary as + * WAIT_MEM_WRITES should cover it. + */ + cache->pending_flush_bits |= + TU_CMD_FLAG_WAIT_MEM_WRITES | + TU_CMD_FLAG_GPU_INVALIDATE | + TU_CMD_FLAG_WAIT_FOR_ME; + } + #define SRC_FLUSH(domain, flush, invalidate) \ if (src_mask & TU_ACCESS_##domain##_WRITE) { \ cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \ @@ -2220,7 +2243,11 @@ tu_flush_for_access(struct tu_cache_state *cache, #undef SRC_INCOHERENT_FLUSH - if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) { + /* Treat host & sysmem write accesses the same, since the kernel implicitly + * drains the queue before signalling completion to the host. + */ + if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE | + TU_ACCESS_HOST_READ | TU_ACCESS_HOST_WRITE)) { flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH; } @@ -2252,7 +2279,13 @@ tu_flush_for_access(struct tu_cache_state *cache, #undef DST_INCOHERENT_FLUSH if (dst_mask & TU_ACCESS_WFI_READ) { - flush_bits |= TU_CMD_FLAG_WFI; + flush_bits |= cache->pending_flush_bits & + (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_IDLE); + } + + if (dst_mask & TU_ACCESS_WFM_READ) { + flush_bits |= cache->pending_flush_bits & + (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_ME); } cache->flush_bits |= flush_bits; @@ -2265,30 +2298,45 @@ vk2tu_access(VkAccessFlags flags, bool gmem) enum tu_cmd_access_mask mask = 0; /* If the GPU writes a buffer that is then read by an indirect draw - * command, we theoretically need a WFI + WAIT_FOR_ME combination to - * wait for the writes to complete. The WAIT_FOR_ME is performed as part - * of the draw by the firmware, so we just need to execute a WFI. + * command, we theoretically need to emit a WFI to wait for any cache + * flushes, and then a WAIT_FOR_ME to wait on the CP for the WFI to + * complete. Waiting for the WFI to complete is performed as part of the + * draw by the firmware, so we just need to execute the WFI. 
+ * + * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly + * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it. */ if (flags & (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | + VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | VK_ACCESS_MEMORY_READ_BIT)) { mask |= TU_ACCESS_WFI_READ; } if (flags & (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */ - VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP, I think */ VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */ - VK_ACCESS_HOST_READ_BIT | /* sysmem by definition */ + VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP */ VK_ACCESS_MEMORY_READ_BIT)) { mask |= TU_ACCESS_SYSMEM_READ; } + if (flags & + (VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT | + VK_ACCESS_MEMORY_WRITE_BIT)) { + mask |= TU_ACCESS_CP_WRITE; + } + + if (flags & + (VK_ACCESS_HOST_READ_BIT | + VK_ACCESS_MEMORY_WRITE_BIT)) { + mask |= TU_ACCESS_HOST_READ; + } + if (flags & (VK_ACCESS_HOST_WRITE_BIT | - VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT | /* Write performed by CP, I think */ VK_ACCESS_MEMORY_WRITE_BIT)) { - mask |= TU_ACCESS_SYSMEM_WRITE; + mask |= TU_ACCESS_HOST_WRITE; } if (flags & @@ -3166,6 +3214,20 @@ tu_CmdDrawIndexed(VkCommandBuffer commandBuffer, tu_cs_emit(cs, cmd->state.max_index_count); } +/* Various firmware bugs/inconsistencies mean that some indirect draw opcodes + * do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if + * pending for these opcodes. This may result in a few extra WAIT_FOR_ME's + * with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's + * before draw opcodes that don't need it. 
+ */ +static void +draw_wfm(struct tu_cmd_buffer *cmd) +{ + cmd->state.renderpass_cache.flush_bits |= + cmd->state.renderpass_cache.pending_flush_bits & TU_CMD_FLAG_WAIT_FOR_ME; + cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME; +} + void tu_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, @@ -3179,15 +3241,17 @@ tu_CmdDrawIndirect(VkCommandBuffer commandBuffer, cmd->state.vs_params = (struct tu_draw_state) {}; - tu6_draw_common(cmd, cs, false, 0); - - /* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI, where it - * doesn't wait for WFIs to be completed and leads to GPU fault/hang - * TODO: this could be worked around in a more performant way, - * or there may exist newer firmware that has been fixed + /* The latest known a630_sqe.fw fails to wait for WFI before reading the + * indirect buffer when using CP_DRAW_INDIRECT_MULTI, so we have to fall + * back to CP_WAIT_FOR_ME except for a650 which has a fixed firmware. + * + * TODO: There may be newer a630_sqe.fw released in the future which fixes + * this, if so we should detect it and avoid this workaround. 
*/ if (cmd->device->physical_device->gpu_id != 650) - tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + draw_wfm(cmd); + + tu6_draw_common(cmd, cs, false, 0); tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6); tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX)); @@ -3213,15 +3277,10 @@ tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, cmd->state.vs_params = (struct tu_draw_state) {}; - tu6_draw_common(cmd, cs, true, 0); - - /* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI, where it - * doesn't wait for WFIs to be completed and leads to GPU fault/hang - * TODO: this could be worked around in a more performant way, - * or there may exist newer firmware that has been fixed - */ if (cmd->device->physical_device->gpu_id != 650) - tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + draw_wfm(cmd); + + tu6_draw_common(cmd, cs, true, 0); tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9); tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA)); @@ -3248,6 +3307,13 @@ void tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer); struct tu_cs *cs = &cmd->draw_cs; + /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO. + * Plus, for the common case where the counter buffer is written by + * vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to + * complete which means we need a WAIT_FOR_ME anyway. 
+ */ + draw_wfm(cmd); + cmd->state.vs_params = tu6_emit_vs_params(cmd, 0, firstInstance); tu6_draw_common(cmd, cs, false, 0); diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 7a64245ef9e..ff55e379a02 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -752,13 +752,34 @@ enum tu_cmd_access_mask { TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8, TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9, - TU_ACCESS_SYSMEM_READ = 1 << 10, - TU_ACCESS_SYSMEM_WRITE = 1 << 11, + /* Accesses by the host */ + TU_ACCESS_HOST_READ = 1 << 10, + TU_ACCESS_HOST_WRITE = 1 << 11, - /* Set if a WFI is required due to data being read by the CP or the 2D - * engine. + /* Accesses by a GPU engine which bypasses any cache. e.g. writes via + * CP_EVENT_WRITE::BLIT and the CP are SYSMEM_WRITE. */ - TU_ACCESS_WFI_READ = 1 << 12, + TU_ACCESS_SYSMEM_READ = 1 << 12, + TU_ACCESS_SYSMEM_WRITE = 1 << 13, + + /* Set if a WFI is required. This can be required for: + * - 2D engine which (on some models) doesn't wait for flushes to complete + * before starting + * - CP draw indirect opcodes, where we need to wait for any flushes to + * complete but the CP implicitly waits for WFI's to complete and + * therefore we only need a WFI after the flushes. + */ + TU_ACCESS_WFI_READ = 1 << 14, + + /* Set if a CP_WAIT_FOR_ME is required due to the data being read by the CP + * without it waiting for any WFI. + */ + TU_ACCESS_WFM_READ = 1 << 15, + + /* Memory writes from the CP start in-order with draws and event writes, + * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read. 
+ */ + TU_ACCESS_CP_WRITE = 1 << 16, TU_ACCESS_READ = TU_ACCESS_UCHE_READ | @@ -766,7 +787,10 @@ enum tu_cmd_access_mask { TU_ACCESS_CCU_DEPTH_READ | TU_ACCESS_CCU_COLOR_INCOHERENT_READ | TU_ACCESS_CCU_DEPTH_INCOHERENT_READ | - TU_ACCESS_SYSMEM_READ, + TU_ACCESS_HOST_READ | + TU_ACCESS_SYSMEM_READ | + TU_ACCESS_WFI_READ | + TU_ACCESS_WFM_READ, TU_ACCESS_WRITE = TU_ACCESS_UCHE_WRITE | @@ -774,7 +798,9 @@ enum tu_cmd_access_mask { TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE | TU_ACCESS_CCU_DEPTH_WRITE | TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE | - TU_ACCESS_SYSMEM_WRITE, + TU_ACCESS_HOST_WRITE | + TU_ACCESS_SYSMEM_WRITE | + TU_ACCESS_CP_WRITE, TU_ACCESS_ALL = TU_ACCESS_READ | @@ -788,18 +814,31 @@ enum tu_cmd_flush_bits { TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3, TU_CMD_FLAG_CACHE_FLUSH = 1 << 4, TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5, + TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6, + TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7, + TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8, TU_CMD_FLAG_ALL_FLUSH = TU_CMD_FLAG_CCU_FLUSH_DEPTH | TU_CMD_FLAG_CCU_FLUSH_COLOR | - TU_CMD_FLAG_CACHE_FLUSH, + TU_CMD_FLAG_CACHE_FLUSH | + /* Treat the CP as a sort of "cache" which may need to be "flushed" via + * waiting for writes to land with WAIT_FOR_MEM_WRITES. + */ + TU_CMD_FLAG_WAIT_MEM_WRITES, - TU_CMD_FLAG_ALL_INVALIDATE = + TU_CMD_FLAG_GPU_INVALIDATE = TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | TU_CMD_FLAG_CCU_INVALIDATE_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE, - TU_CMD_FLAG_WFI = 1 << 6, + TU_CMD_FLAG_ALL_INVALIDATE = + TU_CMD_FLAG_GPU_INVALIDATE | + /* Treat the CP as a sort of "cache" which may need to be "invalidated" + * via waiting for UCHE/CCU flushes to land with WFI/WFM. + */ + TU_CMD_FLAG_WAIT_FOR_IDLE | + TU_CMD_FLAG_WAIT_FOR_ME, }; /* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty -- 2.30.2