#include "tu_private.h"
-#include "registers/adreno_pm4.xml.h"
-#include "registers/adreno_common.xml.h"
+#include "adreno_pm4.xml.h"
+#include "adreno_common.xml.h"
#include "vk_format.h"
tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
- if (flushes & TU_CMD_FLAG_WFI)
+ if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
+ tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+ if (flushes & TU_CMD_FLAG_WAIT_FOR_IDLE)
tu_cs_emit_wfi(cs);
+ if (flushes & TU_CMD_FLAG_WAIT_FOR_ME)
+ tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
}
/* "Normal" cache flushes, that don't require any special handling */
flushes |=
TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
- TU_CMD_FLAG_WFI;
+ TU_CMD_FLAG_WAIT_FOR_IDLE;
cmd_buffer->state.cache.pending_flush_bits &= ~(
TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
- TU_CMD_FLAG_CCU_INVALIDATE_DEPTH);
+ TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
+ TU_CMD_FLAG_WAIT_FOR_IDLE);
}
tu6_emit_flushes(cmd_buffer, cs, flushes);
static void
tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
{
+
const VkRect2D *render_area = &cmd->state.render_area;
+
+ /* Avoid assertion fails with an empty render area at (0, 0) where the
+ * subtraction below wraps around. Empty render areas should be forced to
+ * the sysmem path by use_sysmem_rendering(). It's not even clear whether
+ * an empty scissor here works, and the blob seems to force sysmem too as
+ * it sets something wrong (non-empty) for the scissor.
+ */
+ if (render_area->extent.width == 0 ||
+ render_area->extent.height == 0)
+ return;
+
uint32_t x1 = render_area->offset.x;
uint32_t y1 = render_area->offset.y;
uint32_t x2 = x1 + render_area->extent.width - 1;
if (cmd->state.framebuffer->layers > 1)
return true;
+ /* Use sysmem for empty render areas */
+ if (cmd->state.render_area.extent.width == 0 ||
+ cmd->state.render_area.extent.height == 0)
+ return true;
+
if (cmd->has_tess)
return true;
tu6_emit_window_scissor(cs, x1, y1, x2, y2);
tu6_emit_window_offset(cs, x1, y1);
- tu_cs_emit_regs(cs,
- A6XX_VPC_SO_OVERRIDE(.so_disable = false));
+ tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));
if (use_hw_binning(cmd)) {
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
.gfx_bindless = 0x1f,
.cs_bindless = 0x1f));
+ tu_cs_emit_wfi(cs);
+
+ cmd->state.cache.pending_flush_bits &=
+ ~(TU_CMD_FLAG_WAIT_FOR_IDLE | TU_CMD_FLAG_CACHE_INVALIDATE);
+
tu_cs_emit_regs(cs,
A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9107, 0);
- tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9236,
- A6XX_VPC_UNKNOWN_9236_POINT_COORD_INVERT(0));
+ tu_cs_emit_regs(cs, A6XX_VPC_POINT_COORD_INVERT(false));
tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
- tu_cs_emit_write_reg(cs, REG_A6XX_VPC_SO_OVERRIDE,
- A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
+ tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));
- tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9801, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0);
- tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B07, 0);
tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
/* enable stream-out, with sysmem there is only one pass: */
- tu_cs_emit_regs(cs,
- A6XX_VPC_SO_OVERRIDE(.so_disable = false));
+ tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));
tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
tu_cs_emit(cs, 0x1);
const struct tu_framebuffer *fb = cmd->state.framebuffer;
if (use_hw_binning(cmd)) {
/* enable stream-out during binning pass: */
- tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
+ tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));
tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);
tu6_emit_binning_pass(cmd, cs);
/* and disable stream-out for draw pass: */
- tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=true));
+ tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));
tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);
tu_cs_emit(cs, 0x1);
} else {
/* no binning pass, so enable stream-out for draw pass:: */
- tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
+ tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));
tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, 0x6000000);
}
{
enum tu_cmd_flush_bits flush_bits = 0;
+ if (src_mask & TU_ACCESS_HOST_WRITE) {
+ /* Host writes are always visible to CP, so only invalidate GPU caches */
+ cache->pending_flush_bits |= TU_CMD_FLAG_GPU_INVALIDATE;
+ }
+
if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
+ /* Invalidate CP and 2D engine (make it do WFI + WFM if necessary) as
+ * well.
+ */
cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
}
+ if (src_mask & TU_ACCESS_CP_WRITE) {
+      /* Flush the CP write queue. However, a WFI shouldn't be necessary, as
+       * WAIT_MEM_WRITES should cover it.
+       */
+ cache->pending_flush_bits |=
+ TU_CMD_FLAG_WAIT_MEM_WRITES |
+ TU_CMD_FLAG_GPU_INVALIDATE |
+ TU_CMD_FLAG_WAIT_FOR_ME;
+ }
+
#define SRC_FLUSH(domain, flush, invalidate) \
if (src_mask & TU_ACCESS_##domain##_WRITE) { \
cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \
#undef SRC_INCOHERENT_FLUSH
- if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
+ /* Treat host & sysmem write accesses the same, since the kernel implicitly
+ * drains the queue before signalling completion to the host.
+ */
+ if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE |
+ TU_ACCESS_HOST_READ | TU_ACCESS_HOST_WRITE)) {
flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
}
#undef DST_INCOHERENT_FLUSH
if (dst_mask & TU_ACCESS_WFI_READ) {
- flush_bits |= TU_CMD_FLAG_WFI;
+ flush_bits |= cache->pending_flush_bits &
+ (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_IDLE);
+ }
+
+ if (dst_mask & TU_ACCESS_WFM_READ) {
+ flush_bits |= cache->pending_flush_bits &
+ (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_ME);
}
cache->flush_bits |= flush_bits;
enum tu_cmd_access_mask mask = 0;
/* If the GPU writes a buffer that is then read by an indirect draw
- * command, we theoretically need a WFI + WAIT_FOR_ME combination to
- * wait for the writes to complete. The WAIT_FOR_ME is performed as part
- * of the draw by the firmware, so we just need to execute a WFI.
+ * command, we theoretically need to emit a WFI to wait for any cache
+ * flushes, and then a WAIT_FOR_ME to wait on the CP for the WFI to
+ * complete. Waiting for the WFI to complete is performed as part of the
+ * draw by the firmware, so we just need to execute the WFI.
+ *
+ * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
+ * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.
*/
if (flags &
(VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
+ VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT |
VK_ACCESS_MEMORY_READ_BIT)) {
mask |= TU_ACCESS_WFI_READ;
}
if (flags &
(VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */
- VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP, I think */
VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */
- VK_ACCESS_HOST_READ_BIT | /* sysmem by definition */
+ VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP */
VK_ACCESS_MEMORY_READ_BIT)) {
mask |= TU_ACCESS_SYSMEM_READ;
}
+ if (flags &
+ (VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
+ VK_ACCESS_MEMORY_WRITE_BIT)) {
+ mask |= TU_ACCESS_CP_WRITE;
+ }
+
+ if (flags &
+ (VK_ACCESS_HOST_READ_BIT |
+ VK_ACCESS_MEMORY_WRITE_BIT)) {
+ mask |= TU_ACCESS_HOST_READ;
+ }
+
if (flags &
(VK_ACCESS_HOST_WRITE_BIT |
- VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT | /* Write performed by CP, I think */
VK_ACCESS_MEMORY_WRITE_BIT)) {
- mask |= TU_ACCESS_SYSMEM_WRITE;
+ mask |= TU_ACCESS_HOST_WRITE;
}
if (flags &
tu6_emit_tess_consts(struct tu_cmd_buffer *cmd,
uint32_t draw_count,
const struct tu_pipeline *pipeline,
- struct tu_draw_state *state)
+ struct tu_draw_state *state,
+ uint64_t *factor_iova)
{
struct tu_cs cs;
- VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 20, &cs);
+ VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 16, &cs);
if (result != VK_SUCCESS)
return result;
tu_cs_emit_qw(&cs, tess_param_iova);
tu_cs_emit_qw(&cs, tess_factor_iova);
- tu_cs_emit_pkt4(&cs, REG_A6XX_PC_TESSFACTOR_ADDR_LO, 2);
- tu_cs_emit_qw(&cs, tess_factor_iova);
-
- /* TODO: Without this WFI here, the hardware seems unable to read these
- * addresses we just emitted. Freedreno emits these consts as part of
- * IB1 instead of in a draw state which might make this WFI unnecessary,
- * but it requires a bit more indirection (SS6_INDIRECT for consts). */
- tu_cs_emit_wfi(&cs);
+ *factor_iova = tess_factor_iova;
}
*state = tu_cs_end_draw_state(&cmd->sub_cs, &cs);
return VK_SUCCESS;
pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
struct tu_draw_state tess_consts = {};
if (has_tess) {
+ uint64_t tess_factor_iova = 0;
+
cmd->has_tess = true;
- result = tu6_emit_tess_consts(cmd, draw_count, pipeline, &tess_consts);
+ result = tu6_emit_tess_consts(cmd, draw_count, pipeline, &tess_consts, &tess_factor_iova);
if (result != VK_SUCCESS)
return result;
+
+      /* This sequence matches what the blob does before every tess draw:
+       * PC_TESSFACTOR_ADDR_LO is a non-context register and needs a WFI
+       * before writing to it.
+       */
+ tu_cs_emit_wfi(cs);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESSFACTOR_ADDR_LO, 2);
+ tu_cs_emit_qw(cs, tess_factor_iova);
+
+ tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
+ tu_cs_emit(cs, draw_count);
}
/* for the first draw in a renderpass, re-emit all the draw states
tu_cs_emit(cs, cmd->state.max_index_count);
}
+/* Various firmware bugs/inconsistencies mean that some indirect draw opcodes
+ * do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if
+ * pending for these opcodes. This may result in a few extra WAIT_FOR_ME's
+ * with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's
+ * before draw opcodes that don't need it.
+ */
+static void
+draw_wfm(struct tu_cmd_buffer *cmd)
+{
+   /* Promote any pending WAIT_FOR_ME into flush_bits so it is actually
+    * emitted for this draw, and clear it from the pending set so a later
+    * barrier does not emit it a second time.
+    */
+   cmd->state.renderpass_cache.flush_bits |=
+      cmd->state.renderpass_cache.pending_flush_bits & TU_CMD_FLAG_WAIT_FOR_ME;
+   cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME;
+}
+
void
tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
VkBuffer _buffer,
cmd->state.vs_params = (struct tu_draw_state) {};
- tu6_draw_common(cmd, cs, false, 0);
-
- /* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI, where it
- * doesn't wait for WFIs to be completed and leads to GPU fault/hang
- * TODO: this could be worked around in a more performant way,
- * or there may exist newer firmware that has been fixed
+ /* The latest known a630_sqe.fw fails to wait for WFI before reading the
+ * indirect buffer when using CP_DRAW_INDIRECT_MULTI, so we have to fall
+ * back to CP_WAIT_FOR_ME except for a650 which has a fixed firmware.
+ *
+ * TODO: There may be newer a630_sqe.fw released in the future which fixes
+ * this, if so we should detect it and avoid this workaround.
*/
if (cmd->device->physical_device->gpu_id != 650)
- tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+ draw_wfm(cmd);
+
+ tu6_draw_common(cmd, cs, false, 0);
tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6);
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
tu_cs_emit(cs, stride);
- tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
+ tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
}
void
cmd->state.vs_params = (struct tu_draw_state) {};
- tu6_draw_common(cmd, cs, true, 0);
-
- /* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI, where it
- * doesn't wait for WFIs to be completed and leads to GPU fault/hang
- * TODO: this could be worked around in a more performant way,
- * or there may exist newer firmware that has been fixed
- */
if (cmd->device->physical_device->gpu_id != 650)
- tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+ draw_wfm(cmd);
+
+ tu6_draw_common(cmd, cs, true, 0);
tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9);
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
tu_cs_emit(cs, stride);
- tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
+ tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+}
+
+/* VK_KHR_draw_indirect_count (core in Vulkan 1.2): indirect draw where the
+ * actual draw count is read by the CP from countBuffer at execution time
+ * (clamped to drawCount per the spec).
+ */
+void
+tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
+                        VkBuffer _buffer,
+                        VkDeviceSize offset,
+                        VkBuffer countBuffer,
+                        VkDeviceSize countBufferOffset,
+                        uint32_t drawCount,
+                        uint32_t stride)
+{
+   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+   TU_FROM_HANDLE(tu_buffer, buf, _buffer);
+   TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
+   struct tu_cs *cs = &cmd->draw_cs;
+
+   /* Draw parameters come from the indirect buffer (the CP writes them into
+    * the consts at DST_OFF below), so drop any cached vs_params state.
+    */
+   cmd->state.vs_params = (struct tu_draw_state) {};
+
+   /* It turns out that the firmware we have for a650 only partially fixed the
+    * problem with CP_DRAW_INDIRECT_MULTI not waiting for WFI's to complete
+    * before reading indirect parameters. It waits for WFI's before reading
+    * the draw parameters, but after reading the indirect count :(.
+    */
+   draw_wfm(cmd);
+
+   tu6_draw_common(cmd, cs, false, 0);
+
+   tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 8);
+   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
+   tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT) |
+                  A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
+   tu_cs_emit(cs, drawCount);
+   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
+   tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset);
+   tu_cs_emit(cs, stride);
+
+   /* BO_DUMP so the indirect and count buffers are captured in crash dumps */
+   tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+   tu_bo_list_add(&cmd->bo_list, count_buf->bo, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+}
+
+/* Indexed variant of vkCmdDrawIndirectCount: the CP reads the draw count
+ * from countBuffer at execution time (clamped to drawCount per the spec).
+ */
+void
+tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
+                               VkBuffer _buffer,
+                               VkDeviceSize offset,
+                               VkBuffer countBuffer,
+                               VkDeviceSize countBufferOffset,
+                               uint32_t drawCount,
+                               uint32_t stride)
+{
+   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+   TU_FROM_HANDLE(tu_buffer, buf, _buffer);
+   TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
+   struct tu_cs *cs = &cmd->draw_cs;
+
+   /* Draw parameters come from the indirect buffer, so drop any cached
+    * vs_params state.
+    */
+   cmd->state.vs_params = (struct tu_draw_state) {};
+
+   /* Same workaround as tu_CmdDrawIndirectCount: even a650 firmware reads
+    * the indirect count before waiting for WFI's, so unconditionally promote
+    * a pending WAIT_FOR_ME.
+    */
+   draw_wfm(cmd);
+
+   tu6_draw_common(cmd, cs, true, 0);
+
+   tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 11);
+   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
+   tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT_INDEXED) |
+                  A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
+   tu_cs_emit(cs, drawCount);
+   /* index buffer address/size — presumably set by CmdBindIndexBuffer; TODO confirm */
+   tu_cs_emit_qw(cs, cmd->state.index_va);
+   tu_cs_emit(cs, cmd->state.max_index_count);
+   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
+   tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset);
+   tu_cs_emit(cs, stride);
+
+   /* BO_DUMP so the indirect and count buffers are captured in crash dumps */
+   tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+   tu_bo_list_add(&cmd->bo_list, count_buf->bo, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+}
void tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer);
struct tu_cs *cs = &cmd->draw_cs;
+ /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO.
+ * Plus, for the common case where the counter buffer is written by
+ * vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to
+ * complete which means we need a WAIT_FOR_ME anyway.
+ */
+ draw_wfm(cmd);
+
cmd->state.vs_params = tu6_emit_vs_params(cmd, 0, firstInstance);
tu6_draw_common(cmd, cs, false, 0);