#include <string.h>
#include <unistd.h>
-#include "registers/adreno_pm4.xml.h"
-#include "registers/adreno_common.xml.h"
-#include "registers/a6xx.xml.h"
+#include "adreno_pm4.xml.h"
+#include "adreno_common.xml.h"
+#include "a6xx.xml.h"
#include "nir/nir_builder.h"
#include "util/os_time.h"
#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
/* Common header shared by every query-slot type: the availability flag
 * lives at offset 0 of each slot so generic code can address it without
 * knowing the concrete slot layout. */
struct PACKED query_slot {
   /* 0 when unavailable, 1 when available. */
   uint64_t available;
};

struct PACKED occlusion_slot_value {
   /* Seems sample counters are placed to be 16-byte aligned
    * even though this query needs an 8-byte slot. */
   uint64_t value;
   uint64_t _padding;
};

/* Layout note: the result field immediately follows the common header in
 * every slot type below; query_result_iova()/query_result_addr() rely on
 * that ordering. */
struct PACKED occlusion_query_slot {
   struct query_slot common;
   uint64_t result;

   /* Raw ZPASS_DONE sample counts captured at begin/end; result is
    * computed from end - begin. */
   struct occlusion_slot_value begin;
   struct occlusion_slot_value end;
};

struct PACKED timestamp_query_slot {
   struct query_slot common;
   uint64_t result;
};

/* One primitive counter pair as written by the hardware:
 * presumably values[0] = written, values[1] = generated — matches how
 * emit_end_xfb_query() reads them; TODO confirm against the a6xx docs. */
struct PACKED primitive_slot_value {
   uint64_t values[2];
};

struct PACKED primitive_query_slot {
   struct query_slot common;
   /* The result of transform feedback queries is two integer values:
    * results[0] is the count of primitives written,
    * results[1] is the count of primitives generated.
    * Also a result for each stream is stored at 4 slots respectively.
    */
   uint64_t results[2];

   /* Primitive counters also need to be 16-byte aligned. */
   uint64_t _padding;

   /* Per-stream counter snapshots (4 streams) taken at begin/end. */
   struct primitive_slot_value begin[4];
   struct primitive_slot_value end[4];
};
/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool.  Arguments and expansions are fully parenthesized so the macros
 * remain safe when invoked with expressions such as `firstQuery + i`. */
#define query_iova(type, pool, query, field)                                 \
   ((pool)->bo.iova + (pool)->stride * (query) + offsetof(type, field))

#define occlusion_query_iova(pool, query, field)                             \
   query_iova(struct occlusion_query_slot, pool, query, field)

/* IOVA of values[i] inside one begin/end primitive-counter snapshot. */
#define primitive_query_iova(pool, query, field, i)                          \
   (query_iova(struct primitive_query_slot, pool, query, field) +            \
    offsetof(struct primitive_slot_value, values[i]))

/* IOVA of the availability flag, which is the first field of every slot
 * type (struct query_slot common header). */
#define query_available_iova(pool, query)                                    \
   query_iova(struct query_slot, pool, query, available)

/* IOVA of the i-th 64-bit result value, which immediately follows the
 * common header in every slot type. */
#define query_result_iova(pool, query, i)                                    \
   ((pool)->bo.iova + (pool)->stride * (query) +                             \
    sizeof(struct query_slot) + sizeof(uint64_t) * (i))

/* CPU-visible address of the i-th 64-bit result; mirrors
 * query_result_iova() on the mapped BO. */
#define query_result_addr(pool, query, i)                                    \
   ((pool)->bo.map + (pool)->stride * (query) +                              \
    sizeof(struct query_slot) + sizeof(uint64_t) * (i))

#define query_is_available(slot) ((slot)->available)
/*
* Returns a pointer to a given slot in a query pool.
case VK_QUERY_TYPE_OCCLUSION:
slot_size = sizeof(struct occlusion_query_slot);
break;
- case VK_QUERY_TYPE_PIPELINE_STATISTICS:
case VK_QUERY_TYPE_TIMESTAMP:
+ slot_size = sizeof(struct timestamp_query_slot);
+ break;
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ slot_size = sizeof(struct primitive_query_slot);
+ break;
+ case VK_QUERY_TYPE_PIPELINE_STATISTICS:
unreachable("Unimplemented query type");
default:
assert(!"Invalid query type");
}
struct tu_query_pool *pool =
- vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-
+ vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
+ VK_OBJECT_TYPE_QUERY_POOL);
if (!pool)
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
VkResult result = tu_bo_init_new(device, &pool->bo,
pCreateInfo->queryCount * slot_size);
if (result != VK_SUCCESS) {
- vk_free2(&device->alloc, pAllocator, pool);
+ vk_object_free(&device->vk, pAllocator, pool);
return result;
}
result = tu_bo_map(device, &pool->bo);
if (result != VK_SUCCESS) {
tu_bo_finish(device, &pool->bo);
- vk_free2(&device->alloc, pAllocator, pool);
+ vk_object_free(&device->vk, pAllocator, pool);
return result;
}
return;
tu_bo_finish(device, &pool->bo);
- vk_free2(&device->alloc, pAllocator, pool);
+ vk_object_free(&device->vk, pAllocator, pool);
+}
+
+static uint32_t
+get_result_count(struct tu_query_pool *pool)
+{
+ switch (pool->type) {
+ /* Occulusion and timestamp queries write one integer value */
+ case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_TIMESTAMP:
+ return 1;
+ /* Transform feedback queries write two integer values */
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ return 2;
+ default:
+ assert(!"Invalid query type");
+ return 0;
+ }
}
/* Wait on the the availability status of a query up until a timeout. */
/* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
* scheduler friendly way instead of busy polling once the patch has landed
* upstream. */
- struct occlusion_query_slot *slot = slot_address(pool, query);
+ struct query_slot *slot = slot_address(pool, query);
uint64_t abs_timeout = os_time_get_absolute_timeout(
WAIT_TIMEOUT * NSEC_PER_SEC);
while(os_time_get_nano() < abs_timeout) {
- if (occlusion_query_is_available(slot))
+ if (query_is_available(slot))
return VK_SUCCESS;
}
return vk_error(device->instance, VK_TIMEOUT);
}
static VkResult
-get_occlusion_query_pool_results(struct tu_device *device,
- struct tu_query_pool *pool,
- uint32_t firstQuery,
- uint32_t queryCount,
- size_t dataSize,
- void *pData,
- VkDeviceSize stride,
- VkQueryResultFlags flags)
+get_query_pool_results(struct tu_device *device,
+ struct tu_query_pool *pool,
+ uint32_t firstQuery,
+ uint32_t queryCount,
+ size_t dataSize,
+ void *pData,
+ VkDeviceSize stride,
+ VkQueryResultFlags flags)
{
assert(dataSize >= stride * queryCount);
VkResult result = VK_SUCCESS;
for (uint32_t i = 0; i < queryCount; i++) {
uint32_t query = firstQuery + i;
- struct occlusion_query_slot *slot = slot_address(pool, query);
- bool available = occlusion_query_is_available(slot);
+ struct query_slot *slot = slot_address(pool, query);
+ bool available = query_is_available(slot);
+ uint32_t result_count = get_result_count(pool);
+
if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
VkResult wait_result = wait_for_available(device, pool, query);
if (wait_result != VK_SUCCESS)
}
}
- if (available)
- write_query_value_cpu(result_base, 0, slot->result.value, flags);
- else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
- /* From the Vulkan 1.1.130 spec:
- *
- * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
- * is not set, and the query’s status is unavailable, an
- * intermediate result value between zero and the final result
- * value is written to pData for that query.
- *
- * Just return 0 here for simplicity since it's a valid result.
- */
- write_query_value_cpu(result_base, 0, 0, flags);
+ for (uint32_t k = 0; k < result_count; k++) {
+ if (available) {
+ uint64_t *result = query_result_addr(pool, query, k);
+ write_query_value_cpu(result_base, k, *result, flags);
+ } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
+ /* From the Vulkan 1.1.130 spec:
+ *
+ * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
+ * is not set, and the query’s status is unavailable, an
+ * intermediate result value between zero and the final result
+ * value is written to pData for that query.
+ *
+ * Just return 0 here for simplicity since it's a valid result.
+ */
+ write_query_value_cpu(result_base, k, 0, flags);
+ }
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
/* From the Vulkan 1.1.130 spec:
* integer value written for each query is non-zero if the query’s
* status was available or zero if the status was unavailable.
*/
- write_query_value_cpu(result_base, 1, available, flags);
+ write_query_value_cpu(result_base, result_count, available, flags);
result_base += stride;
}
TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
assert(firstQuery + queryCount <= pool->size);
+ if (tu_device_is_lost(device))
+ return VK_ERROR_DEVICE_LOST;
+
switch (pool->type) {
- case VK_QUERY_TYPE_OCCLUSION: {
- return get_occlusion_query_pool_results(device, pool, firstQuery,
- queryCount, dataSize, pData, stride, flags);
- }
- case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+ case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_TIMESTAMP:
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ return get_query_pool_results(device, pool, firstQuery, queryCount,
+ dataSize, pData, stride, flags);
+ case VK_QUERY_TYPE_PIPELINE_STATISTICS:
unreachable("Unimplemented query type");
default:
assert(!"Invalid query type");
sizeof(uint64_t) : sizeof(uint32_t);
uint64_t write_iova = base_write_iova + (offset * element_size);
- tu_cs_reserve_space(cmdbuf->device, cs, 6);
tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
CP_MEM_TO_MEM_0_DOUBLE : 0;
}
static void
-emit_copy_occlusion_query_pool_results(struct tu_cmd_buffer *cmdbuf,
- struct tu_cs *cs,
- struct tu_query_pool *pool,
- uint32_t firstQuery,
- uint32_t queryCount,
- struct tu_buffer *buffer,
- VkDeviceSize dstOffset,
- VkDeviceSize stride,
- VkQueryResultFlags flags)
+emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
+ struct tu_cs *cs,
+ struct tu_query_pool *pool,
+ uint32_t firstQuery,
+ uint32_t queryCount,
+ struct tu_buffer *buffer,
+ VkDeviceSize dstOffset,
+ VkDeviceSize stride,
+ VkQueryResultFlags flags)
{
/* From the Vulkan 1.1.130 spec:
*
* To ensure that previous writes to the available bit are coherent, first
* wait for all writes to complete.
*/
- tu_cs_reserve_space(cmdbuf->device, cs, 1);
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
for (uint32_t i = 0; i < queryCount; i++) {
uint32_t query = firstQuery + i;
- uint64_t available_iova = occlusion_query_iova(pool, query, available);
- uint64_t result_iova = occlusion_query_iova(pool, query, result);
+ uint64_t available_iova = query_available_iova(pool, query);
uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
+ uint32_t result_count = get_result_count(pool);
+
/* Wait for the available bit to be set if executed with the
* VK_QUERY_RESULT_WAIT_BIT flag. */
if (flags & VK_QUERY_RESULT_WAIT_BIT) {
- tu_cs_reserve_space(cmdbuf->device, cs, 7);
tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
CP_WAIT_REG_MEM_0_POLL_MEMORY);
tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
}
- if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
- /* Unconditionally copying the bo->result into the buffer here is
- * valid because we only set bo->result on vkCmdEndQuery. Thus, even
- * if the query is unavailable, this will copy the correct partial
- * value of 0.
- */
- copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
- 0 /* offset */, flags);
- } else {
- /* Conditionally copy bo->result into the buffer based on whether the
- * query is available.
- *
- * NOTE: For the conditional packets to be executed, CP_COND_EXEC
- * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
- * that 0 < available < 2, aka available == 1.
- */
- tu_cs_reserve_space(cmdbuf->device, cs, 7);
- tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
- tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
-
- /* Start of conditional execution */
- copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
- 0 /* offset */, flags);
- /* End of conditional execution */
+ for (uint32_t k = 0; k < result_count; k++) {
+ uint64_t result_iova = query_result_iova(pool, query, k);
+
+ if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
+ /* Unconditionally copying the bo->result into the buffer here is
+ * valid because we only set bo->result on vkCmdEndQuery. Thus, even
+ * if the query is unavailable, this will copy the correct partial
+ * value of 0.
+ */
+ copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
+ k /* offset */, flags);
+ } else {
+ /* Conditionally copy bo->result into the buffer based on whether the
+ * query is available.
+ *
+ * NOTE: For the conditional packets to be executed, CP_COND_EXEC
+ * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
+ * that 0 < available < 2, aka available == 1.
+ */
+ tu_cs_reserve(cs, 7 + 6);
+ tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
+ tu_cs_emit_qw(cs, available_iova);
+ tu_cs_emit_qw(cs, available_iova);
+ tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
+ tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
+
+ /* Start of conditional execution */
+ copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
+ k /* offset */, flags);
+ /* End of conditional execution */
+ }
}
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
- 1 /* offset */, flags);
+ result_count /* offset */, flags);
}
}
assert(firstQuery + queryCount <= pool->size);
switch (pool->type) {
- case VK_QUERY_TYPE_OCCLUSION: {
- return emit_copy_occlusion_query_pool_results(cmdbuf, cs, pool,
- firstQuery, queryCount, buffer, dstOffset, stride, flags);
- }
- case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+ case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_TIMESTAMP:
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
+ queryCount, buffer, dstOffset, stride, flags);
+ case VK_QUERY_TYPE_PIPELINE_STATISTICS:
unreachable("Unimplemented query type");
default:
assert(!"Invalid query type");
}
static void
-emit_reset_occlusion_query_pool(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t firstQuery,
- uint32_t queryCount)
+emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
+ struct tu_query_pool *pool,
+ uint32_t firstQuery,
+ uint32_t queryCount)
{
struct tu_cs *cs = &cmdbuf->cs;
for (uint32_t i = 0; i < queryCount; i++) {
uint32_t query = firstQuery + i;
- uint64_t available_iova = occlusion_query_iova(pool, query, available);
- uint64_t result_iova = occlusion_query_iova(pool, query, result);
- tu_cs_reserve_space(cmdbuf->device, cs, 11);
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit_qw(cs, 0x0);
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, result_iova);
+ tu_cs_emit_qw(cs, query_available_iova(pool, query));
tu_cs_emit_qw(cs, 0x0);
+
+ for (uint32_t k = 0; k < get_result_count(pool); k++) {
+ tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
+ tu_cs_emit_qw(cs, query_result_iova(pool, query, k));
+ tu_cs_emit_qw(cs, 0x0);
+ }
}
}
TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
switch (pool->type) {
+ case VK_QUERY_TYPE_TIMESTAMP:
case VK_QUERY_TYPE_OCCLUSION:
- emit_reset_occlusion_query_pool(cmdbuf, pool, firstQuery, queryCount);
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
- case VK_QUERY_TYPE_TIMESTAMP:
unreachable("Unimplemented query type");
default:
assert(!"Invalid query type");
uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
- tu_cs_reserve_space(cmdbuf->device, cs, 7);
tu_cs_emit_regs(cs,
A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
tu_cs_emit(cs, ZPASS_DONE);
}
/* Snapshots the hardware primitive counters into the slot's begin[] area.
 *
 * NOTE(review): stream_id is currently unused here — the address always
 * points at begin[0]. Presumably WRITE_PRIMITIVE_COUNTS dumps the counters
 * for all four streams at once starting at the given base address, so one
 * snapshot covers every stream; emit_end_xfb_query() then picks the pair
 * for the requested stream. TODO confirm against the a6xx event docs.
 */
static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   /* Inside a render pass the write must go into the draw stream. */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}
+
void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
*/
emit_begin_occlusion_query(cmdbuf, pool, query);
break;
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ emit_begin_xfb_query(cmdbuf, pool, query, 0);
+ break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
case VK_QUERY_TYPE_TIMESTAMP:
unreachable("Unimplemented query type");
tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
+void
+tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
+ VkQueryPool queryPool,
+ uint32_t query,
+ VkQueryControlFlags flags,
+ uint32_t index)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
+ TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
+ assert(query < pool->size);
+
+ switch (pool->type) {
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ emit_begin_xfb_query(cmdbuf, pool, query, index);
+ break;
+ default:
+ assert(!"Invalid query type");
+ }
+
+ tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
+}
+
static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool,
const struct tu_render_pass *pass = cmdbuf->state.pass;
struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
- uint64_t available_iova = occlusion_query_iova(pool, query, available);
+ uint64_t available_iova = query_available_iova(pool, query);
uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
uint64_t end_iova = occlusion_query_iova(pool, query, end);
- uint64_t result_iova = occlusion_query_iova(pool, query, result);
- tu_cs_reserve_space(cmdbuf->device, cs, 31);
+ uint64_t result_iova = query_result_iova(pool, query, 0);
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
tu_cs_emit_qw(cs, end_iova);
tu_cs_emit_qw(cs, 0xffffffffffffffffull);
*/
cs = &cmdbuf->draw_epilogue_cs;
- tu_cs_reserve_space(cmdbuf->device, cs, 5);
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
tu_cs_emit_qw(cs, available_iova);
tu_cs_emit_qw(cs, 0x1);
}
/* Ends a transform feedback query: snapshots the counters into end[],
 * computes results[] = end - begin for the requested stream on the CP,
 * then flags the slot available. */
static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   /* end_iova is the base of end[]; as with begin, the event presumably
    * writes all four streams' counters at once — TODO confirm. */
   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

   /* Make sure the counter writes have landed before the CP reads them. */
   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives:
    * with DOUBLE + NEG_C this computes, in 64-bit,
    * result = result + end - begin (result starts at 0 after reset). */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives (same end - begin scheme). */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
+
+/* Implement this bit of spec text from section 17.2 "Query Operation":
+ *
+ * If queries are used while executing a render pass instance that has
+ * multiview enabled, the query uses N consecutive query indices in the
+ * query pool (starting at query) where N is the number of bits set in the
+ * view mask in the subpass the query is used in. How the numerical
+ * results of the query are distributed among the queries is
+ * implementation-dependent. For example, some implementations may write
+ * each view’s results to a distinct query, while other implementations
+ * may write the total result to the first query and write zero to the
+ * other queries. However, the sum of the results in all the queries must
+ * accurately reflect the total result of the query summed over all views.
+ * Applications can sum the results from all the queries to compute the
+ * total result.
+ *
+ * Since we execute all views at once, we write zero to the other queries.
+ * Furthermore, because queries must be reset before use, and we set the
+ * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
+ */
+
+static void
+handle_multiview_queries(struct tu_cmd_buffer *cmd,
+ struct tu_query_pool *pool,
+ uint32_t query)
+{
+ if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
+ return;
+
+ unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
+ struct tu_cs *cs = &cmd->draw_epilogue_cs;
+
+ for (uint32_t i = 1; i < views; i++) {
+ tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
+ tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
+ tu_cs_emit_qw(cs, 0x1);
+ }
+}
+
void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
case VK_QUERY_TYPE_OCCLUSION:
emit_end_occlusion_query(cmdbuf, pool, query);
break;
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ emit_end_xfb_query(cmdbuf, pool, query, 0);
+ break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
case VK_QUERY_TYPE_TIMESTAMP:
unreachable("Unimplemented query type");
assert(!"Invalid query type");
}
+ handle_multiview_queries(cmdbuf, pool, query);
+
+ tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
+}
+
+void
+tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
+ VkQueryPool queryPool,
+ uint32_t query,
+ uint32_t index)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
+ TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
+ assert(query < pool->size);
+
+ switch (pool->type) {
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ assert(index <= 4);
+ emit_end_xfb_query(cmdbuf, pool, query, index);
+ break;
+ default:
+ assert(!"Invalid query type");
+ }
+
tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
VkQueryPool queryPool,
uint32_t query)
{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
+
+ tu_bo_list_add(&cmd->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
+
+ /* Inside a render pass, just write the timestamp multiple times so that
+ * the user gets the last one if we use GMEM. There isn't really much
+ * better we can do, and this seems to be what the blob does too.
+ */
+ struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
+
+ /* Stages that will already have been executed by the time the CP executes
+ * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
+ * indirect stage counts as top-of-pipe too.
+ */
+ VkPipelineStageFlags top_of_pipe_flags =
+ VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
+ VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
+
+ if (pipelineStage & ~top_of_pipe_flags) {
+ /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
+ * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
+ * complete.
+ *
+ * Stalling the CP like this is really unfortunate, but I don't think
+ * there's a better solution that allows all 48 bits of precision
+ * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
+ */
+ tu_cs_emit_wfi(cs);
+ }
+
+ tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
+ tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
+ CP_REG_TO_MEM_0_CNT(2) |
+ CP_REG_TO_MEM_0_64B);
+ tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
+
+ /* Only flag availability once the entire renderpass is done, similar to
+ * the begin/end path.
+ */
+ cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;
+
+ tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
+ tu_cs_emit_qw(cs, query_available_iova(pool, query));
+ tu_cs_emit_qw(cs, 0x1);
+
+ /* From the spec for vkCmdWriteTimestamp:
+ *
+ * If vkCmdWriteTimestamp is called while executing a render pass
+ * instance that has multiview enabled, the timestamp uses N consecutive
+ * query indices in the query pool (starting at query) where N is the
+ * number of bits set in the view mask of the subpass the command is
+ * executed in. The resulting query values are determined by an
+ * implementation-dependent choice of one of the following behaviors:
+ *
+ * - The first query is a timestamp value and (if more than one bit is
+ * set in the view mask) zero is written to the remaining queries.
+ * If two timestamps are written in the same subpass, the sum of the
+ * execution time of all views between those commands is the
+ * difference between the first query written by each command.
+ *
+ * - All N queries are timestamp values. If two timestamps are written
+ * in the same subpass, the sum of the execution time of all views
+ * between those commands is the sum of the difference between
+ * corresponding queries written by each command. The difference
+ * between corresponding queries may be the execution time of a
+ * single view.
+ *
+ * We execute all views in the same draw call, so we implement the first
+ * option, the same as regular queries.
+ */
+ handle_multiview_queries(cmd, pool, query);
}