#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
-/* It seems like sample counts need to be copied over to 16-byte aligned
- * memory. */
+/* Depending on the query type, a slot may hold one or two 64-bit values,
+ * e.g. for VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ * values[0]: primitives written, values[1]: primitives generated
+ */
struct PACKED slot_value {
- uint64_t value;
- uint64_t __padding;
+ uint64_t values[2];
};
struct PACKED query_slot {
-/* Returns the IOVA of a given uint64_t field in a given slot of a query
- * pool. */
+/* Returns the IOVA of a given value within a given field in a given slot
+ * of a query pool. */
-#define query_iova(type, pool, query, field) \
+#define query_iova(type, pool, query, field, value_index) \
pool->bo.iova + pool->stride * query + offsetof(type, field) + \
- offsetof(struct slot_value, value)
+ offsetof(struct slot_value, values[value_index])
#define occlusion_query_iova(pool, query, field) \
- query_iova(struct occlusion_query_slot, pool, query, field)
+ query_iova(struct occlusion_query_slot, pool, query, field, 0)
#define query_available_iova(pool, query) \
- query_iova(struct query_slot, pool, query, available)
+ query_iova(struct query_slot, pool, query, available, 0)
-#define query_result_iova(pool, query) \
- query_iova(struct query_slot, pool, query, result)
+#define query_result_iova(pool, query, i) \
+ query_iova(struct query_slot, pool, query, result, i)
-#define query_is_available(slot) slot->available.value
+#define query_is_available(slot) slot->available.values[0]
/*
* Returns a pointer to a given slot in a query pool.
vk_free2(&device->alloc, pAllocator, pool);
}
+static uint32_t
+get_result_count(struct tu_query_pool *pool)
+{
+ switch (pool->type) {
+ /* Occlusion and timestamp queries write one integer value */
+ case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_TIMESTAMP:
+ return 1;
+ /* Transform feedback queries write two integer values */
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ return 2;
+ default:
+ assert(!"Invalid query type");
+ return 0;
+ }
+}
+
-/* Wait on the the availability status of a query up until a timeout. */
+/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
uint32_t query = firstQuery + i;
struct query_slot *slot = slot_address(pool, query);
bool available = query_is_available(slot);
+ uint32_t result_count = get_result_count(pool);
+
if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
VkResult wait_result = wait_for_available(device, pool, query);
if (wait_result != VK_SUCCESS)
}
}
- if (available)
- write_query_value_cpu(result_base, 0, slot->result.value, flags);
- else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
- /* From the Vulkan 1.1.130 spec:
- *
- * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
- * is not set, and the query’s status is unavailable, an
- * intermediate result value between zero and the final result
- * value is written to pData for that query.
- *
- * Just return 0 here for simplicity since it's a valid result.
- */
- write_query_value_cpu(result_base, 0, 0, flags);
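+ /* Write out each of this query's result values; transform feedback
+  * queries produce two, all other supported types produce one. */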
+ for (uint32_t k = 0; k < result_count; k++) {
+ if (available)
+ write_query_value_cpu(result_base, k, slot->result.values[k], flags);
+ else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
+ /* From the Vulkan 1.1.130 spec:
+ *
+ * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
+ * is not set, and the query’s status is unavailable, an
+ * intermediate result value between zero and the final result
+ * value is written to pData for that query.
+ *
+ * Just return 0 here for simplicity since it's a valid result.
+ */
+ write_query_value_cpu(result_base, k, 0, flags);
+ }
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
/* From the Vulkan 1.1.130 spec:
 *
 *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
 *    integer value written for each query is non-zero if the query’s
 *    status was available or zero if the status was unavailable.
 */
- write_query_value_cpu(result_base, 1, available, flags);
+ write_query_value_cpu(result_base, result_count, available, flags);
result_base += stride;
}
for (uint32_t i = 0; i < queryCount; i++) {
uint32_t query = firstQuery + i;
uint64_t available_iova = query_available_iova(pool, query);
- uint64_t result_iova = query_result_iova(pool, query);
uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
+ uint32_t result_count = get_result_count(pool);
/* Wait for the available bit to be set if executed with the
* VK_QUERY_RESULT_WAIT_BIT flag. */
tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
}
- if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
- /* Unconditionally copying the bo->result into the buffer here is
- * valid because we only set bo->result on vkCmdEndQuery. Thus, even
- * if the query is unavailable, this will copy the correct partial
- * value of 0.
- */
- copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
- 0 /* offset */, flags);
- } else {
- /* Conditionally copy bo->result into the buffer based on whether the
- * query is available.
- *
- * NOTE: For the conditional packets to be executed, CP_COND_EXEC
- * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
- * that 0 < available < 2, aka available == 1.
- */
- tu_cs_reserve(cs, 7 + 6);
- tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
- tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
-
- /* Start of conditional execution */
- copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
- 0 /* offset */, flags);
- /* End of conditional execution */
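+ /* Copy each of this query's result values into the destination
+  * buffer, one buffer element per value. */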
+ for (uint32_t k = 0; k < result_count; k++) {
+ uint64_t result_iova = query_result_iova(pool, query, k);
+
+ if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
+ /* Unconditionally copying the bo->result into the buffer here is
+ * valid because we only set bo->result on vkCmdEndQuery. Thus, even
+ * if the query is unavailable, this will copy the correct partial
+ * value of 0.
+ */
+ copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
+ k /* offset */, flags);
+ } else {
+ /* Conditionally copy bo->result into the buffer based on whether the
+ * query is available.
+ *
+ * NOTE: For the conditional packets to be executed, CP_COND_EXEC
+ * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
+ * that 0 < available < 2, aka available == 1.
+ */
+ tu_cs_reserve(cs, 7 + 6);
+ tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
+ tu_cs_emit_qw(cs, available_iova);
+ tu_cs_emit_qw(cs, available_iova);
+ tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
+ tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
+
+ /* Start of conditional execution */
+ copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
+ k /* offset */, flags);
+ /* End of conditional execution */
+ }
}
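+ /* As in the CPU path, the availability value lands right after the
+  * result values, i.e. at value index result_count. */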
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
- 1 /* offset */, flags);
+ result_count /* offset */, flags);
}
}
tu_cs_emit_qw(cs, 0x0);
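+ /* Zero out both result values; only transform feedback queries use
+  * values[1], but clearing it unconditionally is harmless. */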
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, query_result_iova(pool, query));
+ tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
+ tu_cs_emit_qw(cs, 0x0);
+ tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
+ tu_cs_emit_qw(cs, query_result_iova(pool, query, 1));
tu_cs_emit_qw(cs, 0x0);
}
}
uint64_t available_iova = query_available_iova(pool, query);
uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
uint64_t end_iova = occlusion_query_iova(pool, query, end);
- uint64_t result_iova = query_result_iova(pool, query);
+ uint64_t result_iova = query_result_iova(pool, query, 0);
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
tu_cs_emit_qw(cs, end_iova);
tu_cs_emit_qw(cs, 0xffffffffffffffffull);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
CP_REG_TO_MEM_0_CNT(2) |
CP_REG_TO_MEM_0_64B);
- tu_cs_emit_qw(cs, query_result_iova(pool, query));
+ tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
tu_cs_emit_qw(cs, query_available_iova(pool, query));