turnip: Implement vkCmdEndQuery for occlusion queries
[mesa.git] / src / freedreno / vulkan / tu_query.c
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"

#include "tu_cs.h"

/* It seems like sample counts need to be copied over to 16-byte aligned
 * memory. */
struct PACKED slot_value {
   uint64_t value;
   uint64_t __padding;
};

struct PACKED occlusion_query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value begin;
   struct slot_value end;
   struct slot_value result;
};
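
/* Resulting layout of one occlusion query slot, 64 bytes total: available at
 * byte 0, begin at 16, end at 32, result at 48. */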

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field)                             \
   ((pool)->bo.iova + (pool)->stride * (query) + offsetof(type, field) + \
    offsetof(struct slot_value, value))

#define occlusion_query_iova(pool, query, field) \
   query_iova(struct occlusion_query_slot, pool, query, field)
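
/* For example, occlusion_query_iova(pool, 2, end) yields the GPU address of
 * slot 2's end.value field. */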

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}
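
/* TODO: not implemented yet; nothing consults the available bit that the GPU
 * writes, and no results are copied back to the host. */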
VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   return VK_SUCCESS;
}
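
/* TODO: not implemented yet; this should emit GPU copies of each slot's
 * result into dstBuffer (e.g. with CP_MEM_TO_MEM, as used below for the
 * result accumulation). */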
void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
}
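
/* TODO: not implemented yet; resetting should clear each slot's available
 * bit and accumulated result so the queries can be reused. */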
void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_reserve_space(cmdbuf->device, cs, 7);
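   /* Setting RB_SAMPLE_COUNT_CONTROL.copy makes the ZPASS_DONE event below
    * write the current sample counter to the address in
    * RB_SAMPLE_COUNT_ADDR, i.e. into slot->begin. */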
   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
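
   /* The GPU writes query state into the pool's backing BO, so add it to the
    * submit's BO list with the WRITE flag. */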
   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, use CP_WAIT_REG_MEM to wait until
    *       slot->end no longer equals UINT64_MAX before continuing.
    *    4) Accumulate the results of the query (slot->end - slot->begin)
    *       into slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a
    *       render pass, set the slot's available bit since the query is now
    *       done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark it available yet, since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = occlusion_query_iova(pool, query, result);

   tu_cs_reserve_space(cmdbuf->device, cs, 31);
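   /* (1) Seed slot->end with the UINT64_MAX sentinel. */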
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
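
   /* (2) The ZPASS_DONE event below makes the HW copy the current sample
    * count into slot->end. */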
   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
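
   /* (3) Poll slot->end until it no longer holds the sentinel, i.e. until
    * the sample count from (2) has actually landed in memory. */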
   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* (4) result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
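
   /* (5)/(6) Only mark the query available here when we are outside a render
    * pass; inside one, draw_cs has not been executed yet. */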
   if (!cmdbuf->state.pass) {
      tu_cs_reserve_space(cmdbuf->device, cs, 5);
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, occlusion_query_iova(pool, query, available));
      tu_cs_emit_qw(cs, 0x1);
   }
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
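
/* TODO: not implemented yet; this should record a GPU timestamp for the
 * given pipeline stage into the query slot. */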
void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
}