iris: Implement DrawTransformFeedback()
[mesa.git] / src / gallium / drivers / iris / iris_query.c
1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_query.c
25 *
26 * Query object support. This allows measuring various simple statistics
27 * via counters on the GPU, and also implements conditional rendering.
28 */
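/* As a rough illustration of how these hooks get driven (the exact call
 * sequence depends on the gallium frontend), a typical occlusion query
 * looks like:
 *
 *    struct pipe_query *q = ctx->create_query(ctx, PIPE_QUERY_OCCLUSION_COUNTER, 0);
 *    ctx->begin_query(ctx, q);
 *    ... draw calls ...
 *    ctx->end_query(ctx, q);
 *    union pipe_query_result result;
 *    ctx->get_query_result(ctx, q, true, &result);
 *
 * begin/end snapshot a hardware counter into the query's BO; get_query_result
 * (or get_query_result_resource) then turns the two snapshots into one value.
 */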
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "iris_context.h"
39 #include "iris_defines.h"
40 #include "iris_resource.h"
41 #include "iris_screen.h"
42 #include "vulkan/util/vk_util.h"
43
44 #define IA_VERTICES_COUNT 0x2310
45 #define IA_PRIMITIVES_COUNT 0x2318
46 #define VS_INVOCATION_COUNT 0x2320
47 #define HS_INVOCATION_COUNT 0x2300
48 #define DS_INVOCATION_COUNT 0x2308
49 #define GS_INVOCATION_COUNT 0x2328
50 #define GS_PRIMITIVES_COUNT 0x2330
51 #define CL_INVOCATION_COUNT 0x2338
52 #define CL_PRIMITIVES_COUNT 0x2340
53 #define PS_INVOCATION_COUNT 0x2348
54 #define CS_INVOCATION_COUNT 0x2290
55 #define PS_DEPTH_COUNT 0x2350
56
57 #define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
58
59 #define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
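/* MMIO offsets of the statistics and stream-output counter registers above;
 * query snapshots are taken by storing one of these registers to the query
 * BO (ice->vtbl.store_register_mem64, i.e. MI_STORE_REGISTER_MEM).
 */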
60
61 #define MI_MATH (0x1a << 23)
62
63 #define MI_ALU_LOAD 0x080
64 #define MI_ALU_LOADINV 0x480
65 #define MI_ALU_LOAD0 0x081
66 #define MI_ALU_LOAD1 0x481
67 #define MI_ALU_ADD 0x100
68 #define MI_ALU_SUB 0x101
69 #define MI_ALU_AND 0x102
70 #define MI_ALU_OR 0x103
71 #define MI_ALU_XOR 0x104
72 #define MI_ALU_STORE 0x180
73 #define MI_ALU_STOREINV 0x580
74
75 #define MI_ALU_R0 0x00
76 #define MI_ALU_R1 0x01
77 #define MI_ALU_R2 0x02
78 #define MI_ALU_R3 0x03
79 #define MI_ALU_R4 0x04
80 #define MI_ALU_SRCA 0x20
81 #define MI_ALU_SRCB 0x21
82 #define MI_ALU_ACCU 0x31
83 #define MI_ALU_ZF 0x32
84 #define MI_ALU_CF 0x33
85
86 #define _MI_ALU(op, x, y) (((op) << 20) | ((x) << 10) | (y))
87
88 #define _MI_ALU0(op) _MI_ALU(MI_ALU_##op, 0, 0)
89 #define _MI_ALU1(op, x) _MI_ALU(MI_ALU_##op, x, 0)
90 #define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)
91
92 #define MI_ALU0(op) _MI_ALU0(op)
93 #define MI_ALU1(op, x) _MI_ALU1(op, MI_ALU_##x)
94 #define MI_ALU2(op, x, y) _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
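/* Each MI_MATH ALU instruction is a single dword: operation in bits 31:20,
 * operand 1 in bits 19:10, operand 2 in bits 9:0.  For example:
 *
 *    MI_ALU2(LOAD, SRCA, R0) == (0x080 << 20) | (0x20 << 10) | 0x00 == 0x08008000
 *    MI_ALU0(ADD)            == (0x100 << 20)                       == 0x10000000
 */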
95
96 #define emit_lri32 ice->vtbl.load_register_imm32
97 #define emit_lri64 ice->vtbl.load_register_imm64
98 #define emit_lrr32 ice->vtbl.load_register_reg32
99
100 struct iris_query {
101 enum pipe_query_type type;
102 int index;
103
104 bool ready;
105
106 bool stalled;
107
108 uint64_t result;
109
110 struct iris_bo *bo;
111 struct iris_query_snapshots *map;
112
113 int batch_idx;
114 };
115
116 struct iris_query_snapshots {
117 /** iris_render_condition's saved MI_PREDICATE_DATA value. */
118 uint64_t predicate_data;
119
120 /** Have the start/end snapshots landed? */
121 uint64_t snapshots_landed;
122
123 /** Starting and ending counter snapshots */
124 uint64_t start;
125 uint64_t end;
126 };
127
128 struct iris_query_so_overflow {
129 uint64_t predicate_data;
130 uint64_t snapshots_landed;
131
132 struct {
133 uint64_t prim_storage_needed[2];
134 uint64_t num_prims[2];
135 } stream[4];
136 };
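/* A stream-output overflow query compares two counters over the query
 * interval: SO_PRIM_STORAGE_NEEDED (primitives that needed buffer space)
 * and SO_NUM_PRIMS_WRITTEN (primitives actually written).  The stream
 * overflowed iff
 *
 *    (prim_storage_needed[1] - prim_storage_needed[0]) !=
 *    (num_prims[1] - num_prims[0])
 *
 * which stream_overflowed() evaluates on the CPU and
 * calc_overflow_for_stream() evaluates with MI_MATH on the GPU.
 */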
137
138 /**
139 * Is this type of query written by PIPE_CONTROL?
140 */
141 static bool
142 iris_is_query_pipelined(struct iris_query *q)
143 {
144 switch (q->type) {
145 case PIPE_QUERY_OCCLUSION_COUNTER:
146 case PIPE_QUERY_OCCLUSION_PREDICATE:
147 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
148 case PIPE_QUERY_TIMESTAMP:
149 case PIPE_QUERY_TIMESTAMP_DISJOINT:
150 case PIPE_QUERY_TIME_ELAPSED:
151 return true;
152
153 default:
154 return false;
155 }
156 }
157
158 static void
159 mark_available(struct iris_context *ice, struct iris_query *q)
160 {
161 struct iris_batch *batch = &ice->batches[q->batch_idx];
162 unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
163 unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
164
165 if (!iris_is_query_pipelined(q)) {
166 ice->vtbl.store_data_imm64(batch, q->bo, offset, true);
167 } else {
168 /* Order available *after* the query results. */
169 flags |= PIPE_CONTROL_FLUSH_ENABLE;
170 iris_emit_pipe_control_write(batch, flags, q->bo, offset, true);
171 }
172 }
173
174 /**
175 * Write a pipelined counter snapshot to q->bo at the given offset via a PIPE_CONTROL.
176 */
177 static void
178 iris_pipelined_write(struct iris_batch *batch,
179 struct iris_query *q,
180 enum pipe_control_flags flags,
181 unsigned offset)
182 {
183 const struct gen_device_info *devinfo = &batch->screen->devinfo;
184 const unsigned optional_cs_stall =
185 devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
186
187 iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
188 q->bo, offset, 0ull);
189 }
190
191 static void
192 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
193 {
194 struct iris_batch *batch = &ice->batches[q->batch_idx];
195 const struct gen_device_info *devinfo = &batch->screen->devinfo;
196
197 if (!iris_is_query_pipelined(q)) {
198 iris_emit_pipe_control_flush(batch,
199 PIPE_CONTROL_CS_STALL |
200 PIPE_CONTROL_STALL_AT_SCOREBOARD);
201 q->stalled = true;
202 }
203
204 switch (q->type) {
205 case PIPE_QUERY_OCCLUSION_COUNTER:
206 case PIPE_QUERY_OCCLUSION_PREDICATE:
207 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
208 if (devinfo->gen >= 10) {
209 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
210 * bit set prior to programming a PIPE_CONTROL with Write PS Depth
211 * Count sync operation."
212 */
213 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
214 }
215 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
216 PIPE_CONTROL_WRITE_DEPTH_COUNT |
217 PIPE_CONTROL_DEPTH_STALL,
218 offset);
219 break;
220 case PIPE_QUERY_TIME_ELAPSED:
221 case PIPE_QUERY_TIMESTAMP:
222 case PIPE_QUERY_TIMESTAMP_DISJOINT:
223 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
224 PIPE_CONTROL_WRITE_TIMESTAMP,
225 offset);
226 break;
227 case PIPE_QUERY_PRIMITIVES_GENERATED:
228 ice->vtbl.store_register_mem64(batch,
229 q->index == 0 ? CL_INVOCATION_COUNT :
230 SO_PRIM_STORAGE_NEEDED(q->index),
231 q->bo, offset, false);
232 break;
233 case PIPE_QUERY_PRIMITIVES_EMITTED:
234 ice->vtbl.store_register_mem64(batch,
235 SO_NUM_PRIMS_WRITTEN(q->index),
236 q->bo, offset, false);
237 break;
238 case PIPE_QUERY_PIPELINE_STATISTICS: {
239 static const uint32_t index_to_reg[] = {
240 IA_VERTICES_COUNT,
241 IA_PRIMITIVES_COUNT,
242 VS_INVOCATION_COUNT,
243 GS_INVOCATION_COUNT,
244 GS_PRIMITIVES_COUNT,
245 CL_INVOCATION_COUNT,
246 CL_PRIMITIVES_COUNT,
247 PS_INVOCATION_COUNT,
248 HS_INVOCATION_COUNT,
249 DS_INVOCATION_COUNT,
250 CS_INVOCATION_COUNT,
251 };
252 const uint32_t reg = index_to_reg[q->index];
253
254 ice->vtbl.store_register_mem64(batch, reg, q->bo, offset, false);
255 break;
256 }
257 default:
258 assert(false);
259 }
260 }
261
262 static void
263 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
264 {
265 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
266 uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
267
268 iris_emit_pipe_control_flush(batch,
269 PIPE_CONTROL_CS_STALL |
270 PIPE_CONTROL_STALL_AT_SCOREBOARD);
271 for (uint32_t i = 0; i < count; i++) {
272 int s = q->index + i;
273 int g_idx = offsetof(struct iris_query_so_overflow,
274 stream[s].num_prims[end]);
275 int w_idx = offsetof(struct iris_query_so_overflow,
276 stream[s].prim_storage_needed[end]);
277 ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
278 q->bo, g_idx, false);
279 ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
280 q->bo, w_idx, false);
281 }
282 }
283
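/* Convert raw GPU timestamp ticks to nanoseconds.  As a worked example
 * (assuming a 12 MHz timestamp, i.e. timestamp_frequency == 12000000, which
 * is typical of recent Intel parts): 24 ticks scale to
 * 1000000000 * 24 / 12000000 = 2000 ns.
 */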
284 uint64_t
285 iris_timebase_scale(const struct gen_device_info *devinfo,
286 uint64_t gpu_timestamp)
287 {
288 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
289 }
290
291 static uint64_t
292 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
293 {
294 if (time0 > time1) {
295 return (1ULL << TIMESTAMP_BITS) + time1 - time0;
296 } else {
297 return time1 - time0;
298 }
299 }
300
301 static bool
302 stream_overflowed(struct iris_query_so_overflow *so, int s)
303 {
304 return (so->stream[s].prim_storage_needed[1] -
305 so->stream[s].prim_storage_needed[0]) !=
306 (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
307 }
308
309 static void
310 calculate_result_on_cpu(const struct gen_device_info *devinfo,
311 struct iris_query *q)
312 {
313 switch (q->type) {
314 case PIPE_QUERY_OCCLUSION_PREDICATE:
315 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
316 q->result = q->map->end != q->map->start;
317 break;
318 case PIPE_QUERY_TIMESTAMP:
319 case PIPE_QUERY_TIMESTAMP_DISJOINT:
320 /* The timestamp is the single starting snapshot. */
321 q->result = iris_timebase_scale(devinfo, q->map->start);
322 q->result &= (1ull << TIMESTAMP_BITS) - 1;
323 break;
324 case PIPE_QUERY_TIME_ELAPSED:
325 q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
326 q->result = iris_timebase_scale(devinfo, q->result);
327 q->result &= (1ull << TIMESTAMP_BITS) - 1;
328 break;
329 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
330 q->result = stream_overflowed((void *) q->map, q->index);
331 break;
332 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
333 q->result = false;
334 for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
335 q->result |= stream_overflowed((void *) q->map, i);
336 break;
337 case PIPE_QUERY_OCCLUSION_COUNTER:
338 case PIPE_QUERY_PRIMITIVES_GENERATED:
339 case PIPE_QUERY_PRIMITIVES_EMITTED:
340 case PIPE_QUERY_PIPELINE_STATISTICS:
341 default:
342 q->result = q->map->end - q->map->start;
343 break;
344 }
345
346 q->ready = true;
347 }
348
349 static void
350 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
351 unsigned reg_a, unsigned reg_b)
352 {
353 uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
354
355 math[0] = MI_MATH | (5 - 2);
356 math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
357 math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
358 math[3] = _MI_ALU0(ADD);
359 math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
360 }
361
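/* Left-shift a GPR by repeated doubling, since MI_MATH has no shift op:
 * each round adds the value to itself, so e.g. shift == 3 takes
 * x -> 2x -> 4x -> 8x == x << 3.
 */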
362 static void
363 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
364 unsigned src_reg, unsigned shift)
365 {
366 assert(shift > 0);
367
368 int dwords = 1 + 4 * shift;
369
370 uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
371
372 math[0] = MI_MATH | ((1 + 4 * shift) - 2);
373
374 for (unsigned i = 0; i < shift; i++) {
375 unsigned add_src = (i == 0) ? src_reg : dst_reg;
376 math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
377 math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
378 math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
379 math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
380 }
381 }
382
383 /* Emit dwords to multiply GPR0 by N */
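/* This is binary (shift-and-add) multiplication.  For example, N == 6
 * (binary 110, top_bit == 2) unrolls to:
 *
 *    i == 1:  ACCU = R0 + R0 (2x); bit 1 is set, so add R0 again -> R1 = 3x
 *    i == 0:  R0 = R1 + R1 = 6x;   bit 0 is clear, so no extra add
 */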
384 static void
385 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
386 {
387 VK_OUTARRAY_MAKE(out, dw, dw_count);
388
389 #define APPEND_ALU(op, x, y) \
390 vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
391
392 assert(N > 0);
393 unsigned top_bit = 31 - __builtin_clz(N);
394 for (int i = top_bit - 1; i >= 0; i--) {
395 /* We get our initial data in GPR0 and write the final result back to
396 * GPR0, using GPR1 as a scratch register.
397 */
398 unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
399 unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
400
401 /* Shift the current value left by 1 */
402 APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
403 APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
404 APPEND_ALU(ADD, 0, 0);
405
406 if (N & (1 << i)) {
407 /* Store ACCU to R1 and add R0 to R1 */
408 APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
409 APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
410 APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
411 APPEND_ALU(ADD, 0, 0);
412 }
413
414 APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
415 }
416
417 #undef APPEND_ALU
418 }
419
420 static void
421 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
422 {
423 uint32_t num_dwords;
424 build_alu_multiply_gpr0(NULL, &num_dwords, N);
425
426 uint32_t *math = iris_get_command_space(batch, 4 * (num_dwords + 1)); /* ALU dwords + header */
427 math[0] = MI_MATH | ((num_dwords + 1) - 2);
428 build_alu_multiply_gpr0(&math[1], &num_dwords, N);
429 }
430
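/* Compute GPR0 = GPR0 / D (32-bit unsigned) with the usual
 * multiply-by-reciprocal trick, since MI_MATH has no divide.  As an
 * illustrative (not verified) example: for D == 3, util_compute_fast_udiv_info
 * is expected to return multiplier 0xAAAAAAAB and post_shift 1, so the code
 * below effectively computes n / 3 == (n * 0xAAAAAAABull) >> 33.
 */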
431 void
432 iris_math_div32_gpr0(struct iris_context *ice,
433 struct iris_batch *batch,
434 uint32_t D)
435 {
436 /* Zero out the top 32 bits of GPR0 */
437 emit_lri32(batch, CS_GPR(0) + 4, 0);
438
439 if (D == 0) {
440 /* Dividing by zero is invalid, but we should do something, so set GPR0 to 0. */
441 emit_lri32(batch, CS_GPR(0), 0);
442 } else if (util_is_power_of_two_or_zero(D)) {
443 unsigned log2_D = util_logbase2(D);
444 assert(log2_D < 32);
445 /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
446 * the top 32 bits of the result.
447 */
448 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
449 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
450 emit_lri32(batch, CS_GPR(0) + 4, 0);
451 } else {
452 struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
453 assert(m.multiplier <= UINT32_MAX);
454
455 if (m.pre_shift) {
456 /* We right-shift by pre_shift by left-shifting by (32 - pre_shift) and
457 * taking the top 32 bits of the result.
458 */
459 if (m.pre_shift < 32)
460 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
461 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
462 emit_lri32(batch, CS_GPR(0) + 4, 0);
463 }
464
465 /* Do the 32x32 multiply into gpr0 */
466 emit_mul_gpr0(batch, m.multiplier);
467
468 if (m.increment) {
469 /* Increment variant: ((n + 1) * m) == n * m + m, so add the multiplier (via GPR1). */
470 emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
471 emit_lri32(batch, CS_GPR(1) + 4, 0);
472 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
473 }
474
475 /* Shift by 32 */
476 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
477 emit_lri32(batch, CS_GPR(0) + 4, 0);
478
479 if (m.post_shift) {
480 /* We right-shift by post_shift by left-shifting by (32 - post_shift) and
481 * taking the top 32 bits of the result.
482 */
483 if (m.post_shift < 32)
484 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
485 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
486 emit_lri32(batch, CS_GPR(0) + 4, 0);
487 }
488 }
489 }
490
491 /*
492 * GPR0 = (GPR0 == 0) ? 0 : 1;
493 */
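/* (Adding GPR0 to zero sets the ALU zero flag iff GPR0 == 0; STOREINV then
 * writes the complement of ZF to R0, and ANDing with the 1 preloaded into
 * GPR1 clamps the result to exactly 0 or 1.)
 */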
494 static void
495 gpr0_to_bool(struct iris_context *ice)
496 {
497 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
498
499 ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
500
501 static const uint32_t math[] = {
502 MI_MATH | (9 - 2),
503 MI_ALU2(LOAD, SRCA, R0),
504 MI_ALU1(LOAD0, SRCB),
505 MI_ALU0(ADD),
506 MI_ALU2(STOREINV, R0, ZF),
507 MI_ALU2(LOAD, SRCA, R0),
508 MI_ALU2(LOAD, SRCB, R1),
509 MI_ALU0(AND),
510 MI_ALU2(STORE, R0, ACCU),
511 };
512 iris_batch_emit(batch, math, sizeof(math));
513 }
514
515 static void
516 load_overflow_data_to_cs_gprs(struct iris_context *ice,
517 struct iris_query *q,
518 int idx)
519 {
520 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
521
522 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
523 offsetof(struct iris_query_so_overflow,
524 stream[idx].prim_storage_needed[0]));
525 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
526 offsetof(struct iris_query_so_overflow,
527 stream[idx].prim_storage_needed[1]));
528
529 ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
530 offsetof(struct iris_query_so_overflow,
531 stream[idx].num_prims[0]));
532 ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
533 offsetof(struct iris_query_so_overflow,
534 stream[idx].num_prims[1]));
535 }
536
537 /* With (R1, R2) = prim_storage_needed[0, 1] and (R3, R4) = num_prims[0, 1],
538 * as loaded by load_overflow_data_to_cs_gprs(), this computes:
539 *    R3 = R4 - R3;  R1 = R2 - R1;   (written and needed deltas)
540 *    R1 = R3 - R1;                  (their difference)
541 *    R0 = R0 | R1;                  (accumulate; nonzero means overflow)
542 */
543 static void
544 calc_overflow_for_stream(struct iris_context *ice)
545 {
546 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
547 static const uint32_t maths[] = {
548 MI_MATH | (17 - 2),
549 MI_ALU2(LOAD, SRCA, R4),
550 MI_ALU2(LOAD, SRCB, R3),
551 MI_ALU0(SUB),
552 MI_ALU2(STORE, R3, ACCU),
553 MI_ALU2(LOAD, SRCA, R2),
554 MI_ALU2(LOAD, SRCB, R1),
555 MI_ALU0(SUB),
556 MI_ALU2(STORE, R1, ACCU),
557 MI_ALU2(LOAD, SRCA, R3),
558 MI_ALU2(LOAD, SRCB, R1),
559 MI_ALU0(SUB),
560 MI_ALU2(STORE, R1, ACCU),
561 MI_ALU2(LOAD, SRCA, R1),
562 MI_ALU2(LOAD, SRCB, R0),
563 MI_ALU0(OR),
564 MI_ALU2(STORE, R0, ACCU),
565 };
566
567 iris_batch_emit(batch, maths, sizeof(maths));
568 }
569
570 static void
571 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
572 {
573 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
574
575 ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
576
577 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
578 load_overflow_data_to_cs_gprs(ice, q, q->index);
579 calc_overflow_for_stream(ice);
580 } else {
581 for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
582 load_overflow_data_to_cs_gprs(ice, q, i);
583 calc_overflow_for_stream(ice);
584 }
585 }
586
587 gpr0_to_bool(ice);
588 }
589
590 /**
591 * Calculate the result and store it to CS_GPR0.
592 */
593 static void
594 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
595 {
596 struct iris_batch *batch = &ice->batches[q->batch_idx];
597
598 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
599 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
600 overflow_result_to_gpr0(ice, q);
601 return;
602 }
603
604 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
605 offsetof(struct iris_query_snapshots, start));
606 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
607 offsetof(struct iris_query_snapshots, end));
608
609 static const uint32_t math[] = {
610 MI_MATH | (5 - 2),
611 MI_ALU2(LOAD, SRCA, R2),
612 MI_ALU2(LOAD, SRCB, R1),
613 MI_ALU0(SUB),
614 MI_ALU2(STORE, R0, ACCU),
615 };
616 iris_batch_emit(batch, math, sizeof(math));
617
618 if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
619 q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
620 gpr0_to_bool(ice);
621 }
622
623 static struct pipe_query *
624 iris_create_query(struct pipe_context *ctx,
625 unsigned query_type,
626 unsigned index)
627 {
628 struct iris_query *q = calloc(1, sizeof(struct iris_query));
629
630 q->type = query_type;
631 q->index = index;
632
633 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS && q->index == 10)
634 q->batch_idx = IRIS_BATCH_COMPUTE;
635 else
636 q->batch_idx = IRIS_BATCH_RENDER;
637 return (struct pipe_query *) q;
638 }
639
640 static void
641 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
642 {
643 struct iris_query *query = (void *) p_query;
644 iris_bo_unreference(query->bo);
645 free(query);
646 }
647
648
649 static boolean
650 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
651 {
652 struct iris_screen *screen = (void *) ctx->screen;
653 struct iris_context *ice = (void *) ctx;
654 struct iris_query *q = (void *) query;
655
656 iris_bo_unreference(q->bo);
657 q->bo = iris_bo_alloc(screen->bufmgr, "query object", 4096,
658 IRIS_MEMZONE_OTHER);
659 if (!q->bo)
660 return false;
661
662 q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_WRITE | MAP_ASYNC);
663 if (!q->map)
664 return false;
665
666 q->result = 0ull;
667 q->ready = false;
668 q->map->snapshots_landed = false;
669
670 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
671 ice->state.prims_generated_query_active = true;
672 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
673 }
674
675 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
676 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
677 write_overflow_values(ice, q, false);
678 else
679 write_value(ice, q, offsetof(struct iris_query_snapshots, start));
680
681 return true;
682 }
683
684 static bool
685 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
686 {
687 struct iris_context *ice = (void *) ctx;
688 struct iris_query *q = (void *) query;
689
690 if (q->type == PIPE_QUERY_TIMESTAMP) {
691 iris_begin_query(ctx, query);
692 mark_available(ice, q);
693 return true;
694 }
695
696 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
697 ice->state.prims_generated_query_active = false;
698 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
699 }
700
701 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
702 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
703 write_overflow_values(ice, q, true);
704 else
705 write_value(ice, q, offsetof(struct iris_query_snapshots, end));
706 mark_available(ice, q);
707
708 return true;
709 }
710
711 /**
712 * See if the snapshots have landed for a query, and if so, compute the
713 * result and mark it ready. Does not flush (unlike iris_get_query_result).
714 */
715 static void
716 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
717 {
718 struct iris_screen *screen = (void *) ice->ctx.screen;
719 const struct gen_device_info *devinfo = &screen->devinfo;
720
721 if (!q->ready && q->map->snapshots_landed) {
722 calculate_result_on_cpu(devinfo, q);
723 }
724 }
725
726 static boolean
727 iris_get_query_result(struct pipe_context *ctx,
728 struct pipe_query *query,
729 boolean wait,
730 union pipe_query_result *result)
731 {
732 struct iris_context *ice = (void *) ctx;
733 struct iris_query *q = (void *) query;
734 struct iris_screen *screen = (void *) ctx->screen;
735 const struct gen_device_info *devinfo = &screen->devinfo;
736
737 if (!q->ready) {
738 if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
739 iris_batch_flush(&ice->batches[q->batch_idx]);
740
741 if (!q->map->snapshots_landed) {
742 if (wait)
743 iris_bo_wait_rendering(q->bo);
744 else
745 return false;
746 }
747
748 assert(q->map->snapshots_landed);
749 calculate_result_on_cpu(devinfo, q);
750 }
751
752 assert(q->ready);
753
754 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS) {
755 switch (q->index) {
756 case 0:
757 result->pipeline_statistics.ia_vertices = q->result;
758 break;
759 case 1:
760 result->pipeline_statistics.ia_primitives = q->result;
761 break;
762 case 2:
763 result->pipeline_statistics.vs_invocations = q->result;
764 break;
765 case 3:
766 result->pipeline_statistics.gs_invocations = q->result;
767 break;
768 case 4:
769 result->pipeline_statistics.gs_primitives = q->result;
770 break;
771 case 5:
772 result->pipeline_statistics.c_invocations = q->result;
773 break;
774 case 6:
775 result->pipeline_statistics.c_primitives = q->result;
776 break;
777 case 7:
778 result->pipeline_statistics.ps_invocations = q->result;
779 break;
780 case 8:
781 result->pipeline_statistics.hs_invocations = q->result;
782 break;
783 case 9:
784 result->pipeline_statistics.ds_invocations = q->result;
785 break;
786 case 10:
787 result->pipeline_statistics.cs_invocations = q->result;
788 break;
789 }
790 } else {
791 result->u64 = q->result;
792 }
793
794 return true;
795 }
796
797 static void
798 iris_get_query_result_resource(struct pipe_context *ctx,
799 struct pipe_query *query,
800 boolean wait,
801 enum pipe_query_value_type result_type,
802 int index,
803 struct pipe_resource *p_res,
804 unsigned offset)
805 {
806 struct iris_context *ice = (void *) ctx;
807 struct iris_query *q = (void *) query;
808 struct iris_batch *batch = &ice->batches[q->batch_idx];
809 const struct gen_device_info *devinfo = &batch->screen->devinfo;
810 struct iris_resource *res = (void *) p_res;
811 unsigned snapshots_landed_offset =
812 offsetof(struct iris_query_snapshots, snapshots_landed);
813
814 res->bind_history |= PIPE_BIND_QUERY_BUFFER;
815
816 if (index == -1) {
817 /* They're asking for the availability of the result. If we still
818 * have commands queued up which produce the result, submit them
819 * now so that progress happens. Either way, copy the snapshots
820 * landed field to the destination resource.
821 */
822 if (iris_batch_references(batch, q->bo))
823 iris_batch_flush(batch);
824
825 ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
826 q->bo, snapshots_landed_offset,
827 result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
828 return;
829 }
830
831 if (!q->ready && q->map->snapshots_landed) {
832 /* The final snapshots happen to have landed, so let's just compute
833 * the result on the CPU now...
834 */
835 calculate_result_on_cpu(devinfo, q);
836 }
837
838 if (q->ready) {
839 /* We happen to have the result on the CPU, so just copy it. */
840 if (result_type <= PIPE_QUERY_TYPE_U32) {
841 ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
842 q->result);
843 } else {
844 ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
845 q->result);
846 }
847
848 /* Make sure the result lands before they bind the QBO elsewhere
849 * and use the result.
850 */
851 // XXX: Why? i965 doesn't do this.
852 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
853 return;
854 }
855
856 /* Calculate the result to CS_GPR0 */
857 calculate_result_on_gpu(ice, q);
858
859 bool predicated = !wait && !q->stalled;
860
861 if (predicated) {
862 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
863 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
864 snapshots_landed_offset);
865 uint32_t predicate = MI_PREDICATE |
866 MI_PREDICATE_LOADOP_LOADINV |
867 MI_PREDICATE_COMBINEOP_SET |
868 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
869 iris_batch_emit(batch, &predicate, sizeof(uint32_t));
870 }
871
872 if (result_type <= PIPE_QUERY_TYPE_U32) {
873 ice->vtbl.store_register_mem32(batch, CS_GPR(0),
874 iris_resource_bo(p_res),
875 offset, predicated);
876 } else {
877 ice->vtbl.store_register_mem64(batch, CS_GPR(0),
878 iris_resource_bo(p_res),
879 offset, predicated);
880 }
881 }
882
883 static void
884 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
885 {
886 struct iris_context *ice = (void *) ctx;
887
888 if (ice->state.statistics_counters_enabled == enable)
889 return;
890
891 // XXX: most packets aren't paying attention to this yet, because it'd
892 // have to be done dynamically at draw time, which is a pain
893 ice->state.statistics_counters_enabled = enable;
894 ice->state.dirty |= IRIS_DIRTY_CLIP |
895 IRIS_DIRTY_GS |
896 IRIS_DIRTY_RASTER |
897 IRIS_DIRTY_STREAMOUT |
898 IRIS_DIRTY_TCS |
899 IRIS_DIRTY_TES |
900 IRIS_DIRTY_VS |
901 IRIS_DIRTY_WM;
902 }
903
904 static void
905 set_predicate_enable(struct iris_context *ice, bool value)
906 {
907 if (value)
908 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
909 else
910 ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
911 }
912
913 static void
914 set_predicate_for_result(struct iris_context *ice,
915 struct iris_query *q,
916 bool inverted)
917 {
918 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
919
920 /* The CPU doesn't have the query result yet; use hardware predication */
921 ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
922
923 /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
924 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
925 q->stalled = true;
926
927 switch (q->type) {
928 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
929 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
930 overflow_result_to_gpr0(ice, q);
931
932 ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
933 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
934 break;
935 default:
936 /* PIPE_QUERY_OCCLUSION_* */
937 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
938 offsetof(struct iris_query_snapshots, start));
939 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
940 offsetof(struct iris_query_snapshots, end));
941 break;
942 }
943
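/* Roughly: COMPAREOP_SRCS_EQUAL tests SRC0 == SRC1 (for occlusion queries,
 * start == end, i.e. a zero result; for overflow queries, result == 0), and
 * choosing LOADOP_LOAD vs. LOADINV flips that test so the render condition's
 * 'inverted' flag is honored.
 */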
944 uint32_t mi_predicate = MI_PREDICATE |
945 MI_PREDICATE_COMBINEOP_SET |
946 MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
947 (inverted ? MI_PREDICATE_LOADOP_LOAD
948 : MI_PREDICATE_LOADOP_LOADINV);
949 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
950
951 /* We immediately set the predicate on the render batch, as all the
952 * counters come from 3D operations. However, we may need to predicate
953 * a compute dispatch, which executes in a different GEM context and has
954 * a different MI_PREDICATE_DATA register. So, we save the result to
955 * memory and reload it in iris_launch_grid.
956 */
957 unsigned offset = offsetof(struct iris_query_snapshots, predicate_data);
958 ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
959 q->bo, offset, false);
960 ice->state.compute_predicate = q->bo;
961 }
962
963 static void
964 iris_render_condition(struct pipe_context *ctx,
965 struct pipe_query *query,
966 boolean condition,
967 enum pipe_render_cond_flag mode)
968 {
969 struct iris_context *ice = (void *) ctx;
970 struct iris_query *q = (void *) query;
971
972 if (!q) {
973 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
974 return;
975 }
976
977 iris_check_query_no_flush(ice, q);
978
979 if (q->result || q->ready) {
980 set_predicate_enable(ice, (q->result != 0) ^ condition);
981 } else {
982 if (mode == PIPE_RENDER_COND_NO_WAIT ||
983 mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
984 perf_debug(&ice->dbg, "Conditional rendering demoted from "
985 "\"no wait\" to \"wait\".");
986 }
987 set_predicate_for_result(ice, q, condition);
988 }
989 }
990
991 void
992 iris_init_query_functions(struct pipe_context *ctx)
993 {
994 ctx->create_query = iris_create_query;
995 ctx->destroy_query = iris_destroy_query;
996 ctx->begin_query = iris_begin_query;
997 ctx->end_query = iris_end_query;
998 ctx->get_query_result = iris_get_query_result;
999 ctx->get_query_result_resource = iris_get_query_result_resource;
1000 ctx->set_active_query_state = iris_set_active_query_state;
1001 ctx->render_condition = iris_render_condition;
1002 }