iris: Add a more long term TODO about timebase scaling
1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_query.c
25 *
26 * Query object support. This allows measuring various simple statistics
27 * via counters on the GPU.
28 */
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "iris_context.h"
39 #include "iris_defines.h"
40 #include "iris_resource.h"
41 #include "iris_screen.h"
42 #include "vulkan/util/vk_util.h"
43
44 #define IA_VERTICES_COUNT 0x2310
45 #define IA_PRIMITIVES_COUNT 0x2318
46 #define VS_INVOCATION_COUNT 0x2320
47 #define HS_INVOCATION_COUNT 0x2300
48 #define DS_INVOCATION_COUNT 0x2308
49 #define GS_INVOCATION_COUNT 0x2328
50 #define GS_PRIMITIVES_COUNT 0x2330
51 #define CL_INVOCATION_COUNT 0x2338
52 #define CL_PRIMITIVES_COUNT 0x2340
53 #define PS_INVOCATION_COUNT 0x2348
54 #define CS_INVOCATION_COUNT 0x2290
55 #define PS_DEPTH_COUNT 0x2350
56
57 #define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
58
59 #define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
60
61 #define MI_MATH (0x1a << 23)
62
63 #define MI_ALU_LOAD 0x080
64 #define MI_ALU_LOADINV 0x480
65 #define MI_ALU_LOAD0 0x081
66 #define MI_ALU_LOAD1 0x481
67 #define MI_ALU_ADD 0x100
68 #define MI_ALU_SUB 0x101
69 #define MI_ALU_AND 0x102
70 #define MI_ALU_OR 0x103
71 #define MI_ALU_XOR 0x104
72 #define MI_ALU_STORE 0x180
73 #define MI_ALU_STOREINV 0x580
74
75 #define MI_ALU_R0 0x00
76 #define MI_ALU_R1 0x01
77 #define MI_ALU_R2 0x02
78 #define MI_ALU_R3 0x03
79 #define MI_ALU_R4 0x04
80 #define MI_ALU_SRCA 0x20
81 #define MI_ALU_SRCB 0x21
82 #define MI_ALU_ACCU 0x31
83 #define MI_ALU_ZF 0x32
84 #define MI_ALU_CF 0x33
85
86 #define _MI_ALU(op, x, y) (((op) << 20) | ((x) << 10) | (y))
87
88 #define _MI_ALU0(op) _MI_ALU(MI_ALU_##op, 0, 0)
89 #define _MI_ALU1(op, x) _MI_ALU(MI_ALU_##op, x, 0)
90 #define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)
91
92 #define MI_ALU0(op) _MI_ALU0(op)
93 #define MI_ALU1(op, x) _MI_ALU1(op, MI_ALU_##x)
94 #define MI_ALU2(op, x, y) _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
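/* As a quick illustration of how these macros pack an ALU instruction dword:
 * MI_ALU2(LOAD, SRCA, R0) expands to _MI_ALU(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_R0),
 * i.e. (0x080 << 20) | (0x20 << 10) | 0x00 == 0x08008000.
 */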
95
96 #define emit_lri32 ice->vtbl.load_register_imm32
97 #define emit_lri64 ice->vtbl.load_register_imm64
98 #define emit_lrr32 ice->vtbl.load_register_reg32
99
100 struct iris_query {
101 enum pipe_query_type type;
102 int index;
103
104    bool ready;            /* the result has been computed and q->result is valid */
105
106    bool stalled;          /* a stall/flush was emitted, so later GPU commands see the snapshots */
107
108    uint64_t result;       /* accumulated result, valid once ready is set */
109
110    struct iris_bo *bo;                  /* buffer holding the snapshots */
111    struct iris_query_snapshots *map;    /* CPU mapping of bo */
112
113    int batch_idx;         /* IRIS_BATCH_RENDER or IRIS_BATCH_COMPUTE */
114 };
115
116 struct iris_query_snapshots {
117 /** iris_render_condition's saved MI_PREDICATE_DATA value. */
118 uint64_t predicate_data;
119
120 /** Have the start/end snapshots landed? */
121 uint64_t snapshots_landed;
122
123 /** Starting and ending counter snapshots */
124 uint64_t start;
125 uint64_t end;
126 };
127
128 struct iris_query_so_overflow {
129 uint64_t predicate_data;
130 uint64_t snapshots_landed;
131
132 struct {
133 uint64_t prim_storage_needed[2];
134 uint64_t num_prims[2];
135 } stream[4];
136 };
137
138 /**
139 * Is this type of query written by PIPE_CONTROL?
140 */
141 static bool
142 iris_is_query_pipelined(struct iris_query *q)
143 {
144 switch (q->type) {
145 case PIPE_QUERY_OCCLUSION_COUNTER:
146 case PIPE_QUERY_OCCLUSION_PREDICATE:
147 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
148 case PIPE_QUERY_TIMESTAMP:
149 case PIPE_QUERY_TIMESTAMP_DISJOINT:
150 case PIPE_QUERY_TIME_ELAPSED:
151 return true;
152
153 default:
154 return false;
155 }
156 }
157
158 static void
159 mark_available(struct iris_context *ice, struct iris_query *q)
160 {
161 struct iris_batch *batch = &ice->batches[q->batch_idx];
162 unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
163 unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
164
165 if (!iris_is_query_pipelined(q)) {
166 ice->vtbl.store_data_imm64(batch, q->bo, offset, true);
167 } else {
168 /* Order available *after* the query results. */
169 flags |= PIPE_CONTROL_FLUSH_ENABLE;
170 iris_emit_pipe_control_write(batch, flags, q->bo, offset, true);
171 }
172 }
173
174 /**
175  * Write a pipelined counter snapshot (PS_DEPTH_COUNT or a timestamp) to q->bo at the given offset via a PIPE_CONTROL.
176 */
177 static void
178 iris_pipelined_write(struct iris_batch *batch,
179 struct iris_query *q,
180 enum pipe_control_flags flags,
181 unsigned offset)
182 {
183 const struct gen_device_info *devinfo = &batch->screen->devinfo;
184 const unsigned optional_cs_stall =
185 devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
186
187 iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
188 q->bo, offset, 0ull);
189 }
190
191 static void
192 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
193 {
194 struct iris_batch *batch = &ice->batches[q->batch_idx];
195 const struct gen_device_info *devinfo = &batch->screen->devinfo;
196
197 if (!iris_is_query_pipelined(q)) {
198 iris_emit_pipe_control_flush(batch,
199 PIPE_CONTROL_CS_STALL |
200 PIPE_CONTROL_STALL_AT_SCOREBOARD);
201 q->stalled = true;
202 }
203
204 switch (q->type) {
205 case PIPE_QUERY_OCCLUSION_COUNTER:
206 case PIPE_QUERY_OCCLUSION_PREDICATE:
207 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
208 if (devinfo->gen >= 10) {
209 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
210 * bit set prior to programming a PIPE_CONTROL with Write PS Depth
211 * Count sync operation."
212 */
213 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
214 }
215 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
216 PIPE_CONTROL_WRITE_DEPTH_COUNT |
217 PIPE_CONTROL_DEPTH_STALL,
218 offset);
219 break;
220 case PIPE_QUERY_TIME_ELAPSED:
221 case PIPE_QUERY_TIMESTAMP:
222 case PIPE_QUERY_TIMESTAMP_DISJOINT:
223 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
224 PIPE_CONTROL_WRITE_TIMESTAMP,
225 offset);
226 break;
227 case PIPE_QUERY_PRIMITIVES_GENERATED:
228 ice->vtbl.store_register_mem64(batch,
229 q->index == 0 ? CL_INVOCATION_COUNT :
230 SO_PRIM_STORAGE_NEEDED(q->index),
231 q->bo, offset, false);
232 break;
233 case PIPE_QUERY_PRIMITIVES_EMITTED:
234 ice->vtbl.store_register_mem64(batch,
235 SO_NUM_PRIMS_WRITTEN(q->index),
236 q->bo, offset, false);
237 break;
238 case PIPE_QUERY_PIPELINE_STATISTICS: {
239 static const uint32_t index_to_reg[] = {
240 IA_VERTICES_COUNT,
241 IA_PRIMITIVES_COUNT,
242 VS_INVOCATION_COUNT,
243 GS_INVOCATION_COUNT,
244 GS_PRIMITIVES_COUNT,
245 CL_INVOCATION_COUNT,
246 CL_PRIMITIVES_COUNT,
247 PS_INVOCATION_COUNT,
248 HS_INVOCATION_COUNT,
249 DS_INVOCATION_COUNT,
250 CS_INVOCATION_COUNT,
251 };
252 const uint32_t reg = index_to_reg[q->index];
253
254 ice->vtbl.store_register_mem64(batch, reg, q->bo, offset, false);
255 break;
256 }
257 default:
258 assert(false);
259 }
260 }
261
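/* For SO overflow queries we snapshot both SO_NUM_PRIMS_WRITTEN and
 * SO_PRIM_STORAGE_NEEDED for each stream being tracked, once at begin time
 * (end = false) and once at end time (end = true).  A stream overflowed if
 * the two counters advanced by different amounts (see stream_overflowed()).
 */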
262 static void
263 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
264 {
265 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
266 uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
267
268 iris_emit_pipe_control_flush(batch,
269 PIPE_CONTROL_CS_STALL |
270 PIPE_CONTROL_STALL_AT_SCOREBOARD);
271 for (uint32_t i = 0; i < count; i++) {
272 int s = q->index + i;
273 int g_idx = offsetof(struct iris_query_so_overflow,
274 stream[s].num_prims[end]);
275 int w_idx = offsetof(struct iris_query_so_overflow,
276 stream[s].prim_storage_needed[end]);
277 ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
278 q->bo, g_idx, false);
279 ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
280 q->bo, w_idx, false);
281 }
282 }
283
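/* Convert a raw GPU timestamp (in ticks) to nanoseconds.  For illustration,
 * assuming a 12 MHz timestamp frequency, each tick is
 * 1000000000 / 12000000 = ~83.3 ns, and the integer math below truncates the
 * fractional part of every conversion.
 */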
284 uint64_t
285 iris_timebase_scale(const struct gen_device_info *devinfo,
286 uint64_t gpu_timestamp)
287 {
288 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
289 }
290
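/* Compute the difference between two raw timestamp snapshots.  The hardware
 * counter is only TIMESTAMP_BITS wide and wraps around, so if the end value
 * is numerically smaller than the start we assume a single wrap occurred and
 * add 2^TIMESTAMP_BITS back in.
 */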
291 static uint64_t
292 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
293 {
294 if (time0 > time1) {
295 return (1ULL << TIMESTAMP_BITS) + time1 - time0;
296 } else {
297 return time1 - time0;
298 }
299 }
300
301 static bool
302 stream_overflowed(struct iris_query_so_overflow *so, int s)
303 {
304 return (so->stream[s].prim_storage_needed[1] -
305 so->stream[s].prim_storage_needed[0]) !=
306 (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
307 }
308
309 static void
310 calculate_result_on_cpu(const struct gen_device_info *devinfo,
311 struct iris_query *q)
312 {
313 switch (q->type) {
314 case PIPE_QUERY_OCCLUSION_PREDICATE:
315 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
316 q->result = q->map->end != q->map->start;
317 break;
318 case PIPE_QUERY_TIMESTAMP:
319 case PIPE_QUERY_TIMESTAMP_DISJOINT:
320 /* The timestamp is the single starting snapshot. */
321 q->result = iris_timebase_scale(devinfo, q->map->start);
322 q->result &= (1ull << TIMESTAMP_BITS) - 1;
323 break;
324 case PIPE_QUERY_TIME_ELAPSED:
325 q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
326 q->result = iris_timebase_scale(devinfo, q->result);
327 q->result &= (1ull << TIMESTAMP_BITS) - 1;
328 break;
329 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
330 q->result = stream_overflowed((void *) q->map, q->index);
331 break;
332 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
333 q->result = false;
334 for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
335 q->result |= stream_overflowed((void *) q->map, i);
336 break;
337 case PIPE_QUERY_OCCLUSION_COUNTER:
338 case PIPE_QUERY_PRIMITIVES_GENERATED:
339 case PIPE_QUERY_PRIMITIVES_EMITTED:
340 case PIPE_QUERY_PIPELINE_STATISTICS:
341 default:
342 q->result = q->map->end - q->map->start;
343 break;
344 }
345
346 q->ready = true;
347 }
348
349 static void
350 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
351 unsigned reg_a, unsigned reg_b)
352 {
353 uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
354
355 math[0] = MI_MATH | (5 - 2);
356 math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
357 math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
358 math[3] = _MI_ALU0(ADD);
359 math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
360 }
361
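/* Left-shift a GPR using the MI ALU.  The ALU as used here has no shift
 * operation (only the LOAD/ADD/SUB/AND/OR/XOR/STORE variants defined above),
 * so a shift left by N is emitted as N successive self-additions: each ADD of
 * a register to itself doubles it, i.e. shifts it left by one bit.
 */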
362 static void
363 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
364 unsigned src_reg, unsigned shift)
365 {
366 assert(shift > 0);
367
368 int dwords = 1 + 4 * shift;
369
370 uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
371
372 math[0] = MI_MATH | ((1 + 4 * shift) - 2);
373
374 for (unsigned i = 0; i < shift; i++) {
375 unsigned add_src = (i == 0) ? src_reg : dst_reg;
376 math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
377 math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
378 math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
379 math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
380 }
381 }
382
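/* Multiply GPR0 by a constant using the classic double-and-add scheme on the
 * MI ALU.  Walking the bits of N from the top down, the running value is
 * doubled at each step, and the original value (kept in GPR0) is added in
 * whenever the corresponding bit of N is set.  For example, for N = 5
 * (binary 101): double, double again, then add the original, giving 5 * R0.
 */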
383 /* Emit dwords to multiply GPR0 by N */
384 static void
385 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
386 {
387 VK_OUTARRAY_MAKE(out, dw, dw_count);
388
389 #define APPEND_ALU(op, x, y) \
390 vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
391
392 assert(N > 0);
393 unsigned top_bit = 31 - __builtin_clz(N);
394 for (int i = top_bit - 1; i >= 0; i--) {
395 /* We get our initial data in GPR0 and we write the final data out to
396 * GPR0 but we use GPR1 as our scratch register.
397 */
398 unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
399 unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
400
401 /* Shift the current value left by 1 */
402 APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
403 APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
404 APPEND_ALU(ADD, 0, 0);
405
406 if (N & (1 << i)) {
407 /* Store ACCU to R1 and add R0 to R1 */
408 APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
409 APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
410 APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
411 APPEND_ALU(ADD, 0, 0);
412 }
413
414 APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
415 }
416
417 #undef APPEND_ALU
418 }
419
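/* Emit an MI_MATH that multiplies GPR0 by the constant N.  The helper above
 * is called twice: a first pass with a NULL buffer just counts how many ALU
 * dwords are needed (via the vk_outarray helper), and a second pass fills
 * them in after the MI_MATH header.
 */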
420 static void
421 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
422 {
423 uint32_t num_dwords;
424 build_alu_multiply_gpr0(NULL, &num_dwords, N);
425
426 uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
427 math[0] = MI_MATH | (num_dwords - 2);
428 build_alu_multiply_gpr0(&math[1], &num_dwords, N);
429 }
430
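/* Divide GPR0 by the constant D using the usual multiply-and-shift trick.
 * Roughly, util_compute_fast_udiv_info() picks multiplier/pre_shift/
 * post_shift/increment such that
 *
 *    n / D == (((n >> pre_shift) + increment) * multiplier) >> (32 + post_shift)
 *
 * which is what the non-power-of-two path below emits with MI ALU operations
 * (the increment is folded in by adding the multiplier after the multiply,
 * since (n + 1) * m == n * m + m).
 */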
431 void
432 iris_math_div32_gpr0(struct iris_context *ice,
433 struct iris_batch *batch,
434 uint32_t D)
435 {
436 /* Zero out the top of GPR0 */
437 emit_lri32(batch, CS_GPR(0) + 4, 0);
438
439 if (D == 0) {
440       /* Dividing by zero is invalid, but we should do something, so set GPR0 to 0. */
441 emit_lri32(batch, CS_GPR(0), 0);
442 } else if (util_is_power_of_two_or_zero(D)) {
443 unsigned log2_D = util_logbase2(D);
444 assert(log2_D < 32);
445 /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
446 * the top 32 bits of the result.
447 */
448 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
449 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
450 emit_lri32(batch, CS_GPR(0) + 4, 0);
451 } else {
452 struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
453 assert(m.multiplier <= UINT32_MAX);
454
455 if (m.pre_shift) {
456          /* We right-shift by m.pre_shift by left-shifting by 32 - m.pre_shift
457           * and taking the top 32 bits of the result.
458 */
459 if (m.pre_shift < 32)
460 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
461 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
462 emit_lri32(batch, CS_GPR(0) + 4, 0);
463 }
464
465 /* Do the 32x32 multiply into gpr0 */
466 emit_mul_gpr0(batch, m.multiplier);
467
468 if (m.increment) {
469          /* Fold in the increment by adding the multiplier: n * m + m == (n + 1) * m. */
470 emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
471 emit_lri32(batch, CS_GPR(1) + 4, 0);
472 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
473 }
474
475 /* Shift by 32 */
476 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
477 emit_lri32(batch, CS_GPR(0) + 4, 0);
478
479 if (m.post_shift) {
480          /* We right-shift by m.post_shift by left-shifting by 32 - m.post_shift
481           * and taking the top 32 bits of the result.
482 */
483 if (m.post_shift < 32)
484 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
485 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
486 emit_lri32(batch, CS_GPR(0) + 4, 0);
487 }
488 }
489 }
490
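/* A note on the MI_MATH sequence below: adding R0 + 0 exists only to set the
 * ALU zero flag; STOREINV then writes the bitwise inverse of ZF into R0, and
 * the final AND against R1 (preloaded with 1) clamps the result to exactly
 * 0 or 1.
 */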
491 /*
492 * GPR0 = (GPR0 == 0) ? 0 : 1;
493 */
494 static void
495 gpr0_to_bool(struct iris_context *ice)
496 {
497 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
498
499 ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
500
501 static const uint32_t math[] = {
502 MI_MATH | (9 - 2),
503 MI_ALU2(LOAD, SRCA, R0),
504 MI_ALU1(LOAD0, SRCB),
505 MI_ALU0(ADD),
506 MI_ALU2(STOREINV, R0, ZF),
507 MI_ALU2(LOAD, SRCA, R0),
508 MI_ALU2(LOAD, SRCB, R1),
509 MI_ALU0(AND),
510 MI_ALU2(STORE, R0, ACCU),
511 };
512 iris_batch_emit(batch, math, sizeof(math));
513 }
514
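/* Load the four overflow snapshots for the given stream into CS GPRs:
 * GPR1/GPR2 get the begin/end SO_PRIM_STORAGE_NEEDED values and GPR3/GPR4
 * get the begin/end SO_NUM_PRIMS_WRITTEN values, ready for the MI_MATH in
 * calc_overflow_for_stream().
 */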
515 static void
516 load_overflow_data_to_cs_gprs(struct iris_context *ice,
517 struct iris_query *q,
518 int idx)
519 {
520 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
521
522 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
523 offsetof(struct iris_query_so_overflow,
524 stream[idx].prim_storage_needed[0]));
525 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
526 offsetof(struct iris_query_so_overflow,
527 stream[idx].prim_storage_needed[1]));
528
529 ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
530 offsetof(struct iris_query_so_overflow,
531 stream[idx].num_prims[0]));
532 ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
533 offsetof(struct iris_query_so_overflow,
534 stream[idx].num_prims[1]));
535 }
536
537 /*
538 * R3 = R4 - R3;
539 * R1 = R2 - R1;
540 * R1 = R3 - R1;
541 * R0 = R0 | R1;
542 */
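/* In other words: R1 ends up as (prims written delta) - (storage needed
 * delta), which is non-zero exactly when this stream overflowed, and that is
 * OR'd into the running value in R0.  gpr0_to_bool() later collapses R0 to
 * 0 or 1.
 */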
543 static void
544 calc_overflow_for_stream(struct iris_context *ice)
545 {
546 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
547 static const uint32_t maths[] = {
548 MI_MATH | (17 - 2),
549 MI_ALU2(LOAD, SRCA, R4),
550 MI_ALU2(LOAD, SRCB, R3),
551 MI_ALU0(SUB),
552 MI_ALU2(STORE, R3, ACCU),
553 MI_ALU2(LOAD, SRCA, R2),
554 MI_ALU2(LOAD, SRCB, R1),
555 MI_ALU0(SUB),
556 MI_ALU2(STORE, R1, ACCU),
557 MI_ALU2(LOAD, SRCA, R3),
558 MI_ALU2(LOAD, SRCB, R1),
559 MI_ALU0(SUB),
560 MI_ALU2(STORE, R1, ACCU),
561 MI_ALU2(LOAD, SRCA, R1),
562 MI_ALU2(LOAD, SRCB, R0),
563 MI_ALU0(OR),
564 MI_ALU2(STORE, R0, ACCU),
565 };
566
567 iris_batch_emit(batch, maths, sizeof(maths));
568 }
569
570 static void
571 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
572 {
573 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
574
575 ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
576
577 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
578 load_overflow_data_to_cs_gprs(ice, q, q->index);
579 calc_overflow_for_stream(ice);
580 } else {
581 for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
582 load_overflow_data_to_cs_gprs(ice, q, i);
583 calc_overflow_for_stream(ice);
584 }
585 }
586
587 gpr0_to_bool(ice);
588 }
589
590 /*
591  * GPR0 = GPR0 & ((1ull << n) - 1);
592 */
593 static void
594 keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
595 {
596 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
597
598 ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
599 static const uint32_t math[] = {
600 MI_MATH | (5 - 2),
601 MI_ALU2(LOAD, SRCA, R0),
602 MI_ALU2(LOAD, SRCB, R1),
603 MI_ALU0(AND),
604 MI_ALU2(STORE, R0, ACCU),
605 };
606 iris_batch_emit(batch, math, sizeof(math));
607 }
608
609 /**
610 * Calculate the result and store it to CS_GPR0.
611 */
612 static void
613 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
614 {
615 struct iris_batch *batch = &ice->batches[q->batch_idx];
616 struct iris_screen *screen = (void *) ice->ctx.screen;
617 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
618 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
619 overflow_result_to_gpr0(ice, q);
620 return;
621 }
622
623 if (q->type == PIPE_QUERY_TIMESTAMP) {
624 ice->vtbl.load_register_mem64(batch, CS_GPR(0), q->bo,
625 offsetof(struct iris_query_snapshots, start));
626 /* TODO: This discards any fractional bits of the timebase scale.
627 * We would need to do a bit of fixed point math on the CS ALU, or
628 * launch an actual shader to calculate this with full precision.
629 */
630 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
631 keep_gpr0_lower_n_bits(ice, 36);
632 return;
633 }
634
635 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
636 offsetof(struct iris_query_snapshots, start));
637 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
638 offsetof(struct iris_query_snapshots, end));
639
640 static const uint32_t math[] = {
641 MI_MATH | (5 - 2),
642 MI_ALU2(LOAD, SRCA, R2),
643 MI_ALU2(LOAD, SRCB, R1),
644 MI_ALU0(SUB),
645 MI_ALU2(STORE, R0, ACCU),
646 };
647 iris_batch_emit(batch, math, sizeof(math));
648
649 if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
650 q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
651 gpr0_to_bool(ice);
652
653 if (q->type == PIPE_QUERY_TIME_ELAPSED) {
654 /* TODO: This discards fractional bits (see above). */
655 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
656 }
657 }
658
659 static struct pipe_query *
660 iris_create_query(struct pipe_context *ctx,
661 unsigned query_type,
662 unsigned index)
663 {
664 struct iris_query *q = calloc(1, sizeof(struct iris_query));
665
666 q->type = query_type;
667 q->index = index;
668
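   /* Index 10 of the pipeline statistics is CS_INVOCATION_COUNT, which is
    * only written by the compute pipeline, so send it to the compute batch;
    * everything else runs on the render batch.
    */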
669 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS && q->index == 10)
670 q->batch_idx = IRIS_BATCH_COMPUTE;
671 else
672 q->batch_idx = IRIS_BATCH_RENDER;
673 return (struct pipe_query *) q;
674 }
675
676 static void
677 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
678 {
679 struct iris_query *query = (void *) p_query;
680 iris_bo_unreference(query->bo);
681 free(query);
682 }
683
684
685 static boolean
686 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
687 {
688 struct iris_screen *screen = (void *) ctx->screen;
689 struct iris_context *ice = (void *) ctx;
690 struct iris_query *q = (void *) query;
691
692 iris_bo_unreference(q->bo);
693 q->bo = iris_bo_alloc(screen->bufmgr, "query object", 4096,
694 IRIS_MEMZONE_OTHER);
695 if (!q->bo)
696 return false;
697
698 q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_WRITE | MAP_ASYNC);
699 if (!q->map)
700 return false;
701
702 q->result = 0ull;
703 q->ready = false;
704 q->map->snapshots_landed = false;
705
706 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
707 ice->state.prims_generated_query_active = true;
708 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
709 }
710
711 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
712 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
713 write_overflow_values(ice, q, false);
714 else
715 write_value(ice, q, offsetof(struct iris_query_snapshots, start));
716
717 return true;
718 }
719
720 static bool
721 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
722 {
723 struct iris_context *ice = (void *) ctx;
724 struct iris_query *q = (void *) query;
725
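   /* TIMESTAMP is a single point-in-time snapshot rather than a begin/end
    * pair, so "ending" it just takes the snapshot now (by reusing
    * iris_begin_query()) and marks it available.
    */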
726 if (q->type == PIPE_QUERY_TIMESTAMP) {
727 iris_begin_query(ctx, query);
728 mark_available(ice, q);
729 return true;
730 }
731
732 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
733 ice->state.prims_generated_query_active = false;
734 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
735 }
736
737 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
738 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
739 write_overflow_values(ice, q, true);
740 else
741 write_value(ice, q, offsetof(struct iris_query_snapshots, end));
742 mark_available(ice, q);
743
744 return true;
745 }
746
747 /**
748 * See if the snapshots have landed for a query, and if so, compute the
749 * result and mark it ready. Does not flush (unlike iris_get_query_result).
750 */
751 static void
752 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
753 {
754 struct iris_screen *screen = (void *) ice->ctx.screen;
755 const struct gen_device_info *devinfo = &screen->devinfo;
756
757 if (!q->ready && q->map->snapshots_landed) {
758 calculate_result_on_cpu(devinfo, q);
759 }
760 }
761
762 static boolean
763 iris_get_query_result(struct pipe_context *ctx,
764 struct pipe_query *query,
765 boolean wait,
766 union pipe_query_result *result)
767 {
768 struct iris_context *ice = (void *) ctx;
769 struct iris_query *q = (void *) query;
770 struct iris_screen *screen = (void *) ctx->screen;
771 const struct gen_device_info *devinfo = &screen->devinfo;
772
773 if (!q->ready) {
774 if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
775 iris_batch_flush(&ice->batches[q->batch_idx]);
776
777 if (!q->map->snapshots_landed) {
778 if (wait)
779 iris_bo_wait_rendering(q->bo);
780 else
781 return false;
782 }
783
784 assert(q->map->snapshots_landed);
785 calculate_result_on_cpu(devinfo, q);
786 }
787
788 assert(q->ready);
789
790 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS) {
791 switch (q->index) {
792 case 0:
793 result->pipeline_statistics.ia_vertices = q->result;
794 break;
795 case 1:
796 result->pipeline_statistics.ia_primitives = q->result;
797 break;
798 case 2:
799 result->pipeline_statistics.vs_invocations = q->result;
800 break;
801 case 3:
802 result->pipeline_statistics.gs_invocations = q->result;
803 break;
804 case 4:
805 result->pipeline_statistics.gs_primitives = q->result;
806 break;
807 case 5:
808 result->pipeline_statistics.c_invocations = q->result;
809 break;
810 case 6:
811 result->pipeline_statistics.c_primitives = q->result;
812 break;
813 case 7:
814 result->pipeline_statistics.ps_invocations = q->result;
815 break;
816 case 8:
817 result->pipeline_statistics.hs_invocations = q->result;
818 break;
819 case 9:
820 result->pipeline_statistics.ds_invocations = q->result;
821 break;
822 case 10:
823 result->pipeline_statistics.cs_invocations = q->result;
824 break;
825 }
826 } else {
827 result->u64 = q->result;
828 }
829
830 return true;
831 }
832
833 static void
834 iris_get_query_result_resource(struct pipe_context *ctx,
835 struct pipe_query *query,
836 boolean wait,
837 enum pipe_query_value_type result_type,
838 int index,
839 struct pipe_resource *p_res,
840 unsigned offset)
841 {
842 struct iris_context *ice = (void *) ctx;
843 struct iris_query *q = (void *) query;
844 struct iris_batch *batch = &ice->batches[q->batch_idx];
845 const struct gen_device_info *devinfo = &batch->screen->devinfo;
846 struct iris_resource *res = (void *) p_res;
847 unsigned snapshots_landed_offset =
848 offsetof(struct iris_query_snapshots, snapshots_landed);
849
850 res->bind_history |= PIPE_BIND_QUERY_BUFFER;
851
852 if (index == -1) {
853 /* They're asking for the availability of the result. If we still
854 * have commands queued up which produce the result, submit them
855 * now so that progress happens. Either way, copy the snapshots
856 * landed field to the destination resource.
857 */
858 if (iris_batch_references(batch, q->bo))
859 iris_batch_flush(batch);
860
861 ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
862 q->bo, snapshots_landed_offset,
863 result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
864 return;
865 }
866
867 if (!q->ready && q->map->snapshots_landed) {
868 /* The final snapshots happen to have landed, so let's just compute
869 * the result on the CPU now...
870 */
871 calculate_result_on_cpu(devinfo, q);
872 }
873
874 if (q->ready) {
875 /* We happen to have the result on the CPU, so just copy it. */
876 if (result_type <= PIPE_QUERY_TYPE_U32) {
877 ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
878 q->result);
879 } else {
880 ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
881 q->result);
882 }
883
884        /* Make sure the result lands before they bind the QBO elsewhere
885 * and use the result.
886 */
887 // XXX: Why? i965 doesn't do this.
888 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
889 return;
890 }
891
892 /* Calculate the result to CS_GPR0 */
893 calculate_result_on_gpu(ice, q);
894
895 bool predicated = !wait && !q->stalled;
896
897 if (predicated) {
898 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
899 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
900 snapshots_landed_offset);
901 uint32_t predicate = MI_PREDICATE |
902 MI_PREDICATE_LOADOP_LOADINV |
903 MI_PREDICATE_COMBINEOP_SET |
904 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
905 iris_batch_emit(batch, &predicate, sizeof(uint32_t));
906 }
907
908 if (result_type <= PIPE_QUERY_TYPE_U32) {
909 ice->vtbl.store_register_mem32(batch, CS_GPR(0),
910 iris_resource_bo(p_res),
911 offset, predicated);
912 } else {
913 ice->vtbl.store_register_mem64(batch, CS_GPR(0),
914 iris_resource_bo(p_res),
915 offset, predicated);
916 }
917 }
918
919 static void
920 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
921 {
922 struct iris_context *ice = (void *) ctx;
923
924 if (ice->state.statistics_counters_enabled == enable)
925 return;
926
927 // XXX: most packets aren't paying attention to this yet, because it'd
928 // have to be done dynamically at draw time, which is a pain
929 ice->state.statistics_counters_enabled = enable;
930 ice->state.dirty |= IRIS_DIRTY_CLIP |
931 IRIS_DIRTY_GS |
932 IRIS_DIRTY_RASTER |
933 IRIS_DIRTY_STREAMOUT |
934 IRIS_DIRTY_TCS |
935 IRIS_DIRTY_TES |
936 IRIS_DIRTY_VS |
937 IRIS_DIRTY_WM;
938 }
939
940 static void
941 set_predicate_enable(struct iris_context *ice, bool value)
942 {
943 if (value)
944 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
945 else
946 ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
947 }
948
949 static void
950 set_predicate_for_result(struct iris_context *ice,
951 struct iris_query *q,
952 bool inverted)
953 {
954 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
955
956 /* The CPU doesn't have the query result yet; use hardware predication */
957 ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
958
959 /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
960 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
961 q->stalled = true;
962
963 switch (q->type) {
964 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
965 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
966 overflow_result_to_gpr0(ice, q);
967
968 ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
969 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
970 break;
971 default:
972 /* PIPE_QUERY_OCCLUSION_* */
973 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
974 offsetof(struct iris_query_snapshots, start));
975 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
976 offsetof(struct iris_query_snapshots, end));
977 break;
978 }
979
980 uint32_t mi_predicate = MI_PREDICATE |
981 MI_PREDICATE_COMBINEOP_SET |
982 MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
983 (inverted ? MI_PREDICATE_LOADOP_LOAD
984 : MI_PREDICATE_LOADOP_LOADINV);
985 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
986
987 /* We immediately set the predicate on the render batch, as all the
988 * counters come from 3D operations. However, we may need to predicate
989 * a compute dispatch, which executes in a different GEM context and has
990 * a different MI_PREDICATE_DATA register. So, we save the result to
991 * memory and reload it in iris_launch_grid.
992 */
993 unsigned offset = offsetof(struct iris_query_snapshots, predicate_data);
994 ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
995 q->bo, offset, false);
996 ice->state.compute_predicate = q->bo;
997 }
998
999 static void
1000 iris_render_condition(struct pipe_context *ctx,
1001 struct pipe_query *query,
1002 boolean condition,
1003 enum pipe_render_cond_flag mode)
1004 {
1005 struct iris_context *ice = (void *) ctx;
1006 struct iris_query *q = (void *) query;
1007
1008 if (!q) {
1009 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1010 return;
1011 }
1012
1013 iris_check_query_no_flush(ice, q);
1014
1015 if (q->result || q->ready) {
1016 set_predicate_enable(ice, (q->result != 0) ^ condition);
1017 } else {
1018 if (mode == PIPE_RENDER_COND_NO_WAIT ||
1019 mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
1020 perf_debug(&ice->dbg, "Conditional rendering demoted from "
1021 "\"no wait\" to \"wait\".");
1022 }
1023 set_predicate_for_result(ice, q, condition);
1024 }
1025 }
1026
1027 void
1028 iris_init_query_functions(struct pipe_context *ctx)
1029 {
1030 ctx->create_query = iris_create_query;
1031 ctx->destroy_query = iris_destroy_query;
1032 ctx->begin_query = iris_begin_query;
1033 ctx->end_query = iris_end_query;
1034 ctx->get_query_result = iris_get_query_result;
1035 ctx->get_query_result_resource = iris_get_query_result_resource;
1036 ctx->set_active_query_state = iris_set_active_query_state;
1037 ctx->render_condition = iris_render_condition;
1038 }