1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_query.c
25 *
26 * Query object support. This allows measuring various simple statistics
27 * via counters on the GPU.
28 */
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "util/u_upload_mgr.h"
39 #include "iris_context.h"
40 #include "iris_defines.h"
41 #include "iris_fence.h"
42 #include "iris_resource.h"
43 #include "iris_screen.h"
44 #include "vulkan/util/vk_util.h"
45
46 #define IA_VERTICES_COUNT 0x2310
47 #define IA_PRIMITIVES_COUNT 0x2318
48 #define VS_INVOCATION_COUNT 0x2320
49 #define HS_INVOCATION_COUNT 0x2300
50 #define DS_INVOCATION_COUNT 0x2308
51 #define GS_INVOCATION_COUNT 0x2328
52 #define GS_PRIMITIVES_COUNT 0x2330
53 #define CL_INVOCATION_COUNT 0x2338
54 #define CL_PRIMITIVES_COUNT 0x2340
55 #define PS_INVOCATION_COUNT 0x2348
56 #define CS_INVOCATION_COUNT 0x2290
57 #define PS_DEPTH_COUNT 0x2350
58
59 #define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
60
61 #define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
62
63 #define MI_MATH (0x1a << 23)
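/* As used throughout this file, the low bits of MI_MATH hold the packet
 * length in dwords minus two (the usual command length bias), e.g.
 * MI_MATH | (5 - 2) for a five-dword packet.
 */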
64
65 #define MI_ALU_LOAD 0x080
66 #define MI_ALU_LOADINV 0x480
67 #define MI_ALU_LOAD0 0x081
68 #define MI_ALU_LOAD1 0x481
69 #define MI_ALU_ADD 0x100
70 #define MI_ALU_SUB 0x101
71 #define MI_ALU_AND 0x102
72 #define MI_ALU_OR 0x103
73 #define MI_ALU_XOR 0x104
74 #define MI_ALU_STORE 0x180
75 #define MI_ALU_STOREINV 0x580
76
77 #define MI_ALU_R0 0x00
78 #define MI_ALU_R1 0x01
79 #define MI_ALU_R2 0x02
80 #define MI_ALU_R3 0x03
81 #define MI_ALU_R4 0x04
82 #define MI_ALU_SRCA 0x20
83 #define MI_ALU_SRCB 0x21
84 #define MI_ALU_ACCU 0x31
85 #define MI_ALU_ZF 0x32
86 #define MI_ALU_CF 0x33
87
88 #define _MI_ALU(op, x, y) (((op) << 20) | ((x) << 10) | (y))
89
90 #define _MI_ALU0(op) _MI_ALU(MI_ALU_##op, 0, 0)
91 #define _MI_ALU1(op, x) _MI_ALU(MI_ALU_##op, x, 0)
92 #define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)
93
94 #define MI_ALU0(op) _MI_ALU0(op)
95 #define MI_ALU1(op, x) _MI_ALU1(op, MI_ALU_##x)
96 #define MI_ALU2(op, x, y) _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
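/* As an illustration, MI_ALU2(LOAD, SRCA, R0) expands to
 * _MI_ALU(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_R0)
 *    = (0x080 << 20) | (0x20 << 10) | 0x00
 *    = 0x08008000
 * i.e. one MI_MATH ALU dword that loads GPR0 into the SRCA operand.
 */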
97
98 #define emit_lri32 ice->vtbl.load_register_imm32
99 #define emit_lri64 ice->vtbl.load_register_imm64
100 #define emit_lrr32 ice->vtbl.load_register_reg32
101
102 struct iris_query {
103 enum pipe_query_type type;
104 int index;
105
106 bool ready;
107
108 bool stalled;
109
110 uint64_t result;
111
112 struct iris_state_ref query_state_ref;
113 struct iris_query_snapshots *map;
114 struct iris_syncpt *syncpt;
115
116 int batch_idx;
117 };
118
119 struct iris_query_snapshots {
120 /** iris_render_condition's saved MI_PREDICATE_RESULT value. */
121 uint64_t predicate_result;
122
123 /** Have the start/end snapshots landed? */
124 uint64_t snapshots_landed;
125
126 /** Starting and ending counter snapshots */
127 uint64_t start;
128 uint64_t end;
129 };
130
131 struct iris_query_so_overflow {
132 uint64_t predicate_result;
133 uint64_t snapshots_landed;
134
135 struct {
136 uint64_t prim_storage_needed[2];
137 uint64_t num_prims[2];
138 } stream[4];
139 };
140
141 /**
142 * Is this type of query written by PIPE_CONTROL?
143 */
144 static bool
145 iris_is_query_pipelined(struct iris_query *q)
146 {
147 switch (q->type) {
148 case PIPE_QUERY_OCCLUSION_COUNTER:
149 case PIPE_QUERY_OCCLUSION_PREDICATE:
150 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
151 case PIPE_QUERY_TIMESTAMP:
152 case PIPE_QUERY_TIMESTAMP_DISJOINT:
153 case PIPE_QUERY_TIME_ELAPSED:
154 return true;
155
156 default:
157 return false;
158 }
159 }
160
161 static void
162 mark_available(struct iris_context *ice, struct iris_query *q)
163 {
164 struct iris_batch *batch = &ice->batches[q->batch_idx];
165 unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
166 unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
167 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
168 offset += q->query_state_ref.offset;
169
170 if (!iris_is_query_pipelined(q)) {
171 ice->vtbl.store_data_imm64(batch, bo, offset, true);
172 } else {
173 /* Order the availability write *after* the query results. */
174 flags |= PIPE_CONTROL_FLUSH_ENABLE;
175 iris_emit_pipe_control_write(batch, flags, bo, offset, true);
176 }
177 }
178
179 /**
180 * Write a pipelined snapshot (e.g. PS_DEPTH_COUNT) to the query BO at 'offset' via a PIPE_CONTROL.
181 */
182 static void
183 iris_pipelined_write(struct iris_batch *batch,
184 struct iris_query *q,
185 enum pipe_control_flags flags,
186 unsigned offset)
187 {
188 const struct gen_device_info *devinfo = &batch->screen->devinfo;
189 const unsigned optional_cs_stall =
190 devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
191 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
192
193 iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
194 bo, offset, 0ull);
195 }
196
197 static void
198 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
199 {
200 struct iris_batch *batch = &ice->batches[q->batch_idx];
201 const struct gen_device_info *devinfo = &batch->screen->devinfo;
202 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
203
204 if (!iris_is_query_pipelined(q)) {
205 iris_emit_pipe_control_flush(batch,
206 PIPE_CONTROL_CS_STALL |
207 PIPE_CONTROL_STALL_AT_SCOREBOARD);
208 q->stalled = true;
209 }
210
211 switch (q->type) {
212 case PIPE_QUERY_OCCLUSION_COUNTER:
213 case PIPE_QUERY_OCCLUSION_PREDICATE:
214 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
215 if (devinfo->gen >= 10) {
216 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
217 * bit set prior to programming a PIPE_CONTROL with Write PS Depth
218 * Count sync operation."
219 */
220 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
221 }
222 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
223 PIPE_CONTROL_WRITE_DEPTH_COUNT |
224 PIPE_CONTROL_DEPTH_STALL,
225 offset);
226 break;
227 case PIPE_QUERY_TIME_ELAPSED:
228 case PIPE_QUERY_TIMESTAMP:
229 case PIPE_QUERY_TIMESTAMP_DISJOINT:
230 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
231 PIPE_CONTROL_WRITE_TIMESTAMP,
232 offset);
233 break;
234 case PIPE_QUERY_PRIMITIVES_GENERATED:
235 ice->vtbl.store_register_mem64(batch,
236 q->index == 0 ? CL_INVOCATION_COUNT :
237 SO_PRIM_STORAGE_NEEDED(q->index),
238 bo, offset, false);
239 break;
240 case PIPE_QUERY_PRIMITIVES_EMITTED:
241 ice->vtbl.store_register_mem64(batch,
242 SO_NUM_PRIMS_WRITTEN(q->index),
243 bo, offset, false);
244 break;
245 case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
246 static const uint32_t index_to_reg[] = {
247 IA_VERTICES_COUNT,
248 IA_PRIMITIVES_COUNT,
249 VS_INVOCATION_COUNT,
250 GS_INVOCATION_COUNT,
251 GS_PRIMITIVES_COUNT,
252 CL_INVOCATION_COUNT,
253 CL_PRIMITIVES_COUNT,
254 PS_INVOCATION_COUNT,
255 HS_INVOCATION_COUNT,
256 DS_INVOCATION_COUNT,
257 CS_INVOCATION_COUNT,
258 };
259 const uint32_t reg = index_to_reg[q->index];
260
261 ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
262 break;
263 }
264 default:
265 assert(false);
266 }
267 }
268
269 static void
270 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
271 {
272 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
273 uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
274 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
275 uint32_t offset = q->query_state_ref.offset;
276
277 iris_emit_pipe_control_flush(batch,
278 PIPE_CONTROL_CS_STALL |
279 PIPE_CONTROL_STALL_AT_SCOREBOARD);
280 for (uint32_t i = 0; i < count; i++) {
281 int s = q->index + i;
282 int g_idx = offset + offsetof(struct iris_query_so_overflow,
283 stream[s].num_prims[end]);
284 int w_idx = offset + offsetof(struct iris_query_so_overflow,
285 stream[s].prim_storage_needed[end]);
286 ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
287 bo, g_idx, false);
288 ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
289 bo, w_idx, false);
290 }
291 }
292
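/**
 * Scale a raw GPU timestamp (in timestamp-counter ticks) into nanoseconds
 * using the device's reported timestamp frequency.  For example, with a
 * 12 MHz timestamp counter each tick would be roughly 83.3 ns.
 */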
293 uint64_t
294 iris_timebase_scale(const struct gen_device_info *devinfo,
295 uint64_t gpu_timestamp)
296 {
297 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
298 }
299
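/**
 * Compute the number of ticks between two raw timestamp snapshots,
 * assuming at most one wrap of the TIMESTAMP_BITS-wide counter: if the
 * end snapshot is smaller than the start, a single wraparound occurred.
 */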
300 static uint64_t
301 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
302 {
303 if (time0 > time1) {
304 return (1ULL << TIMESTAMP_BITS) + time1 - time0;
305 } else {
306 return time1 - time0;
307 }
308 }
309
310 static bool
311 stream_overflowed(struct iris_query_so_overflow *so, int s)
312 {
313 return (so->stream[s].prim_storage_needed[1] -
314 so->stream[s].prim_storage_needed[0]) !=
315 (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
316 }
317
318 static void
319 calculate_result_on_cpu(const struct gen_device_info *devinfo,
320 struct iris_query *q)
321 {
322 switch (q->type) {
323 case PIPE_QUERY_OCCLUSION_PREDICATE:
324 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
325 q->result = q->map->end != q->map->start;
326 break;
327 case PIPE_QUERY_TIMESTAMP:
328 case PIPE_QUERY_TIMESTAMP_DISJOINT:
329 /* The timestamp is the single starting snapshot. */
330 q->result = iris_timebase_scale(devinfo, q->map->start);
331 q->result &= (1ull << TIMESTAMP_BITS) - 1;
332 break;
333 case PIPE_QUERY_TIME_ELAPSED:
334 q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
335 q->result = iris_timebase_scale(devinfo, q->result);
336 q->result &= (1ull << TIMESTAMP_BITS) - 1;
337 break;
338 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
339 q->result = stream_overflowed((void *) q->map, q->index);
340 break;
341 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
342 q->result = false;
343 for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
344 q->result |= stream_overflowed((void *) q->map, i);
345 break;
346 case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
347 q->result = q->map->end - q->map->start;
348
349 /* WaDividePSInvocationCountBy4:HSW,BDW */
350 if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
351 q->result /= 4;
352 break;
353 case PIPE_QUERY_OCCLUSION_COUNTER:
354 case PIPE_QUERY_PRIMITIVES_GENERATED:
355 case PIPE_QUERY_PRIMITIVES_EMITTED:
356 default:
357 q->result = q->map->end - q->map->start;
358 break;
359 }
360
361 q->ready = true;
362 }
363
364 static void
365 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
366 unsigned reg_a, unsigned reg_b)
367 {
368 uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
369
370 math[0] = MI_MATH | (5 - 2);
371 math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
372 math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
373 math[3] = _MI_ALU0(ADD);
374 math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
375 }
376
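/*
 * Shift 'src_reg' left by 'shift' bits into 'dst_reg' by emitting 'shift'
 * back-to-back "add the register to itself" sequences in one MI_MATH
 * packet; the shift is open-coded as repeated doubling.
 */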
377 static void
378 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
379 unsigned src_reg, unsigned shift)
380 {
381 assert(shift > 0);
382
383 int dwords = 1 + 4 * shift;
384
385 uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
386
387 math[0] = MI_MATH | ((1 + 4 * shift) - 2);
388
389 for (unsigned i = 0; i < shift; i++) {
390 unsigned add_src = (i == 0) ? src_reg : dst_reg;
391 math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
392 math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
393 math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
394 math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
395 }
396 }
397
398 /* Emit dwords to multiply GPR0 by N */
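/* A sketch of the expansion for N = 5 (binary 101, top_bit = 2):
 *
 *   i = 1:  R1 = R0 + R0                  (2 * R0; bit 1 of N is clear)
 *   i = 0:  ACCU = R1 + R1                (4 * R0; bit 0 of N is set, so:)
 *           R1 = ACCU;  ACCU = R0 + R1    (5 * R0)
 *           R0 = ACCU
 *
 * Each step doubles the running value and conditionally adds the original
 * GPR0, i.e. a shift-and-add multiply driven by the bits of N, top down.
 */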
399 static void
400 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
401 {
402 VK_OUTARRAY_MAKE(out, dw, dw_count);
403
404 #define APPEND_ALU(op, x, y) \
405 vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
406
407 assert(N > 0);
408 unsigned top_bit = 31 - __builtin_clz(N);
409 for (int i = top_bit - 1; i >= 0; i--) {
410 /* We get our initial data in GPR0 and we write the final data out to
411 * GPR0, but we use GPR1 as our scratch register.
412 */
413 unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
414 unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
415
416 /* Shift the current value left by 1 */
417 APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
418 APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
419 APPEND_ALU(ADD, 0, 0);
420
421 if (N & (1 << i)) {
422 /* Store ACCU to R1 and add R0 to R1 */
423 APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
424 APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
425 APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
426 APPEND_ALU(ADD, 0, 0);
427 }
428
429 APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
430 }
431
432 #undef APPEND_ALU
433 }
434
435 static void
436 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
437 {
438 uint32_t num_dwords;
439 build_alu_multiply_gpr0(NULL, &num_dwords, N);
440
441 uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
442 math[0] = MI_MATH | (num_dwords - 2);
443 build_alu_multiply_gpr0(&math[1], &num_dwords, N);
444 }
445
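/*
 * Divide the low 32 bits of GPR0 by the constant D, leaving the quotient
 * in GPR0.  Roughly speaking, the general (non-power-of-two) path below
 * computes, with the parameters from util_compute_fast_udiv_info():
 *
 *   GPR0 = (((GPR0 >> pre_shift) * multiplier
 *             + (increment ? multiplier : 0)) >> 32) >> post_shift
 *
 * where every right shift is implemented as "shift left, then take the
 * high dword" of the 64-bit GPR, since the shifts themselves are built
 * out of repeated ADDs (see emit_alu_shl).
 */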
446 void
447 iris_math_div32_gpr0(struct iris_context *ice,
448 struct iris_batch *batch,
449 uint32_t D)
450 {
451 /* Zero out the top of GPR0 */
452 emit_lri32(batch, CS_GPR(0) + 4, 0);
453
454 if (D == 0) {
455 /* Division by zero is invalid, but we should do something, so set GPR0 to 0. */
456 emit_lri32(batch, CS_GPR(0), 0);
457 } else if (util_is_power_of_two_or_zero(D)) {
458 unsigned log2_D = util_logbase2(D);
459 assert(log2_D < 32);
460 /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
461 * the top 32 bits of the result.
462 */
463 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
464 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
465 emit_lri32(batch, CS_GPR(0) + 4, 0);
466 } else {
467 struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
468 assert(m.multiplier <= UINT32_MAX);
469
470 if (m.pre_shift) {
471 /* We right-shift by m.pre_shift by left-shifting by (32 - m.pre_shift) and
472 * taking the top 32 bits of the result.
473 */
474 if (m.pre_shift < 32)
475 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
476 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
477 emit_lri32(batch, CS_GPR(0) + 4, 0);
478 }
479
480 /* Do the 32x32 multiply into gpr0 */
481 emit_mul_gpr0(batch, m.multiplier);
482
483 if (m.increment) {
484 /* If we need to increment, add the multiplier (computing (n + 1) * m) */
485 emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
486 emit_lri32(batch, CS_GPR(1) + 4, 0);
487 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
488 }
489
490 /* Shift by 32 */
491 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
492 emit_lri32(batch, CS_GPR(0) + 4, 0);
493
494 if (m.post_shift) {
495 /* We right-shift by m.post_shift by left-shifting by (32 - m.post_shift) and
496 * taking the top 32 bits of the result.
497 */
498 if (m.post_shift < 32)
499 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
500 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
501 emit_lri32(batch, CS_GPR(0) + 4, 0);
502 }
503 }
504 }
505
506 void
507 iris_math_add32_gpr0(struct iris_context *ice,
508 struct iris_batch *batch,
509 uint32_t x)
510 {
511 emit_lri32(batch, CS_GPR(1), x);
512 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
513 }
514
515 /*
516 * GPR0 = (GPR0 == 0) ? 0 : 1;
517 */
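/*
 * Implemented by adding zero to GPR0 and storing the inverted zero flag,
 * which is nonzero iff GPR0 was nonzero; ANDing with GPR1 (preloaded with
 * 1) then normalizes the value to exactly 0 or 1.
 */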
518 static void
519 gpr0_to_bool(struct iris_context *ice)
520 {
521 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
522
523 ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
524
525 static const uint32_t math[] = {
526 MI_MATH | (9 - 2),
527 MI_ALU2(LOAD, SRCA, R0),
528 MI_ALU1(LOAD0, SRCB),
529 MI_ALU0(ADD),
530 MI_ALU2(STOREINV, R0, ZF),
531 MI_ALU2(LOAD, SRCA, R0),
532 MI_ALU2(LOAD, SRCB, R1),
533 MI_ALU0(AND),
534 MI_ALU2(STORE, R0, ACCU),
535 };
536 iris_batch_emit(batch, math, sizeof(math));
537 }
538
539 static void
540 load_overflow_data_to_cs_gprs(struct iris_context *ice,
541 struct iris_query *q,
542 int idx)
543 {
544 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
545 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
546 uint32_t offset = q->query_state_ref.offset;
547
548 ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo, offset +
549 offsetof(struct iris_query_so_overflow,
550 stream[idx].prim_storage_needed[0]));
551 ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo, offset +
552 offsetof(struct iris_query_so_overflow,
553 stream[idx].prim_storage_needed[1]));
554
555 ice->vtbl.load_register_mem64(batch, CS_GPR(3), bo, offset +
556 offsetof(struct iris_query_so_overflow,
557 stream[idx].num_prims[0]));
558 ice->vtbl.load_register_mem64(batch, CS_GPR(4), bo, offset +
559 offsetof(struct iris_query_so_overflow,
560 stream[idx].num_prims[1]));
561 }
562
563 /*
564 * R3 = R4 - R3;
565 * R1 = R2 - R1;
566 * R1 = R3 - R1;
567 * R0 = R0 | R1;
568 */
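/*
 * With the registers loaded by load_overflow_data_to_cs_gprs()
 * (R1/R2 = prim_storage_needed[0/1], R3/R4 = num_prims[0/1]), this ORs a
 * nonzero value into R0 whenever the primitives actually written differ
 * from the primitives that needed storage, i.e. whenever the stream
 * overflowed.
 */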
569 static void
570 calc_overflow_for_stream(struct iris_context *ice)
571 {
572 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
573 static const uint32_t maths[] = {
574 MI_MATH | (17 - 2),
575 MI_ALU2(LOAD, SRCA, R4),
576 MI_ALU2(LOAD, SRCB, R3),
577 MI_ALU0(SUB),
578 MI_ALU2(STORE, R3, ACCU),
579 MI_ALU2(LOAD, SRCA, R2),
580 MI_ALU2(LOAD, SRCB, R1),
581 MI_ALU0(SUB),
582 MI_ALU2(STORE, R1, ACCU),
583 MI_ALU2(LOAD, SRCA, R3),
584 MI_ALU2(LOAD, SRCB, R1),
585 MI_ALU0(SUB),
586 MI_ALU2(STORE, R1, ACCU),
587 MI_ALU2(LOAD, SRCA, R1),
588 MI_ALU2(LOAD, SRCB, R0),
589 MI_ALU0(OR),
590 MI_ALU2(STORE, R0, ACCU),
591 };
592
593 iris_batch_emit(batch, maths, sizeof(maths));
594 }
595
596 static void
597 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
598 {
599 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
600
601 ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
602
603 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
604 load_overflow_data_to_cs_gprs(ice, q, q->index);
605 calc_overflow_for_stream(ice);
606 } else {
607 for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
608 load_overflow_data_to_cs_gprs(ice, q, i);
609 calc_overflow_for_stream(ice);
610 }
611 }
612
613 gpr0_to_bool(ice);
614 }
615
616 /*
617 * GPR0 = GPR0 & ((1ull << n) -1);
618 */
619 static void
620 keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
621 {
622 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
623
624 ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
625 static const uint32_t math[] = {
626 MI_MATH | (5 - 2),
627 MI_ALU2(LOAD, SRCA, R0),
628 MI_ALU2(LOAD, SRCB, R1),
629 MI_ALU0(AND),
630 MI_ALU2(STORE, R0, ACCU),
631 };
632 iris_batch_emit(batch, math, sizeof(math));
633 }
634
635 /*
636 * GPR0 = GPR0 << 30;
637 */
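/*
 * Emitted as five MI_MATH packets of six "R0 = R0 + R0" doublings each,
 * for 30 doublings in total.
 */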
638 static void
639 shl_gpr0_by_30_bits(struct iris_context *ice)
640 {
641 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
642 /* First keep only the low 34 bits of GPR0 so the 30-bit shift below can't overflow */
643 keep_gpr0_lower_n_bits(ice, 34);
644
645 static const uint32_t shl_math[] = {
646 MI_ALU2(LOAD, SRCA, R0),
647 MI_ALU2(LOAD, SRCB, R0),
648 MI_ALU0(ADD),
649 MI_ALU2(STORE, R0, ACCU),
650 };
651
652 const uint32_t outer_count = 5;
653 const uint32_t inner_count = 6;
654 const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
655 const uint32_t batch_len = cmd_len * outer_count;
656 uint32_t *map = iris_get_command_space(batch, batch_len * 4);
657 uint32_t offset = 0;
658 for (int o = 0; o < outer_count; o++) {
659 map[offset++] = MI_MATH | (cmd_len - 2);
660 for (int i = 0; i < inner_count; i++) {
661 memcpy(&map[offset], shl_math, sizeof(shl_math));
662 offset += 4;
663 }
664 }
665 }
666
667 /*
668 * GPR0 = GPR0 >> 2;
669 *
670 * Note that the upper 30 bits of GPR0 are lost!
671 */
672 static void
673 shr_gpr0_by_2_bits(struct iris_context *ice)
674 {
675 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
676 shl_gpr0_by_30_bits(ice);
677 ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
678 ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
679 }
680
681 /**
682 * Calculate the result and store it to CS_GPR0.
683 */
684 static void
685 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
686 {
687 struct iris_batch *batch = &ice->batches[q->batch_idx];
688 struct iris_screen *screen = (void *) ice->ctx.screen;
689 const struct gen_device_info *devinfo = &batch->screen->devinfo;
690 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
691 uint32_t offset = q->query_state_ref.offset;
692
693 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
694 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
695 overflow_result_to_gpr0(ice, q);
696 return;
697 }
698
699 if (q->type == PIPE_QUERY_TIMESTAMP) {
700 ice->vtbl.load_register_mem64(batch, CS_GPR(0), bo,
701 offset +
702 offsetof(struct iris_query_snapshots, start));
703 /* TODO: This discards any fractional bits of the timebase scale.
704 * We would need to do a bit of fixed point math on the CS ALU, or
705 * launch an actual shader to calculate this with full precision.
706 */
707 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
708 keep_gpr0_lower_n_bits(ice, 36);
709 return;
710 }
711
712 ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo,
713 offset +
714 offsetof(struct iris_query_snapshots, start));
715 ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo,
716 offset +
717 offsetof(struct iris_query_snapshots, end));
718
719 static const uint32_t math[] = {
720 MI_MATH | (5 - 2),
721 MI_ALU2(LOAD, SRCA, R2),
722 MI_ALU2(LOAD, SRCB, R1),
723 MI_ALU0(SUB),
724 MI_ALU2(STORE, R0, ACCU),
725 };
726 iris_batch_emit(batch, math, sizeof(math));
727
728 /* WaDividePSInvocationCountBy4:HSW,BDW */
729 if (devinfo->gen == 8 &&
730 q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
731 q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
732 shr_gpr0_by_2_bits(ice);
733
734 if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
735 q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
736 gpr0_to_bool(ice);
737
738 if (q->type == PIPE_QUERY_TIME_ELAPSED) {
739 /* TODO: This discards fractional bits (see above). */
740 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
741 }
742 }
743
744 static struct pipe_query *
745 iris_create_query(struct pipe_context *ctx,
746 unsigned query_type,
747 unsigned index)
748 {
749 struct iris_query *q = calloc(1, sizeof(struct iris_query));
750
751 q->type = query_type;
752 q->index = index;
753
754 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
755 q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
756 q->batch_idx = IRIS_BATCH_COMPUTE;
757 else
758 q->batch_idx = IRIS_BATCH_RENDER;
759 return (struct pipe_query *) q;
760 }
761
762 static void
763 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
764 {
765 struct iris_query *query = (void *) p_query;
766 struct iris_screen *screen = (void *) ctx->screen;
767 iris_syncpt_reference(screen, &query->syncpt, NULL);
768 free(query);
769 }
770
771
772 static boolean
773 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
774 {
775 struct iris_context *ice = (void *) ctx;
776 struct iris_query *q = (void *) query;
777 void *ptr = NULL;
778 uint32_t size;
779
780 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
781 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
782 size = sizeof(struct iris_query_so_overflow);
783 else
784 size = sizeof(struct iris_query_snapshots);
785
786 u_upload_alloc(ice->query_buffer_uploader, 0,
787 size, size, &q->query_state_ref.offset,
788 &q->query_state_ref.res, &ptr);
789
790 if (!iris_resource_bo(q->query_state_ref.res))
791 return false;
792
793 q->map = ptr;
794 if (!q->map)
795 return false;
796
797 q->result = 0ull;
798 q->ready = false;
799 WRITE_ONCE(q->map->snapshots_landed, false);
800
801 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
802 ice->state.prims_generated_query_active = true;
803 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
804 }
805
806 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
807 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
808 write_overflow_values(ice, q, false);
809 else
810 write_value(ice, q,
811 q->query_state_ref.offset +
812 offsetof(struct iris_query_snapshots, start));
813
814 return true;
815 }
816
817 static bool
818 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
819 {
820 struct iris_context *ice = (void *) ctx;
821 struct iris_query *q = (void *) query;
822 struct iris_batch *batch = &ice->batches[q->batch_idx];
823
824 if (q->type == PIPE_QUERY_TIMESTAMP) {
825 iris_begin_query(ctx, query);
826 iris_batch_reference_signal_syncpt(batch, &q->syncpt);
827 mark_available(ice, q);
828 return true;
829 }
830
831 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
832 ice->state.prims_generated_query_active = false;
833 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
834 }
835
836 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
837 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
838 write_overflow_values(ice, q, true);
839 else
840 write_value(ice, q,
841 q->query_state_ref.offset +
842 offsetof(struct iris_query_snapshots, end));
843
844 iris_batch_reference_signal_syncpt(batch, &q->syncpt);
845 mark_available(ice, q);
846
847 return true;
848 }
849
850 /**
851 * See if the snapshots have landed for a query, and if so, compute the
852 * result and mark it ready. Does not flush (unlike iris_get_query_result).
853 */
854 static void
855 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
856 {
857 struct iris_screen *screen = (void *) ice->ctx.screen;
858 const struct gen_device_info *devinfo = &screen->devinfo;
859
860 if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
861 calculate_result_on_cpu(devinfo, q);
862 }
863 }
864
865 static boolean
866 iris_get_query_result(struct pipe_context *ctx,
867 struct pipe_query *query,
868 boolean wait,
869 union pipe_query_result *result)
870 {
871 struct iris_context *ice = (void *) ctx;
872 struct iris_query *q = (void *) query;
873 struct iris_screen *screen = (void *) ctx->screen;
874 const struct gen_device_info *devinfo = &screen->devinfo;
875 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
876
877 if (!q->ready) {
878 if (iris_batch_references(&ice->batches[q->batch_idx], bo))
879 iris_batch_flush(&ice->batches[q->batch_idx]);
880
881 while (!READ_ONCE(q->map->snapshots_landed)) {
882 if (wait)
883 iris_wait_syncpt(ctx->screen, q->syncpt, INT64_MAX);
884 else
885 return false;
886 }
887
888 assert(READ_ONCE(q->map->snapshots_landed));
889 calculate_result_on_cpu(devinfo, q);
890 }
891
892 assert(q->ready);
893
894 result->u64 = q->result;
895
896 return true;
897 }
898
899 static void
900 iris_get_query_result_resource(struct pipe_context *ctx,
901 struct pipe_query *query,
902 boolean wait,
903 enum pipe_query_value_type result_type,
904 int index,
905 struct pipe_resource *p_res,
906 unsigned offset)
907 {
908 struct iris_context *ice = (void *) ctx;
909 struct iris_query *q = (void *) query;
910 struct iris_batch *batch = &ice->batches[q->batch_idx];
911 const struct gen_device_info *devinfo = &batch->screen->devinfo;
912 struct iris_resource *res = (void *) p_res;
913 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
914 unsigned snapshots_landed_offset =
915 offsetof(struct iris_query_snapshots, snapshots_landed);
916
917 res->bind_history |= PIPE_BIND_QUERY_BUFFER;
918
919 if (index == -1) {
920 /* They're asking for the availability of the result. If we still
921 * have commands queued up which produce the result, submit them
922 * now so that progress happens. Either way, copy the snapshots
923 * landed field to the destination resource.
924 */
925 if (iris_batch_references(batch, bo))
926 iris_batch_flush(batch);
927
928 ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
929 bo, snapshots_landed_offset,
930 result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
931 return;
932 }
933
934 if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
935 /* The final snapshots happen to have landed, so let's just compute
936 * the result on the CPU now...
937 */
938 calculate_result_on_cpu(devinfo, q);
939 }
940
941 if (q->ready) {
942 /* We happen to have the result on the CPU, so just copy it. */
943 if (result_type <= PIPE_QUERY_TYPE_U32) {
944 ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
945 q->result);
946 } else {
947 ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
948 q->result);
949 }
950
951 /* Make sure the result lands before they bind the QBO elsewhere
952 * and use the result.
953 */
954 // XXX: Why? i965 doesn't do this.
955 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
956 return;
957 }
958
959 /* Calculate the result to CS_GPR0 */
960 calculate_result_on_gpu(ice, q);
961
962 bool predicated = !wait && !q->stalled;
963
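/* If the caller doesn't want to wait and nothing above has stalled on
 * the results, predicate the write on the availability snapshot so an
 * unavailable result doesn't clobber the destination buffer.
 */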
964 if (predicated) {
965 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
966 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
967 snapshots_landed_offset);
968 uint32_t predicate = MI_PREDICATE |
969 MI_PREDICATE_LOADOP_LOADINV |
970 MI_PREDICATE_COMBINEOP_SET |
971 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
972 iris_batch_emit(batch, &predicate, sizeof(uint32_t));
973 }
974
975 if (result_type <= PIPE_QUERY_TYPE_U32) {
976 ice->vtbl.store_register_mem32(batch, CS_GPR(0),
977 iris_resource_bo(p_res),
978 offset, predicated);
979 } else {
980 ice->vtbl.store_register_mem64(batch, CS_GPR(0),
981 iris_resource_bo(p_res),
982 offset, predicated);
983 }
984 }
985
986 static void
987 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
988 {
989 struct iris_context *ice = (void *) ctx;
990
991 if (ice->state.statistics_counters_enabled == enable)
992 return;
993
994 // XXX: most packets aren't paying attention to this yet, because it'd
995 // have to be done dynamically at draw time, which is a pain
996 ice->state.statistics_counters_enabled = enable;
997 ice->state.dirty |= IRIS_DIRTY_CLIP |
998 IRIS_DIRTY_GS |
999 IRIS_DIRTY_RASTER |
1000 IRIS_DIRTY_STREAMOUT |
1001 IRIS_DIRTY_TCS |
1002 IRIS_DIRTY_TES |
1003 IRIS_DIRTY_VS |
1004 IRIS_DIRTY_WM;
1005 }
1006
1007 static void
1008 set_predicate_enable(struct iris_context *ice, bool value)
1009 {
1010 if (value)
1011 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1012 else
1013 ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
1014 }
1015
1016 static void
1017 set_predicate_for_result(struct iris_context *ice,
1018 struct iris_query *q,
1019 bool inverted)
1020 {
1021 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
1022 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
1023
1024 /* The CPU doesn't have the query result yet; use hardware predication */
1025 ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
1026
1027 /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
1028 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
1029 q->stalled = true;
1030
1031 switch (q->type) {
1032 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1033 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1034 overflow_result_to_gpr0(ice, q);
1035
1036 ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
1037 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
1038 break;
1039 default:
1040 /* PIPE_QUERY_OCCLUSION_* */
1041 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
1042 offsetof(struct iris_query_snapshots, start) +
1043 q->query_state_ref.offset);
1044 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
1045 offsetof(struct iris_query_snapshots, end) +
1046 q->query_state_ref.offset);
1047 break;
1048 }
1049
1050 uint32_t mi_predicate = MI_PREDICATE |
1051 MI_PREDICATE_COMBINEOP_SET |
1052 MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
1053 (inverted ? MI_PREDICATE_LOADOP_LOAD
1054 : MI_PREDICATE_LOADOP_LOADINV);
1055 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
1056
1057 /* We immediately set the predicate on the render batch, as all the
1058 * counters come from 3D operations. However, we may need to predicate
1059 * a compute dispatch, which executes in a different GEM context and has
1060 * a different MI_PREDICATE_RESULT register. So, we save the result to
1061 * memory and reload it in iris_launch_grid.
1062 */
1063 unsigned offset = q->query_state_ref.offset +
1064 offsetof(struct iris_query_snapshots, predicate_result);
1065 ice->vtbl.store_register_mem64(batch, MI_PREDICATE_RESULT,
1066 bo, offset, false);
1067 ice->state.compute_predicate = bo;
1068 }
1069
1070 static void
1071 iris_render_condition(struct pipe_context *ctx,
1072 struct pipe_query *query,
1073 boolean condition,
1074 enum pipe_render_cond_flag mode)
1075 {
1076 struct iris_context *ice = (void *) ctx;
1077 struct iris_query *q = (void *) query;
1078
1079 /* The old condition isn't relevant; we'll update it if necessary */
1080 ice->state.compute_predicate = NULL;
1081 ice->condition.query = q;
1082 ice->condition.condition = condition;
1083
1084 if (!q) {
1085 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1086 return;
1087 }
1088
1089 iris_check_query_no_flush(ice, q);
1090
1091 if (q->result || q->ready) {
1092 set_predicate_enable(ice, (q->result != 0) ^ condition);
1093 } else {
1094 if (mode == PIPE_RENDER_COND_NO_WAIT ||
1095 mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
1096 perf_debug(&ice->dbg, "Conditional rendering demoted from "
1097 "\"no wait\" to \"wait\".");
1098 }
1099 set_predicate_for_result(ice, q, condition);
1100 }
1101 }
1102
1103 void
1104 iris_resolve_conditional_render(struct iris_context *ice)
1105 {
1106 struct pipe_context *ctx = (void *) ice;
1107 struct iris_query *q = ice->condition.query;
1108 struct pipe_query *query = (void *) q;
1109 union pipe_query_result result;
1110
1111 if (ice->state.predicate != IRIS_PREDICATE_STATE_USE_BIT)
1112 return;
1113
1114 assert(q);
1115
1116 iris_get_query_result(ctx, query, true, &result);
1117 set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);
1118 }
1119
1120 void
1121 iris_init_query_functions(struct pipe_context *ctx)
1122 {
1123 ctx->create_query = iris_create_query;
1124 ctx->destroy_query = iris_destroy_query;
1125 ctx->begin_query = iris_begin_query;
1126 ctx->end_query = iris_end_query;
1127 ctx->get_query_result = iris_get_query_result;
1128 ctx->get_query_result_resource = iris_get_query_result_resource;
1129 ctx->set_active_query_state = iris_set_active_query_state;
1130 ctx->render_condition = iris_render_condition;
1131 }