1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_query.c
25 *
26 * Query object support.  This covers occlusion, timestamp, streamout
27 * overflow, and pipeline statistics queries, measured via GPU counters.
28 */
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "util/u_upload_mgr.h"
39 #include "iris_context.h"
40 #include "iris_defines.h"
41 #include "iris_fence.h"
42 #include "iris_resource.h"
43 #include "iris_screen.h"
44 #include "vulkan/util/vk_util.h"
45
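/* MMIO offsets of the pipeline statistics and streamout counter registers
 * snapshotted below, plus the MI_MATH opcode/operand encodings used for the
 * command streamer ALU sequences in this file.
 */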
46 #define IA_VERTICES_COUNT 0x2310
47 #define IA_PRIMITIVES_COUNT 0x2318
48 #define VS_INVOCATION_COUNT 0x2320
49 #define HS_INVOCATION_COUNT 0x2300
50 #define DS_INVOCATION_COUNT 0x2308
51 #define GS_INVOCATION_COUNT 0x2328
52 #define GS_PRIMITIVES_COUNT 0x2330
53 #define CL_INVOCATION_COUNT 0x2338
54 #define CL_PRIMITIVES_COUNT 0x2340
55 #define PS_INVOCATION_COUNT 0x2348
56 #define CS_INVOCATION_COUNT 0x2290
57 #define PS_DEPTH_COUNT 0x2350
58
59 #define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
60
61 #define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
62
63 #define MI_MATH (0x1a << 23)
64
65 #define MI_ALU_LOAD 0x080
66 #define MI_ALU_LOADINV 0x480
67 #define MI_ALU_LOAD0 0x081
68 #define MI_ALU_LOAD1 0x481
69 #define MI_ALU_ADD 0x100
70 #define MI_ALU_SUB 0x101
71 #define MI_ALU_AND 0x102
72 #define MI_ALU_OR 0x103
73 #define MI_ALU_XOR 0x104
74 #define MI_ALU_STORE 0x180
75 #define MI_ALU_STOREINV 0x580
76
77 #define MI_ALU_R0 0x00
78 #define MI_ALU_R1 0x01
79 #define MI_ALU_R2 0x02
80 #define MI_ALU_R3 0x03
81 #define MI_ALU_R4 0x04
82 #define MI_ALU_SRCA 0x20
83 #define MI_ALU_SRCB 0x21
84 #define MI_ALU_ACCU 0x31
85 #define MI_ALU_ZF 0x32
86 #define MI_ALU_CF 0x33
87
88 #define _MI_ALU(op, x, y) (((op) << 20) | ((x) << 10) | (y))
89
90 #define _MI_ALU0(op) _MI_ALU(MI_ALU_##op, 0, 0)
91 #define _MI_ALU1(op, x) _MI_ALU(MI_ALU_##op, x, 0)
92 #define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)
93
94 #define MI_ALU0(op) _MI_ALU0(op)
95 #define MI_ALU1(op, x) _MI_ALU1(op, MI_ALU_##x)
96 #define MI_ALU2(op, x, y) _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
97
98 #define emit_lri32 ice->vtbl.load_register_imm32
99 #define emit_lri64 ice->vtbl.load_register_imm64
100 #define emit_lrr32 ice->vtbl.load_register_reg32
101
102 struct iris_query {
103 enum pipe_query_type type;
104 int index;
105
106 bool ready;
107
108 bool stalled;
109
110 uint64_t result;
111
112 struct iris_state_ref query_state_ref;
113 struct iris_query_snapshots *map;
114 struct iris_syncpt *syncpt;
115
116 int batch_idx;
117 };
118
119 struct iris_query_snapshots {
120 /** iris_render_condition's saved MI_PREDICATE_DATA value. */
121 uint64_t predicate_data;
122
123 /** Have the start/end snapshots landed? */
124 uint64_t snapshots_landed;
125
126 /** Starting and ending counter snapshots */
127 uint64_t start;
128 uint64_t end;
129 };
130
131 struct iris_query_so_overflow {
132 uint64_t predicate_data;
133 uint64_t snapshots_landed;
134
135 struct {
136 uint64_t prim_storage_needed[2];
137 uint64_t num_prims[2];
138 } stream[4];
139 };
140
141 /**
142 * Is this type of query written by PIPE_CONTROL?
143 */
144 static bool
145 iris_is_query_pipelined(struct iris_query *q)
146 {
147 switch (q->type) {
148 case PIPE_QUERY_OCCLUSION_COUNTER:
149 case PIPE_QUERY_OCCLUSION_PREDICATE:
150 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
151 case PIPE_QUERY_TIMESTAMP:
152 case PIPE_QUERY_TIMESTAMP_DISJOINT:
153 case PIPE_QUERY_TIME_ELAPSED:
154 return true;
155
156 default:
157 return false;
158 }
159 }
160
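/**
 * Record that a query's snapshots have landed by writing snapshots_landed,
 * either with an immediate store (for non-pipelined queries) or with a
 * post-sync PIPE_CONTROL write ordered after the pipelined snapshot writes.
 */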
161 static void
162 mark_available(struct iris_context *ice, struct iris_query *q)
163 {
164 struct iris_batch *batch = &ice->batches[q->batch_idx];
165 unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
166 unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
167 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
168 offset += q->query_state_ref.offset;
169
170 if (!iris_is_query_pipelined(q)) {
171 ice->vtbl.store_data_imm64(batch, bo, offset, true);
172 } else {
173 /* Order available *after* the query results. */
174 flags |= PIPE_CONTROL_FLUSH_ENABLE;
175 iris_emit_pipe_control_write(batch, flags, bo, offset, true);
176 }
177 }
178
179 /**
180  * Write a pipelined counter snapshot (e.g. PS_DEPTH_COUNT or a timestamp) to the query BO via PIPE_CONTROL.
181 */
182 static void
183 iris_pipelined_write(struct iris_batch *batch,
184 struct iris_query *q,
185 enum pipe_control_flags flags,
186 unsigned offset)
187 {
188 const struct gen_device_info *devinfo = &batch->screen->devinfo;
189 const unsigned optional_cs_stall =
190 devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
191 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
192
193 iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
194 bo, offset, 0ull);
195 }
196
197 static void
198 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
199 {
200 struct iris_batch *batch = &ice->batches[q->batch_idx];
201 const struct gen_device_info *devinfo = &batch->screen->devinfo;
202 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
203
204 if (!iris_is_query_pipelined(q)) {
205 iris_emit_pipe_control_flush(batch,
206 PIPE_CONTROL_CS_STALL |
207 PIPE_CONTROL_STALL_AT_SCOREBOARD);
208 q->stalled = true;
209 }
210
211 switch (q->type) {
212 case PIPE_QUERY_OCCLUSION_COUNTER:
213 case PIPE_QUERY_OCCLUSION_PREDICATE:
214 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
215 if (devinfo->gen >= 10) {
216 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
217 * bit set prior to programming a PIPE_CONTROL with Write PS Depth
218 * Count sync operation."
219 */
220 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
221 }
222 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
223 PIPE_CONTROL_WRITE_DEPTH_COUNT |
224 PIPE_CONTROL_DEPTH_STALL,
225 offset);
226 break;
227 case PIPE_QUERY_TIME_ELAPSED:
228 case PIPE_QUERY_TIMESTAMP:
229 case PIPE_QUERY_TIMESTAMP_DISJOINT:
230 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
231 PIPE_CONTROL_WRITE_TIMESTAMP,
232 offset);
233 break;
234 case PIPE_QUERY_PRIMITIVES_GENERATED:
235 ice->vtbl.store_register_mem64(batch,
236 q->index == 0 ? CL_INVOCATION_COUNT :
237 SO_PRIM_STORAGE_NEEDED(q->index),
238 bo, offset, false);
239 break;
240 case PIPE_QUERY_PRIMITIVES_EMITTED:
241 ice->vtbl.store_register_mem64(batch,
242 SO_NUM_PRIMS_WRITTEN(q->index),
243 bo, offset, false);
244 break;
245 case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
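      /* Indexed by q->index, which for pipeline-statistics queries is a
       * PIPE_STAT_QUERY_* value.
       */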
246 static const uint32_t index_to_reg[] = {
247 IA_VERTICES_COUNT,
248 IA_PRIMITIVES_COUNT,
249 VS_INVOCATION_COUNT,
250 GS_INVOCATION_COUNT,
251 GS_PRIMITIVES_COUNT,
252 CL_INVOCATION_COUNT,
253 CL_PRIMITIVES_COUNT,
254 PS_INVOCATION_COUNT,
255 HS_INVOCATION_COUNT,
256 DS_INVOCATION_COUNT,
257 CS_INVOCATION_COUNT,
258 };
259 const uint32_t reg = index_to_reg[q->index];
260
261 ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
262 break;
263 }
264 default:
265 assert(false);
266 }
267 }
268
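/**
 * Snapshot SO_NUM_PRIMS_WRITTEN and SO_PRIM_STORAGE_NEEDED for each stream
 * covered by the query; "end" selects the begin (false) or end (true) slot
 * of struct iris_query_so_overflow.
 */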
269 static void
270 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
271 {
272 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
273 uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
274 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
275 uint32_t offset = q->query_state_ref.offset;
276
277 iris_emit_pipe_control_flush(batch,
278 PIPE_CONTROL_CS_STALL |
279 PIPE_CONTROL_STALL_AT_SCOREBOARD);
280 for (uint32_t i = 0; i < count; i++) {
281 int s = q->index + i;
282 int g_idx = offset + offsetof(struct iris_query_so_overflow,
283 stream[s].num_prims[end]);
284 int w_idx = offset + offsetof(struct iris_query_so_overflow,
285 stream[s].prim_storage_needed[end]);
286 ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
287 bo, g_idx, false);
288 ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
289 bo, w_idx, false);
290 }
291 }
292
293 uint64_t
294 iris_timebase_scale(const struct gen_device_info *devinfo,
295 uint64_t gpu_timestamp)
296 {
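   /* Convert raw timestamp ticks to nanoseconds.  As an illustrative example
    * (hypothetical frequency, not tied to any particular GPU): with
    * timestamp_frequency = 12,500,000 Hz, one tick is 80 ns, so a raw value
    * of 1,000 ticks scales to 80,000 ns.
    */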
297 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
298 }
299
300 static uint64_t
301 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
302 {
303 if (time0 > time1) {
304 return (1ULL << TIMESTAMP_BITS) + time1 - time0;
305 } else {
306 return time1 - time0;
307 }
308 }
309
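/* A stream overflowed if, between the begin and end snapshots, the number of
 * primitives that needed storage (SO_PRIM_STORAGE_NEEDED) differs from the
 * number of primitives actually written (SO_NUM_PRIMS_WRITTEN).
 */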
310 static bool
311 stream_overflowed(struct iris_query_so_overflow *so, int s)
312 {
313 return (so->stream[s].prim_storage_needed[1] -
314 so->stream[s].prim_storage_needed[0]) !=
315 (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
316 }
317
318 static void
319 calculate_result_on_cpu(const struct gen_device_info *devinfo,
320 struct iris_query *q)
321 {
322 switch (q->type) {
323 case PIPE_QUERY_OCCLUSION_PREDICATE:
324 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
325 q->result = q->map->end != q->map->start;
326 break;
327 case PIPE_QUERY_TIMESTAMP:
328 case PIPE_QUERY_TIMESTAMP_DISJOINT:
329 /* The timestamp is the single starting snapshot. */
330 q->result = iris_timebase_scale(devinfo, q->map->start);
331 q->result &= (1ull << TIMESTAMP_BITS) - 1;
332 break;
333 case PIPE_QUERY_TIME_ELAPSED:
334 q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
335 q->result = iris_timebase_scale(devinfo, q->result);
336 q->result &= (1ull << TIMESTAMP_BITS) - 1;
337 break;
338 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
339 q->result = stream_overflowed((void *) q->map, q->index);
340 break;
341 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
342 q->result = false;
343 for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
344 q->result |= stream_overflowed((void *) q->map, i);
345 break;
346 case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
347 q->result = q->map->end - q->map->start;
348
349 /* WaDividePSInvocationCountBy4:HSW,BDW */
350 if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
351 q->result /= 4;
352 break;
353 case PIPE_QUERY_OCCLUSION_COUNTER:
354 case PIPE_QUERY_PRIMITIVES_GENERATED:
355 case PIPE_QUERY_PRIMITIVES_EMITTED:
356 default:
357 q->result = q->map->end - q->map->start;
358 break;
359 }
360
361 q->ready = true;
362 }
363
364 static void
365 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
366 unsigned reg_a, unsigned reg_b)
367 {
368 uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
369
370 math[0] = MI_MATH | (5 - 2);
371 math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
372 math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
373 math[3] = _MI_ALU0(ADD);
374 math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
375 }
376
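/* Emit a left shift of src_reg by `shift` bits into dst_reg.  The MI ALU ops
 * defined above (ADD/SUB/AND/OR/XOR) include no shift, so the shift is built
 * from `shift` successive self-additions, each of which doubles the value.
 */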
377 static void
378 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
379 unsigned src_reg, unsigned shift)
380 {
381 assert(shift > 0);
382
383 int dwords = 1 + 4 * shift;
384
385 uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
386
387 math[0] = MI_MATH | ((1 + 4 * shift) - 2);
388
389 for (unsigned i = 0; i < shift; i++) {
390 unsigned add_src = (i == 0) ? src_reg : dst_reg;
391 math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
392 math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
393 math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
394 math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
395 }
396 }
397
398 /* Emit dwords to multiply GPR0 by N */
399 static void
400 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
401 {
402 VK_OUTARRAY_MAKE(out, dw, dw_count);
403
404 #define APPEND_ALU(op, x, y) \
405 vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
406
407 assert(N > 0);
408 unsigned top_bit = 31 - __builtin_clz(N);
409 for (int i = top_bit - 1; i >= 0; i--) {
410 /* We get our initial data in GPR0 and we write the final data out to
411 * GPR0 but we use GPR1 as our scratch register.
412 */
413 unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
414 unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
415
416 /* Shift the current value left by 1 */
417 APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
418 APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
419 APPEND_ALU(ADD, 0, 0);
420
421 if (N & (1 << i)) {
422 /* Store ACCU to R1 and add R0 to R1 */
423 APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
424 APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
425 APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
426 APPEND_ALU(ADD, 0, 0);
427 }
428
429 APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
430 }
431
432 #undef APPEND_ALU
433 }
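/* Illustrative trace of build_alu_multiply_gpr0(): for N = 10 (0b1010),
 * top_bit = 3 and the loop runs i = 2, 1, 0.  With the initial value v in R0:
 *   i = 2: double R0 into R1            -> R1 = 2v   (bit 2 of N is clear)
 *   i = 1: double R1, then add R0 (= v) -> R1 = 5v   (bit 1 of N is set)
 *   i = 0: double R1 into R0            -> R0 = 10v  (bit 0 of N is clear)
 * i.e. a shift-and-add walk over the multiplier's binary digits.
 */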
434
435 static void
436 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
437 {
438 uint32_t num_dwords;
439 build_alu_multiply_gpr0(NULL, &num_dwords, N);
440
441 uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
442 math[0] = MI_MATH | (num_dwords - 2);
443 build_alu_multiply_gpr0(&math[1], &num_dwords, N);
444 }
445
446 void
447 iris_math_div32_gpr0(struct iris_context *ice,
448 struct iris_batch *batch,
449 uint32_t D)
450 {
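   /* Compute GPR0 = GPR0 / D with a reciprocal multiplication.  In terms of
    * the operations emitted below, using the parameters returned by
    * util_compute_fast_udiv_info(), this is roughly:
    *
    *   GPR0 = (((GPR0 >> pre_shift) * multiplier
    *             + (increment ? multiplier : 0)) >> 32) >> post_shift
    *
    * where adding the multiplier stands in for incrementing the numerator,
    * since m * (n + 1) == m * n + m.
    */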
451 /* Zero out the top of GPR0 */
452 emit_lri32(batch, CS_GPR(0) + 4, 0);
453
454 if (D == 0) {
455       /* This is invalid, but we should do something, so set GPR0 to 0. */
456 emit_lri32(batch, CS_GPR(0), 0);
457 } else if (util_is_power_of_two_or_zero(D)) {
458 unsigned log2_D = util_logbase2(D);
459 assert(log2_D < 32);
460 /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
461 * the top 32 bits of the result.
462 */
463 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
464 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
465 emit_lri32(batch, CS_GPR(0) + 4, 0);
466 } else {
467 struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
468 assert(m.multiplier <= UINT32_MAX);
469
470 if (m.pre_shift) {
471          /* We right-shift by pre_shift by left-shifting by 32 - pre_shift and
472           * taking the top 32 bits of the result.
473 */
474 if (m.pre_shift < 32)
475 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
476 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
477 emit_lri32(batch, CS_GPR(0) + 4, 0);
478 }
479
480 /* Do the 32x32 multiply into gpr0 */
481 emit_mul_gpr0(batch, m.multiplier);
482
483 if (m.increment) {
484          /* Apply the increment by adding the multiplier to the product: m * (n + 1) == m * n + m. */
485 emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
486 emit_lri32(batch, CS_GPR(1) + 4, 0);
487 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
488 }
489
490       /* Shift right by 32 by moving the high dword of GPR0 into the low dword. */
491 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
492 emit_lri32(batch, CS_GPR(0) + 4, 0);
493
494 if (m.post_shift) {
495          /* We right-shift by post_shift by left-shifting by 32 - post_shift and
496           * taking the top 32 bits of the result.
497 */
498 if (m.post_shift < 32)
499 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
500 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
501 emit_lri32(batch, CS_GPR(0) + 4, 0);
502 }
503 }
504 }
505
506 /*
507 * GPR0 = (GPR0 == 0) ? 0 : 1;
508 */
509 static void
510 gpr0_to_bool(struct iris_context *ice)
511 {
512 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
513
514 ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
515
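   /* The ADD of R0 and 0 below sets the ALU zero flag iff R0 == 0.  STOREINV
    * then writes the inverse of ZF into R0, and ANDing with the 1 preloaded
    * into GPR1 keeps only bit 0, leaving exactly 0 or 1.
    */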
516 static const uint32_t math[] = {
517 MI_MATH | (9 - 2),
518 MI_ALU2(LOAD, SRCA, R0),
519 MI_ALU1(LOAD0, SRCB),
520 MI_ALU0(ADD),
521 MI_ALU2(STOREINV, R0, ZF),
522 MI_ALU2(LOAD, SRCA, R0),
523 MI_ALU2(LOAD, SRCB, R1),
524 MI_ALU0(AND),
525 MI_ALU2(STORE, R0, ACCU),
526 };
527 iris_batch_emit(batch, math, sizeof(math));
528 }
529
530 static void
531 load_overflow_data_to_cs_gprs(struct iris_context *ice,
532 struct iris_query *q,
533 int idx)
534 {
535 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
536 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
537 uint32_t offset = q->query_state_ref.offset;
538
539 ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo, offset +
540 offsetof(struct iris_query_so_overflow,
541 stream[idx].prim_storage_needed[0]));
542 ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo, offset +
543 offsetof(struct iris_query_so_overflow,
544 stream[idx].prim_storage_needed[1]));
545
546 ice->vtbl.load_register_mem64(batch, CS_GPR(3), bo, offset +
547 offsetof(struct iris_query_so_overflow,
548 stream[idx].num_prims[0]));
549 ice->vtbl.load_register_mem64(batch, CS_GPR(4), bo, offset +
550 offsetof(struct iris_query_so_overflow,
551 stream[idx].num_prims[1]));
552 }
553
554 /*
555 * R3 = R4 - R3;
556 * R1 = R2 - R1;
557 * R1 = R3 - R1;
558 * R0 = R0 | R1;
559 */
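/* With the GPRs as loaded by load_overflow_data_to_cs_gprs() (R1/R2 hold
 * prim_storage_needed[0]/[1], R3/R4 hold num_prims[0]/[1]), this ORs a
 * non-zero value into R0 exactly when the two deltas differ, i.e. when the
 * stream overflowed.  gpr0_to_bool() normalizes R0 to 0/1 afterwards.
 */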
560 static void
561 calc_overflow_for_stream(struct iris_context *ice)
562 {
563 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
564 static const uint32_t maths[] = {
565 MI_MATH | (17 - 2),
566 MI_ALU2(LOAD, SRCA, R4),
567 MI_ALU2(LOAD, SRCB, R3),
568 MI_ALU0(SUB),
569 MI_ALU2(STORE, R3, ACCU),
570 MI_ALU2(LOAD, SRCA, R2),
571 MI_ALU2(LOAD, SRCB, R1),
572 MI_ALU0(SUB),
573 MI_ALU2(STORE, R1, ACCU),
574 MI_ALU2(LOAD, SRCA, R3),
575 MI_ALU2(LOAD, SRCB, R1),
576 MI_ALU0(SUB),
577 MI_ALU2(STORE, R1, ACCU),
578 MI_ALU2(LOAD, SRCA, R1),
579 MI_ALU2(LOAD, SRCB, R0),
580 MI_ALU0(OR),
581 MI_ALU2(STORE, R0, ACCU),
582 };
583
584 iris_batch_emit(batch, maths, sizeof(maths));
585 }
586
587 static void
588 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
589 {
590 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
591
592 ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
593
594 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
595 load_overflow_data_to_cs_gprs(ice, q, q->index);
596 calc_overflow_for_stream(ice);
597 } else {
598 for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
599 load_overflow_data_to_cs_gprs(ice, q, i);
600 calc_overflow_for_stream(ice);
601 }
602 }
603
604 gpr0_to_bool(ice);
605 }
606
607 /*
608 * GPR0 = GPR0 & ((1ull << n) -1);
609 */
610 static void
611 keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
612 {
613 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
614
615 ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
616 static const uint32_t math[] = {
617 MI_MATH | (5 - 2),
618 MI_ALU2(LOAD, SRCA, R0),
619 MI_ALU2(LOAD, SRCB, R1),
620 MI_ALU0(AND),
621 MI_ALU2(STORE, R0, ACCU),
622 };
623 iris_batch_emit(batch, math, sizeof(math));
624 }
625
626 /*
627 * GPR0 = GPR0 << 30;
628 */
629 static void
630 shl_gpr0_by_30_bits(struct iris_context *ice)
631 {
632 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
633    /* First keep only the low 34 bits of GPR0 so the 30-bit shift below cannot overflow 64 bits. */
634 keep_gpr0_lower_n_bits(ice, 34);
635
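   /* Each LOAD/LOAD/ADD/STORE quartet below doubles GPR0.  Five MI_MATH
    * packets of six doublings each gives the 30 doublings that make up the
    * shift.
    */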
636 static const uint32_t shl_math[] = {
637 MI_ALU2(LOAD, SRCA, R0),
638 MI_ALU2(LOAD, SRCB, R0),
639 MI_ALU0(ADD),
640 MI_ALU2(STORE, R0, ACCU),
641 };
642
643 const uint32_t outer_count = 5;
644 const uint32_t inner_count = 6;
645 const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
646 const uint32_t batch_len = cmd_len * outer_count;
647 uint32_t *map = iris_get_command_space(batch, batch_len * 4);
648 uint32_t offset = 0;
649 for (int o = 0; o < outer_count; o++) {
650 map[offset++] = MI_MATH | (cmd_len - 2);
651 for (int i = 0; i < inner_count; i++) {
652 memcpy(&map[offset], shl_math, sizeof(shl_math));
653 offset += 4;
654 }
655 }
656 }
657
658 /*
659 * GPR0 = GPR0 >> 2;
660 *
661 * Note that the upper 30 bits of GPR0 are lost!
662 */
663 static void
664 shr_gpr0_by_2_bits(struct iris_context *ice)
665 {
666 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
667 shl_gpr0_by_30_bits(ice);
668 ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
669 ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
670 }
671
672 /**
673 * Calculate the result and store it to CS_GPR0.
674 */
675 static void
676 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
677 {
678 struct iris_batch *batch = &ice->batches[q->batch_idx];
679 struct iris_screen *screen = (void *) ice->ctx.screen;
680 const struct gen_device_info *devinfo = &batch->screen->devinfo;
681 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
682 uint32_t offset = q->query_state_ref.offset;
683
684 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
685 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
686 overflow_result_to_gpr0(ice, q);
687 return;
688 }
689
690 if (q->type == PIPE_QUERY_TIMESTAMP) {
691 ice->vtbl.load_register_mem64(batch, CS_GPR(0), bo,
692 offset +
693 offsetof(struct iris_query_snapshots, start));
694 /* TODO: This discards any fractional bits of the timebase scale.
695 * We would need to do a bit of fixed point math on the CS ALU, or
696 * launch an actual shader to calculate this with full precision.
697 */
698 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
699 keep_gpr0_lower_n_bits(ice, 36);
700 return;
701 }
702
703 ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo,
704 offset +
705 offsetof(struct iris_query_snapshots, start));
706 ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo,
707 offset +
708 offsetof(struct iris_query_snapshots, end));
709
710 static const uint32_t math[] = {
711 MI_MATH | (5 - 2),
712 MI_ALU2(LOAD, SRCA, R2),
713 MI_ALU2(LOAD, SRCB, R1),
714 MI_ALU0(SUB),
715 MI_ALU2(STORE, R0, ACCU),
716 };
717 iris_batch_emit(batch, math, sizeof(math));
718
719 /* WaDividePSInvocationCountBy4:HSW,BDW */
720 if (devinfo->gen == 8 &&
721 q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
722 q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
723 shr_gpr0_by_2_bits(ice);
724
725 if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
726 q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
727 gpr0_to_bool(ice);
728
729 if (q->type == PIPE_QUERY_TIME_ELAPSED) {
730 /* TODO: This discards fractional bits (see above). */
731 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
732 }
733 }
734
735 static struct pipe_query *
736 iris_create_query(struct pipe_context *ctx,
737 unsigned query_type,
738 unsigned index)
739 {
740 struct iris_query *q = calloc(1, sizeof(struct iris_query));
741
742 q->type = query_type;
743 q->index = index;
744
745 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
746 q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
747 q->batch_idx = IRIS_BATCH_COMPUTE;
748 else
749 q->batch_idx = IRIS_BATCH_RENDER;
750 return (struct pipe_query *) q;
751 }
752
753 static void
754 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
755 {
756 struct iris_query *query = (void *) p_query;
757 struct iris_screen *screen = (void *) ctx->screen;
758 iris_syncpt_reference(screen, &query->syncpt, NULL);
759 free(query);
760 }
761
762
763 static boolean
764 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
765 {
766 struct iris_context *ice = (void *) ctx;
767 struct iris_query *q = (void *) query;
768 void *ptr = NULL;
769 uint32_t size;
770
771 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
772 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
773 size = sizeof(struct iris_query_so_overflow);
774 else
775 size = sizeof(struct iris_query_snapshots);
776
777 u_upload_alloc(ice->query_buffer_uploader, 0,
778 size, size, &q->query_state_ref.offset,
779 &q->query_state_ref.res, &ptr);
780
781 if (!iris_resource_bo(q->query_state_ref.res))
782 return false;
783
784 q->map = ptr;
785 if (!q->map)
786 return false;
787
788 q->result = 0ull;
789 q->ready = false;
790 q->map->snapshots_landed = false;
791
792 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
793 ice->state.prims_generated_query_active = true;
794 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
795 }
796
797 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
798 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
799 write_overflow_values(ice, q, false);
800 else
801 write_value(ice, q,
802 q->query_state_ref.offset +
803 offsetof(struct iris_query_snapshots, start));
804
805 return true;
806 }
807
808 static bool
809 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
810 {
811 struct iris_context *ice = (void *) ctx;
812 struct iris_query *q = (void *) query;
813 struct iris_batch *batch = &ice->batches[q->batch_idx];
814 struct iris_screen *screen = (void *) ctx->screen;
815
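   /* TIMESTAMP queries have no begin; take the single snapshot now, at
    * end_query time.  calculate_result_on_cpu() reads it back from the
    * "start" slot.
    */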
816 if (q->type == PIPE_QUERY_TIMESTAMP) {
817 iris_begin_query(ctx, query);
818 struct iris_syncpt *syncpt =
819 ((struct iris_syncpt **) util_dynarray_begin(&batch->syncpts))[0];
820 iris_syncpt_reference(screen, &q->syncpt, syncpt);
821 mark_available(ice, q);
822 return true;
823 }
824
825 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
826 ice->state.prims_generated_query_active = false;
827 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
828 }
829
830 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
831 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
832 write_overflow_values(ice, q, true);
833 else
834 write_value(ice, q,
835 q->query_state_ref.offset +
836 offsetof(struct iris_query_snapshots, end));
837
838 struct iris_syncpt *syncpt =
839 ((struct iris_syncpt **) util_dynarray_begin(&batch->syncpts))[0];
840 iris_syncpt_reference(screen, &q->syncpt, syncpt);
841 mark_available(ice, q);
842
843 return true;
844 }
845
846 /**
847 * See if the snapshots have landed for a query, and if so, compute the
848 * result and mark it ready. Does not flush (unlike iris_get_query_result).
849 */
850 static void
851 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
852 {
853 struct iris_screen *screen = (void *) ice->ctx.screen;
854 const struct gen_device_info *devinfo = &screen->devinfo;
855
856 if (!q->ready && q->map->snapshots_landed) {
857 calculate_result_on_cpu(devinfo, q);
858 }
859 }
860
861 static boolean
862 iris_get_query_result(struct pipe_context *ctx,
863 struct pipe_query *query,
864 boolean wait,
865 union pipe_query_result *result)
866 {
867 struct iris_context *ice = (void *) ctx;
868 struct iris_query *q = (void *) query;
869 struct iris_screen *screen = (void *) ctx->screen;
870 const struct gen_device_info *devinfo = &screen->devinfo;
871 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
872
873 if (!q->ready) {
874 if (iris_batch_references(&ice->batches[q->batch_idx], bo))
875 iris_batch_flush(&ice->batches[q->batch_idx]);
876
877 while (!q->map->snapshots_landed) {
878 if (wait)
879 iris_wait_syncpt(ctx->screen, q->syncpt, INT64_MAX);
880 else
881 return false;
882 }
883
884 assert(q->map->snapshots_landed);
885 calculate_result_on_cpu(devinfo, q);
886 }
887
888 assert(q->ready);
889
890 result->u64 = q->result;
891
892 return true;
893 }
894
895 static void
896 iris_get_query_result_resource(struct pipe_context *ctx,
897 struct pipe_query *query,
898 boolean wait,
899 enum pipe_query_value_type result_type,
900 int index,
901 struct pipe_resource *p_res,
902 unsigned offset)
903 {
904 struct iris_context *ice = (void *) ctx;
905 struct iris_query *q = (void *) query;
906 struct iris_batch *batch = &ice->batches[q->batch_idx];
907 const struct gen_device_info *devinfo = &batch->screen->devinfo;
908 struct iris_resource *res = (void *) p_res;
909 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
910    unsigned snapshots_landed_offset = q->query_state_ref.offset +
911       offsetof(struct iris_query_snapshots, snapshots_landed);
912
913 res->bind_history |= PIPE_BIND_QUERY_BUFFER;
914
915 if (index == -1) {
916 /* They're asking for the availability of the result. If we still
917 * have commands queued up which produce the result, submit them
918 * now so that progress happens. Either way, copy the snapshots
919 * landed field to the destination resource.
920 */
921 if (iris_batch_references(batch, bo))
922 iris_batch_flush(batch);
923
924 ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
925 bo, snapshots_landed_offset,
926 result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
927 return;
928 }
929
930 if (!q->ready && q->map->snapshots_landed) {
931 /* The final snapshots happen to have landed, so let's just compute
932 * the result on the CPU now...
933 */
934 calculate_result_on_cpu(devinfo, q);
935 }
936
937 if (q->ready) {
938 /* We happen to have the result on the CPU, so just copy it. */
939 if (result_type <= PIPE_QUERY_TYPE_U32) {
940 ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
941 q->result);
942 } else {
943 ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
944 q->result);
945 }
946
947       /* Make sure the result lands before they bind the QBO elsewhere
948 * and use the result.
949 */
950 // XXX: Why? i965 doesn't do this.
951 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
952 return;
953 }
954
955    /* Calculate the result into CS_GPR0 */
956 calculate_result_on_gpu(ice, q);
957
958 bool predicated = !wait && !q->stalled;
959
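   /* If the caller isn't willing to wait and we haven't already stalled,
    * predicate the result write on snapshots_landed; the intent is that a
    * not-yet-finished query leaves the destination unmodified rather than
    * storing a stale GPR0 value.
    */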
960 if (predicated) {
961 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
962 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
963 snapshots_landed_offset);
964 uint32_t predicate = MI_PREDICATE |
965 MI_PREDICATE_LOADOP_LOADINV |
966 MI_PREDICATE_COMBINEOP_SET |
967 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
968 iris_batch_emit(batch, &predicate, sizeof(uint32_t));
969 }
970
971 if (result_type <= PIPE_QUERY_TYPE_U32) {
972 ice->vtbl.store_register_mem32(batch, CS_GPR(0),
973 iris_resource_bo(p_res),
974 offset, predicated);
975 } else {
976 ice->vtbl.store_register_mem64(batch, CS_GPR(0),
977 iris_resource_bo(p_res),
978 offset, predicated);
979 }
980 }
981
982 static void
983 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
984 {
985 struct iris_context *ice = (void *) ctx;
986
987 if (ice->state.statistics_counters_enabled == enable)
988 return;
989
990 // XXX: most packets aren't paying attention to this yet, because it'd
991 // have to be done dynamically at draw time, which is a pain
992 ice->state.statistics_counters_enabled = enable;
993 ice->state.dirty |= IRIS_DIRTY_CLIP |
994 IRIS_DIRTY_GS |
995 IRIS_DIRTY_RASTER |
996 IRIS_DIRTY_STREAMOUT |
997 IRIS_DIRTY_TCS |
998 IRIS_DIRTY_TES |
999 IRIS_DIRTY_VS |
1000 IRIS_DIRTY_WM;
1001 }
1002
1003 static void
1004 set_predicate_enable(struct iris_context *ice, bool value)
1005 {
1006 if (value)
1007 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1008 else
1009 ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
1010 }
1011
1012 static void
1013 set_predicate_for_result(struct iris_context *ice,
1014 struct iris_query *q,
1015 bool inverted)
1016 {
1017 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
1018 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
1019
1020 /* The CPU doesn't have the query result yet; use hardware predication */
1021 ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
1022
1023 /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
1024 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
1025 q->stalled = true;
1026
1027 switch (q->type) {
1028 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1029 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1030 overflow_result_to_gpr0(ice, q);
1031
1032 ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
1033 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
1034 break;
1035 default:
1036 /* PIPE_QUERY_OCCLUSION_* */
1037 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
1038 offsetof(struct iris_query_snapshots, start) +
1039 q->query_state_ref.offset);
1040 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
1041 offsetof(struct iris_query_snapshots, end) +
1042 q->query_state_ref.offset);
1043 break;
1044 }
1045
1046 uint32_t mi_predicate = MI_PREDICATE |
1047 MI_PREDICATE_COMBINEOP_SET |
1048 MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
1049 (inverted ? MI_PREDICATE_LOADOP_LOAD
1050 : MI_PREDICATE_LOADOP_LOADINV);
1051 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
1052
1053 /* We immediately set the predicate on the render batch, as all the
1054 * counters come from 3D operations. However, we may need to predicate
1055 * a compute dispatch, which executes in a different GEM context and has
1056 * a different MI_PREDICATE_DATA register. So, we save the result to
1057 * memory and reload it in iris_launch_grid.
1058 */
1059 unsigned offset = q->query_state_ref.offset +
1060 offsetof(struct iris_query_snapshots, predicate_data);
1061 ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
1062 bo, offset, false);
1063 ice->state.compute_predicate = bo;
1064 }
1065
1066 static void
1067 iris_render_condition(struct pipe_context *ctx,
1068 struct pipe_query *query,
1069 boolean condition,
1070 enum pipe_render_cond_flag mode)
1071 {
1072 struct iris_context *ice = (void *) ctx;
1073 struct iris_query *q = (void *) query;
1074
1075 /* The old condition isn't relevant; we'll update it if necessary */
1076 ice->state.compute_predicate = NULL;
1077
1078 if (!q) {
1079 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1080 return;
1081 }
1082
1083 iris_check_query_no_flush(ice, q);
1084
1085 if (q->result || q->ready) {
1086 set_predicate_enable(ice, (q->result != 0) ^ condition);
1087 } else {
1088 if (mode == PIPE_RENDER_COND_NO_WAIT ||
1089 mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
1090 perf_debug(&ice->dbg, "Conditional rendering demoted from "
1091 "\"no wait\" to \"wait\".");
1092 }
1093 set_predicate_for_result(ice, q, condition);
1094 }
1095 }
1096
1097 void
1098 iris_init_query_functions(struct pipe_context *ctx)
1099 {
1100 ctx->create_query = iris_create_query;
1101 ctx->destroy_query = iris_destroy_query;
1102 ctx->begin_query = iris_begin_query;
1103 ctx->end_query = iris_end_query;
1104 ctx->get_query_result = iris_get_query_result;
1105 ctx->get_query_result_resource = iris_get_query_result_resource;
1106 ctx->set_active_query_state = iris_set_active_query_state;
1107 ctx->render_condition = iris_render_condition;
1108 }