iris: Make an iris_batch_reference_signal_syncpt helper function.
[mesa.git] src/gallium/drivers/iris/iris_query.c
1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_query.c
25 *
26 * Query object support. This allows measuring various simple statistics
27 * via counters on the GPU.
28 */
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "util/u_upload_mgr.h"
39 #include "iris_context.h"
40 #include "iris_defines.h"
41 #include "iris_fence.h"
42 #include "iris_resource.h"
43 #include "iris_screen.h"
44 #include "vulkan/util/vk_util.h"
45
46 #define IA_VERTICES_COUNT 0x2310
47 #define IA_PRIMITIVES_COUNT 0x2318
48 #define VS_INVOCATION_COUNT 0x2320
49 #define HS_INVOCATION_COUNT 0x2300
50 #define DS_INVOCATION_COUNT 0x2308
51 #define GS_INVOCATION_COUNT 0x2328
52 #define GS_PRIMITIVES_COUNT 0x2330
53 #define CL_INVOCATION_COUNT 0x2338
54 #define CL_PRIMITIVES_COUNT 0x2340
55 #define PS_INVOCATION_COUNT 0x2348
56 #define CS_INVOCATION_COUNT 0x2290
57 #define PS_DEPTH_COUNT 0x2350
58
59 #define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
60
61 #define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
62
63 #define MI_MATH (0x1a << 23)
64
65 #define MI_ALU_LOAD 0x080
66 #define MI_ALU_LOADINV 0x480
67 #define MI_ALU_LOAD0 0x081
68 #define MI_ALU_LOAD1 0x481
69 #define MI_ALU_ADD 0x100
70 #define MI_ALU_SUB 0x101
71 #define MI_ALU_AND 0x102
72 #define MI_ALU_OR 0x103
73 #define MI_ALU_XOR 0x104
74 #define MI_ALU_STORE 0x180
75 #define MI_ALU_STOREINV 0x580
76
77 #define MI_ALU_R0 0x00
78 #define MI_ALU_R1 0x01
79 #define MI_ALU_R2 0x02
80 #define MI_ALU_R3 0x03
81 #define MI_ALU_R4 0x04
82 #define MI_ALU_SRCA 0x20
83 #define MI_ALU_SRCB 0x21
84 #define MI_ALU_ACCU 0x31
85 #define MI_ALU_ZF 0x32
86 #define MI_ALU_CF 0x33
87
88 #define _MI_ALU(op, x, y) (((op) << 20) | ((x) << 10) | (y))
89
90 #define _MI_ALU0(op) _MI_ALU(MI_ALU_##op, 0, 0)
91 #define _MI_ALU1(op, x) _MI_ALU(MI_ALU_##op, x, 0)
92 #define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)
93
94 #define MI_ALU0(op) _MI_ALU0(op)
95 #define MI_ALU1(op, x) _MI_ALU1(op, MI_ALU_##x)
96 #define MI_ALU2(op, x, y) _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
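/* For example, MI_ALU2(LOAD, SRCA, R0) expands to
 * _MI_ALU(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_R0) ==
 * (0x080 << 20) | (0x20 << 10) | 0x00, i.e. a single "load SRCA from R0"
 * instruction dword for MI_MATH.
 */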
97
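/* Note: these shorthands expand to ice->vtbl calls, so they can only be
 * used where a local `ice` variable is in scope.
 */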
98 #define emit_lri32 ice->vtbl.load_register_imm32
99 #define emit_lri64 ice->vtbl.load_register_imm64
100 #define emit_lrr32 ice->vtbl.load_register_reg32
101
102 struct iris_query {
103 enum pipe_query_type type;
104 int index;
105 
106 bool ready; /* has the result been computed (and stored in "result")? */
107 
108 bool stalled; /* have we stalled the command streamer for this query's writes? */
109 
110 uint64_t result; /* the computed result, valid once "ready" is set */
111 
112 struct iris_state_ref query_state_ref; /* GPU buffer holding the snapshots */
113 struct iris_query_snapshots *map; /* CPU mapping of that buffer */
114 struct iris_syncpt *syncpt; /* syncpoint of the batch that writes the end snapshot */
115 
116 int batch_idx; /* IRIS_BATCH_RENDER or IRIS_BATCH_COMPUTE */
117 };
118
119 struct iris_query_snapshots {
120 /** iris_render_condition's saved MI_PREDICATE_DATA value. */
121 uint64_t predicate_data;
122
123 /** Have the start/end snapshots landed? */
124 uint64_t snapshots_landed;
125
126 /** Starting and ending counter snapshots */
127 uint64_t start;
128 uint64_t end;
129 };
130
131 struct iris_query_so_overflow {
132 uint64_t predicate_data;
133 uint64_t snapshots_landed;
134
135 struct {
136 uint64_t prim_storage_needed[2];
137 uint64_t num_prims[2];
138 } stream[4];
139 };
140
141 /**
142 * Is this type of query written by PIPE_CONTROL?
143 */
144 static bool
145 iris_is_query_pipelined(struct iris_query *q)
146 {
147 switch (q->type) {
148 case PIPE_QUERY_OCCLUSION_COUNTER:
149 case PIPE_QUERY_OCCLUSION_PREDICATE:
150 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
151 case PIPE_QUERY_TIMESTAMP:
152 case PIPE_QUERY_TIMESTAMP_DISJOINT:
153 case PIPE_QUERY_TIME_ELAPSED:
154 return true;
155
156 default:
157 return false;
158 }
159 }
160
161 static void
162 mark_available(struct iris_context *ice, struct iris_query *q)
163 {
164 struct iris_batch *batch = &ice->batches[q->batch_idx];
165 unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
166 unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
167 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
168 offset += q->query_state_ref.offset;
169
170 if (!iris_is_query_pipelined(q)) {
171 ice->vtbl.store_data_imm64(batch, bo, offset, true);
172 } else {
173 /* Order available *after* the query results. */
174 flags |= PIPE_CONTROL_FLUSH_ENABLE;
175 iris_emit_pipe_control_write(batch, flags, bo, offset, true);
176 }
177 }
178
179 /**
180 * Write a query snapshot (PS_DEPTH_COUNT or a timestamp, per \p flags) to
181 * the given offset in the query's buffer via a PIPE_CONTROL.
181 */
182 static void
183 iris_pipelined_write(struct iris_batch *batch,
184 struct iris_query *q,
185 enum pipe_control_flags flags,
186 unsigned offset)
187 {
188 const struct gen_device_info *devinfo = &batch->screen->devinfo;
189 const unsigned optional_cs_stall =
190 devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
191 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
192
193 iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
194 bo, offset, 0ull);
195 }
196
197 static void
198 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
199 {
200 struct iris_batch *batch = &ice->batches[q->batch_idx];
201 const struct gen_device_info *devinfo = &batch->screen->devinfo;
202 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
203
204 if (!iris_is_query_pipelined(q)) {
205 iris_emit_pipe_control_flush(batch,
206 PIPE_CONTROL_CS_STALL |
207 PIPE_CONTROL_STALL_AT_SCOREBOARD);
208 q->stalled = true;
209 }
210
211 switch (q->type) {
212 case PIPE_QUERY_OCCLUSION_COUNTER:
213 case PIPE_QUERY_OCCLUSION_PREDICATE:
214 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
215 if (devinfo->gen >= 10) {
216 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
217 * bit set prior to programming a PIPE_CONTROL with Write PS Depth
218 * Count sync operation."
219 */
220 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
221 }
222 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
223 PIPE_CONTROL_WRITE_DEPTH_COUNT |
224 PIPE_CONTROL_DEPTH_STALL,
225 offset);
226 break;
227 case PIPE_QUERY_TIME_ELAPSED:
228 case PIPE_QUERY_TIMESTAMP:
229 case PIPE_QUERY_TIMESTAMP_DISJOINT:
230 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
231 PIPE_CONTROL_WRITE_TIMESTAMP,
232 offset);
233 break;
234 case PIPE_QUERY_PRIMITIVES_GENERATED:
235 ice->vtbl.store_register_mem64(batch,
236 q->index == 0 ? CL_INVOCATION_COUNT :
237 SO_PRIM_STORAGE_NEEDED(q->index),
238 bo, offset, false);
239 break;
240 case PIPE_QUERY_PRIMITIVES_EMITTED:
241 ice->vtbl.store_register_mem64(batch,
242 SO_NUM_PRIMS_WRITTEN(q->index),
243 bo, offset, false);
244 break;
245 case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
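/* q->index here is a PIPE_STAT_QUERY_* value; this table maps it to the
 * corresponding MMIO counter register.
 */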
246 static const uint32_t index_to_reg[] = {
247 IA_VERTICES_COUNT,
248 IA_PRIMITIVES_COUNT,
249 VS_INVOCATION_COUNT,
250 GS_INVOCATION_COUNT,
251 GS_PRIMITIVES_COUNT,
252 CL_INVOCATION_COUNT,
253 CL_PRIMITIVES_COUNT,
254 PS_INVOCATION_COUNT,
255 HS_INVOCATION_COUNT,
256 DS_INVOCATION_COUNT,
257 CS_INVOCATION_COUNT,
258 };
259 const uint32_t reg = index_to_reg[q->index];
260
261 ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
262 break;
263 }
264 default:
265 assert(false);
266 }
267 }
268
269 static void
270 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
271 {
272 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
273 uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
274 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
275 uint32_t offset = q->query_state_ref.offset;
276
277 iris_emit_pipe_control_flush(batch,
278 PIPE_CONTROL_CS_STALL |
279 PIPE_CONTROL_STALL_AT_SCOREBOARD);
280 for (uint32_t i = 0; i < count; i++) {
281 int s = q->index + i;
282 int g_idx = offset + offsetof(struct iris_query_so_overflow,
283 stream[s].num_prims[end]);
284 int w_idx = offset + offsetof(struct iris_query_so_overflow,
285 stream[s].prim_storage_needed[end]);
286 ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
287 bo, g_idx, false);
288 ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
289 bo, w_idx, false);
290 }
291 }
292
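/**
 * Scale a raw GPU timestamp (in ticks) to nanoseconds, using the device's
 * reported timestamp_frequency (ticks per second).
 */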
293 uint64_t
294 iris_timebase_scale(const struct gen_device_info *devinfo,
295 uint64_t gpu_timestamp)
296 {
297 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
298 }
299
300 static uint64_t
301 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
302 {
303 if (time0 > time1) {
304 return (1ULL << TIMESTAMP_BITS) + time1 - time0;
305 } else {
306 return time1 - time0;
307 }
308 }
309
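/**
 * A stream has overflowed if, over the query interval, the number of
 * primitives that needed output space differs from the number actually
 * written, i.e. some primitives were dropped for lack of SO buffer space.
 */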
310 static bool
311 stream_overflowed(struct iris_query_so_overflow *so, int s)
312 {
313 return (so->stream[s].prim_storage_needed[1] -
314 so->stream[s].prim_storage_needed[0]) !=
315 (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
316 }
317
318 static void
319 calculate_result_on_cpu(const struct gen_device_info *devinfo,
320 struct iris_query *q)
321 {
322 switch (q->type) {
323 case PIPE_QUERY_OCCLUSION_PREDICATE:
324 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
325 q->result = q->map->end != q->map->start;
326 break;
327 case PIPE_QUERY_TIMESTAMP:
328 case PIPE_QUERY_TIMESTAMP_DISJOINT:
329 /* The timestamp is the single starting snapshot. */
330 q->result = iris_timebase_scale(devinfo, q->map->start);
331 q->result &= (1ull << TIMESTAMP_BITS) - 1;
332 break;
333 case PIPE_QUERY_TIME_ELAPSED:
334 q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
335 q->result = iris_timebase_scale(devinfo, q->result);
336 q->result &= (1ull << TIMESTAMP_BITS) - 1;
337 break;
338 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
339 q->result = stream_overflowed((void *) q->map, q->index);
340 break;
341 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
342 q->result = false;
343 for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
344 q->result |= stream_overflowed((void *) q->map, i);
345 break;
346 case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
347 q->result = q->map->end - q->map->start;
348
349 /* WaDividePSInvocationCountBy4:HSW,BDW */
350 if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
351 q->result /= 4;
352 break;
353 case PIPE_QUERY_OCCLUSION_COUNTER:
354 case PIPE_QUERY_PRIMITIVES_GENERATED:
355 case PIPE_QUERY_PRIMITIVES_EMITTED:
356 default:
357 q->result = q->map->end - q->map->start;
358 break;
359 }
360
361 q->ready = true;
362 }
363
364 static void
365 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
366 unsigned reg_a, unsigned reg_b)
367 {
368 uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
369
370 math[0] = MI_MATH | (5 - 2);
371 math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
372 math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
373 math[3] = _MI_ALU0(ADD);
374 math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
375 }
376
377 static void
378 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
379 unsigned src_reg, unsigned shift)
380 {
381 assert(shift > 0);
382
383 int dwords = 1 + 4 * shift;
384
385 uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
386
387 math[0] = MI_MATH | ((1 + 4 * shift) - 2);
388
389 for (unsigned i = 0; i < shift; i++) {
390 unsigned add_src = (i == 0) ? src_reg : dst_reg;
391 math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
392 math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
393 math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
394 math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
395 }
396 }
397
398 /* Emit dwords to multiply GPR0 by N */
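/* This is a classic shift-and-add multiply: walking N's bits from the top,
 * each step doubles the running value (ADD it to itself) and, when the bit
 * is set, adds the original GPR0 back in.  For example, N = 5 (0b101)
 * produces R1 = x + x, then R1 = R1 + R1, then R0 = R1 + x = 5 * x.
 */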
399 static void
400 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
401 {
402 VK_OUTARRAY_MAKE(out, dw, dw_count);
403
404 #define APPEND_ALU(op, x, y) \
405 vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
406
407 assert(N > 0);
408 unsigned top_bit = 31 - __builtin_clz(N);
409 for (int i = top_bit - 1; i >= 0; i--) {
410 /* We get our initial data in GPR0 and we write the final data out to
411 * GPR0 but we use GPR1 as our scratch register.
412 */
413 unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
414 unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
415
416 /* Shift the current value left by 1 */
417 APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
418 APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
419 APPEND_ALU(ADD, 0, 0);
420
421 if (N & (1 << i)) {
422 /* Store ACCU to R1 and add R0 to R1 */
423 APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
424 APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
425 APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
426 APPEND_ALU(ADD, 0, 0);
427 }
428
429 APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
430 }
431
432 #undef APPEND_ALU
433 }
434
435 static void
436 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
437 {
438 uint32_t num_dwords;
439 build_alu_multiply_gpr0(NULL, &num_dwords, N);
440
441 uint32_t *math = iris_get_command_space(batch, 4 * (1 + num_dwords));
442 math[0] = MI_MATH | ((1 + num_dwords) - 2); /* header + ALU dwords */
443 build_alu_multiply_gpr0(&math[1], &num_dwords, N);
444 }
445
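/*
 * Emit commands so that GPR0 = GPR0 / D for a constant D, using the
 * reciprocal parameters from util_compute_fast_udiv_info().  Roughly:
 * q = (((n >> pre_shift) + increment) * multiplier) >> (32 + post_shift),
 * where the "+ increment" step is folded in below by adding the multiplier
 * to the 64-bit product, since n * m + m == (n + 1) * m.
 */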
446 void
447 iris_math_div32_gpr0(struct iris_context *ice,
448 struct iris_batch *batch,
449 uint32_t D)
450 {
451 /* Zero out the top of GPR0 */
452 emit_lri32(batch, CS_GPR(0) + 4, 0);
453
454 if (D == 0) {
455 /* This is invalid, but we should do something, so we set GPR0 to 0. */
456 emit_lri32(batch, CS_GPR(0), 0);
457 } else if (util_is_power_of_two_or_zero(D)) {
458 unsigned log2_D = util_logbase2(D);
459 assert(log2_D < 32);
460 /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
461 * the top 32 bits of the result.
462 */
463 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
464 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
465 emit_lri32(batch, CS_GPR(0) + 4, 0);
466 } else {
467 struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
468 assert(m.multiplier <= UINT32_MAX);
469
470 if (m.pre_shift) {
471 /* We right-shift by pre_shift by left-shifting by 32 - pre_shift and
472 * taking the top 32 bits of the result.
473 */
474 if (m.pre_shift < 32)
475 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
476 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
477 emit_lri32(batch, CS_GPR(0) + 4, 0);
478 }
479
480 /* Do the 32x32 multiply into gpr0 */
481 emit_mul_gpr0(batch, m.multiplier);
482
483 if (m.increment) {
484 /* Account for the increment: adding the multiplier once more gives n * m + m == (n + 1) * m */
485 emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
486 emit_lri32(batch, CS_GPR(1) + 4, 0);
487 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
488 }
489
490 /* Shift by 32 */
491 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
492 emit_lri32(batch, CS_GPR(0) + 4, 0);
493
494 if (m.post_shift) {
495 /* We right-shift by post_shift by left-shifting by 32 - post_shift and
496 * taking the top 32 bits of the result.
497 */
498 if (m.post_shift < 32)
499 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
500 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
501 emit_lri32(batch, CS_GPR(0) + 4, 0);
502 }
503 }
504 }
505
506 /*
507 * GPR0 = (GPR0 == 0) ? 0 : 1;
508 */
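/* The ADD of R0 and 0 sets the ALU zero flag; STOREINV of ZF then writes
 * all-ones to R0 when it was nonzero (and 0 when it was zero), and the
 * final AND with GPR1 == 1 clamps that to a 0/1 boolean.
 */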
509 static void
510 gpr0_to_bool(struct iris_context *ice)
511 {
512 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
513
514 ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
515
516 static const uint32_t math[] = {
517 MI_MATH | (9 - 2),
518 MI_ALU2(LOAD, SRCA, R0),
519 MI_ALU1(LOAD0, SRCB),
520 MI_ALU0(ADD),
521 MI_ALU2(STOREINV, R0, ZF),
522 MI_ALU2(LOAD, SRCA, R0),
523 MI_ALU2(LOAD, SRCB, R1),
524 MI_ALU0(AND),
525 MI_ALU2(STORE, R0, ACCU),
526 };
527 iris_batch_emit(batch, math, sizeof(math));
528 }
529
530 static void
531 load_overflow_data_to_cs_gprs(struct iris_context *ice,
532 struct iris_query *q,
533 int idx)
534 {
535 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
536 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
537 uint32_t offset = q->query_state_ref.offset;
538
539 ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo, offset +
540 offsetof(struct iris_query_so_overflow,
541 stream[idx].prim_storage_needed[0]));
542 ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo, offset +
543 offsetof(struct iris_query_so_overflow,
544 stream[idx].prim_storage_needed[1]));
545
546 ice->vtbl.load_register_mem64(batch, CS_GPR(3), bo, offset +
547 offsetof(struct iris_query_so_overflow,
548 stream[idx].num_prims[0]));
549 ice->vtbl.load_register_mem64(batch, CS_GPR(4), bo, offset +
550 offsetof(struct iris_query_so_overflow,
551 stream[idx].num_prims[1]));
552 }
553
554 /*
555 * R3 = R4 - R3;
556 * R1 = R2 - R1;
557 * R1 = R3 - R1;
558 * R0 = R0 | R1;
559 */
560 static void
561 calc_overflow_for_stream(struct iris_context *ice)
562 {
563 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
564 static const uint32_t maths[] = {
565 MI_MATH | (17 - 2),
566 MI_ALU2(LOAD, SRCA, R4),
567 MI_ALU2(LOAD, SRCB, R3),
568 MI_ALU0(SUB),
569 MI_ALU2(STORE, R3, ACCU),
570 MI_ALU2(LOAD, SRCA, R2),
571 MI_ALU2(LOAD, SRCB, R1),
572 MI_ALU0(SUB),
573 MI_ALU2(STORE, R1, ACCU),
574 MI_ALU2(LOAD, SRCA, R3),
575 MI_ALU2(LOAD, SRCB, R1),
576 MI_ALU0(SUB),
577 MI_ALU2(STORE, R1, ACCU),
578 MI_ALU2(LOAD, SRCA, R1),
579 MI_ALU2(LOAD, SRCB, R0),
580 MI_ALU0(OR),
581 MI_ALU2(STORE, R0, ACCU),
582 };
583
584 iris_batch_emit(batch, maths, sizeof(maths));
585 }
586
587 static void
588 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
589 {
590 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
591
592 ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
593
594 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
595 load_overflow_data_to_cs_gprs(ice, q, q->index);
596 calc_overflow_for_stream(ice);
597 } else {
598 for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
599 load_overflow_data_to_cs_gprs(ice, q, i);
600 calc_overflow_for_stream(ice);
601 }
602 }
603
604 gpr0_to_bool(ice);
605 }
606
607 /*
608 * GPR0 = GPR0 & ((1ull << n) -1);
609 */
610 static void
611 keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
612 {
613 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
614
615 ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
616 static const uint32_t math[] = {
617 MI_MATH | (5 - 2),
618 MI_ALU2(LOAD, SRCA, R0),
619 MI_ALU2(LOAD, SRCB, R1),
620 MI_ALU0(AND),
621 MI_ALU2(STORE, R0, ACCU),
622 };
623 iris_batch_emit(batch, math, sizeof(math));
624 }
625
626 /*
627 * GPR0 = GPR0 << 30;
628 */
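/* Implemented as 30 successive doublings (GPR0 += GPR0), emitted as five
 * MI_MATH packets of six doublings each.
 */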
629 static void
630 shl_gpr0_by_30_bits(struct iris_context *ice)
631 {
632 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
633 /* First, keep only the low 34 bits of GPR0 to prevent overflow. */
634 keep_gpr0_lower_n_bits(ice, 34);
635
636 static const uint32_t shl_math[] = {
637 MI_ALU2(LOAD, SRCA, R0),
638 MI_ALU2(LOAD, SRCB, R0),
639 MI_ALU0(ADD),
640 MI_ALU2(STORE, R0, ACCU),
641 };
642
643 const uint32_t outer_count = 5;
644 const uint32_t inner_count = 6;
645 const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
646 const uint32_t batch_len = cmd_len * outer_count;
647 uint32_t *map = iris_get_command_space(batch, batch_len * 4);
648 uint32_t offset = 0;
649 for (int o = 0; o < outer_count; o++) {
650 map[offset++] = MI_MATH | (cmd_len - 2);
651 for (int i = 0; i < inner_count; i++) {
652 memcpy(&map[offset], shl_math, sizeof(shl_math));
653 offset += 4;
654 }
655 }
656 }
657
658 /*
659 * GPR0 = GPR0 >> 2;
660 *
661 * Note that the upper 30 bits of GPR0 are lost!
662 */
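/* Implemented as a shift left by 30 followed by keeping only the high
 * 32 bits of the 64-bit GPR0 (a further shift right by 32), for a net
 * shift right by 2.
 */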
663 static void
664 shr_gpr0_by_2_bits(struct iris_context *ice)
665 {
666 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
667 shl_gpr0_by_30_bits(ice);
668 ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
669 ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
670 }
671
672 /**
673 * Calculate the result and store it to CS_GPR0.
674 */
675 static void
676 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
677 {
678 struct iris_batch *batch = &ice->batches[q->batch_idx];
679 struct iris_screen *screen = (void *) ice->ctx.screen;
680 const struct gen_device_info *devinfo = &batch->screen->devinfo;
681 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
682 uint32_t offset = q->query_state_ref.offset;
683
684 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
685 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
686 overflow_result_to_gpr0(ice, q);
687 return;
688 }
689
690 if (q->type == PIPE_QUERY_TIMESTAMP) {
691 ice->vtbl.load_register_mem64(batch, CS_GPR(0), bo,
692 offset +
693 offsetof(struct iris_query_snapshots, start));
694 /* TODO: This discards any fractional bits of the timebase scale.
695 * We would need to do a bit of fixed point math on the CS ALU, or
696 * launch an actual shader to calculate this with full precision.
697 */
698 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
699 keep_gpr0_lower_n_bits(ice, 36);
700 return;
701 }
702
703 ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo,
704 offset +
705 offsetof(struct iris_query_snapshots, start));
706 ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo,
707 offset +
708 offsetof(struct iris_query_snapshots, end));
709
710 static const uint32_t math[] = {
711 MI_MATH | (5 - 2),
712 MI_ALU2(LOAD, SRCA, R2),
713 MI_ALU2(LOAD, SRCB, R1),
714 MI_ALU0(SUB),
715 MI_ALU2(STORE, R0, ACCU),
716 };
717 iris_batch_emit(batch, math, sizeof(math));
718
719 /* WaDividePSInvocationCountBy4:HSW,BDW */
720 if (devinfo->gen == 8 &&
721 q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
722 q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
723 shr_gpr0_by_2_bits(ice);
724
725 if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
726 q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
727 gpr0_to_bool(ice);
728
729 if (q->type == PIPE_QUERY_TIME_ELAPSED) {
730 /* TODO: This discards fractional bits (see above). */
731 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
732 }
733 }
734
735 static struct pipe_query *
736 iris_create_query(struct pipe_context *ctx,
737 unsigned query_type,
738 unsigned index)
739 {
740 struct iris_query *q = calloc(1, sizeof(struct iris_query));
741
742 q->type = query_type;
743 q->index = index;
744
745 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
746 q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
747 q->batch_idx = IRIS_BATCH_COMPUTE;
748 else
749 q->batch_idx = IRIS_BATCH_RENDER;
750 return (struct pipe_query *) q;
751 }
752
753 static void
754 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
755 {
756 struct iris_query *query = (void *) p_query;
757 struct iris_screen *screen = (void *) ctx->screen;
758 iris_syncpt_reference(screen, &query->syncpt, NULL);
759 free(query);
760 }
761
762
763 static boolean
764 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
765 {
766 struct iris_context *ice = (void *) ctx;
767 struct iris_query *q = (void *) query;
768 void *ptr = NULL;
769 uint32_t size;
770
771 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
772 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
773 size = sizeof(struct iris_query_so_overflow);
774 else
775 size = sizeof(struct iris_query_snapshots);
776
777 u_upload_alloc(ice->query_buffer_uploader, 0,
778 size, size, &q->query_state_ref.offset,
779 &q->query_state_ref.res, &ptr);
780
781 if (!iris_resource_bo(q->query_state_ref.res))
782 return false;
783
784 q->map = ptr;
785 if (!q->map)
786 return false;
787
788 q->result = 0ull;
789 q->ready = false;
790 WRITE_ONCE(q->map->snapshots_landed, false);
791
792 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
793 ice->state.prims_generated_query_active = true;
794 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
795 }
796
797 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
798 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
799 write_overflow_values(ice, q, false);
800 else
801 write_value(ice, q,
802 q->query_state_ref.offset +
803 offsetof(struct iris_query_snapshots, start));
804
805 return true;
806 }
807
808 static bool
809 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
810 {
811 struct iris_context *ice = (void *) ctx;
812 struct iris_query *q = (void *) query;
813 struct iris_batch *batch = &ice->batches[q->batch_idx];
814
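/* PIPE_QUERY_TIMESTAMP has no "begin"; iris_begin_query is reused here to
 * allocate the snapshot buffer and write the single timestamp snapshot.
 */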
815 if (q->type == PIPE_QUERY_TIMESTAMP) {
816 iris_begin_query(ctx, query);
817 iris_batch_reference_signal_syncpt(batch, &q->syncpt);
818 mark_available(ice, q);
819 return true;
820 }
821
822 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
823 ice->state.prims_generated_query_active = false;
824 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
825 }
826
827 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
828 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
829 write_overflow_values(ice, q, true);
830 else
831 write_value(ice, q,
832 q->query_state_ref.offset +
833 offsetof(struct iris_query_snapshots, end));
834
835 iris_batch_reference_signal_syncpt(batch, &q->syncpt);
836 mark_available(ice, q);
837
838 return true;
839 }
840
841 /**
842 * See if the snapshots have landed for a query, and if so, compute the
843 * result and mark it ready. Does not flush (unlike iris_get_query_result).
844 */
845 static void
846 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
847 {
848 struct iris_screen *screen = (void *) ice->ctx.screen;
849 const struct gen_device_info *devinfo = &screen->devinfo;
850
851 if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
852 calculate_result_on_cpu(devinfo, q);
853 }
854 }
855
856 static boolean
857 iris_get_query_result(struct pipe_context *ctx,
858 struct pipe_query *query,
859 boolean wait,
860 union pipe_query_result *result)
861 {
862 struct iris_context *ice = (void *) ctx;
863 struct iris_query *q = (void *) query;
864 struct iris_screen *screen = (void *) ctx->screen;
865 const struct gen_device_info *devinfo = &screen->devinfo;
866 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
867
868 if (!q->ready) {
869 if (iris_batch_references(&ice->batches[q->batch_idx], bo))
870 iris_batch_flush(&ice->batches[q->batch_idx]);
871
872 while (!READ_ONCE(q->map->snapshots_landed)) {
873 if (wait)
874 iris_wait_syncpt(ctx->screen, q->syncpt, INT64_MAX);
875 else
876 return false;
877 }
878
879 assert(READ_ONCE(q->map->snapshots_landed));
880 calculate_result_on_cpu(devinfo, q);
881 }
882
883 assert(q->ready);
884
885 result->u64 = q->result;
886
887 return true;
888 }
889
890 static void
891 iris_get_query_result_resource(struct pipe_context *ctx,
892 struct pipe_query *query,
893 boolean wait,
894 enum pipe_query_value_type result_type,
895 int index,
896 struct pipe_resource *p_res,
897 unsigned offset)
898 {
899 struct iris_context *ice = (void *) ctx;
900 struct iris_query *q = (void *) query;
901 struct iris_batch *batch = &ice->batches[q->batch_idx];
902 const struct gen_device_info *devinfo = &batch->screen->devinfo;
903 struct iris_resource *res = (void *) p_res;
904 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
905 unsigned snapshots_landed_offset =
906 offsetof(struct iris_query_snapshots, snapshots_landed);
907
908 res->bind_history |= PIPE_BIND_QUERY_BUFFER;
909
910 if (index == -1) {
911 /* They're asking for the availability of the result. If we still
912 * have commands queued up which produce the result, submit them
913 * now so that progress happens. Either way, copy the snapshots
914 * landed field to the destination resource.
915 */
916 if (iris_batch_references(batch, bo))
917 iris_batch_flush(batch);
918
919 ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
920 bo, snapshots_landed_offset,
921 result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
922 return;
923 }
924
925 if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
926 /* The final snapshots happen to have landed, so let's just compute
927 * the result on the CPU now...
928 */
929 calculate_result_on_cpu(devinfo, q);
930 }
931
932 if (q->ready) {
933 /* We happen to have the result on the CPU, so just copy it. */
934 if (result_type <= PIPE_QUERY_TYPE_U32) {
935 ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
936 q->result);
937 } else {
938 ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
939 q->result);
940 }
941
942 /* Make sure the result lands before they bind the QBO elsewhere
943 * and use the result.
944 */
945 // XXX: Why? i965 doesn't do this.
946 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
947 return;
948 }
949
950 /* Calculate the result to CS_GPR0 */
951 calculate_result_on_gpu(ice, q);
952
953 bool predicated = !wait && !q->stalled;
954
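/* If we can't wait and the snapshots may not have landed yet, predicate
 * the final GPR0 store on snapshots_landed being nonzero, so the store is
 * skipped rather than writing a stale value.
 */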
955 if (predicated) {
956 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
957 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
958 snapshots_landed_offset);
959 uint32_t predicate = MI_PREDICATE |
960 MI_PREDICATE_LOADOP_LOADINV |
961 MI_PREDICATE_COMBINEOP_SET |
962 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
963 iris_batch_emit(batch, &predicate, sizeof(uint32_t));
964 }
965
966 if (result_type <= PIPE_QUERY_TYPE_U32) {
967 ice->vtbl.store_register_mem32(batch, CS_GPR(0),
968 iris_resource_bo(p_res),
969 offset, predicated);
970 } else {
971 ice->vtbl.store_register_mem64(batch, CS_GPR(0),
972 iris_resource_bo(p_res),
973 offset, predicated);
974 }
975 }
976
977 static void
978 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
979 {
980 struct iris_context *ice = (void *) ctx;
981
982 if (ice->state.statistics_counters_enabled == enable)
983 return;
984
985 // XXX: most packets aren't paying attention to this yet, because it'd
986 // have to be done dynamically at draw time, which is a pain
987 ice->state.statistics_counters_enabled = enable;
988 ice->state.dirty |= IRIS_DIRTY_CLIP |
989 IRIS_DIRTY_GS |
990 IRIS_DIRTY_RASTER |
991 IRIS_DIRTY_STREAMOUT |
992 IRIS_DIRTY_TCS |
993 IRIS_DIRTY_TES |
994 IRIS_DIRTY_VS |
995 IRIS_DIRTY_WM;
996 }
997
998 static void
999 set_predicate_enable(struct iris_context *ice, bool value)
1000 {
1001 if (value)
1002 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1003 else
1004 ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
1005 }
1006
1007 static void
1008 set_predicate_for_result(struct iris_context *ice,
1009 struct iris_query *q,
1010 bool inverted)
1011 {
1012 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
1013 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
1014
1015 /* The CPU doesn't have the query result yet; use hardware predication */
1016 ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
1017
1018 /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
1019 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
1020 q->stalled = true;
1021
1022 switch (q->type) {
1023 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1024 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1025 overflow_result_to_gpr0(ice, q);
1026
1027 ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
1028 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
1029 break;
1030 default:
1031 /* PIPE_QUERY_OCCLUSION_* */
1032 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
1033 offsetof(struct iris_query_snapshots, start) +
1034 q->query_state_ref.offset);
1035 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
1036 offsetof(struct iris_query_snapshots, end) +
1037 q->query_state_ref.offset);
1038 break;
1039 }
1040
1041 uint32_t mi_predicate = MI_PREDICATE |
1042 MI_PREDICATE_COMBINEOP_SET |
1043 MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
1044 (inverted ? MI_PREDICATE_LOADOP_LOAD
1045 : MI_PREDICATE_LOADOP_LOADINV);
1046 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
1047
1048 /* We immediately set the predicate on the render batch, as all the
1049 * counters come from 3D operations. However, we may need to predicate
1050 * a compute dispatch, which executes in a different GEM context and has
1051 * a different MI_PREDICATE_DATA register. So, we save the result to
1052 * memory and reload it in iris_launch_grid.
1053 */
1054 unsigned offset = q->query_state_ref.offset +
1055 offsetof(struct iris_query_snapshots, predicate_data);
1056 ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
1057 bo, offset, false);
1058 ice->state.compute_predicate = bo;
1059 }
1060
1061 static void
1062 iris_render_condition(struct pipe_context *ctx,
1063 struct pipe_query *query,
1064 boolean condition,
1065 enum pipe_render_cond_flag mode)
1066 {
1067 struct iris_context *ice = (void *) ctx;
1068 struct iris_query *q = (void *) query;
1069
1070 /* The old condition isn't relevant; we'll update it if necessary */
1071 ice->state.compute_predicate = NULL;
1072
1073 if (!q) {
1074 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1075 return;
1076 }
1077
1078 iris_check_query_no_flush(ice, q);
1079
1080 if (q->result || q->ready) {
1081 set_predicate_enable(ice, (q->result != 0) ^ condition);
1082 } else {
1083 if (mode == PIPE_RENDER_COND_NO_WAIT ||
1084 mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
1085 perf_debug(&ice->dbg, "Conditional rendering demoted from "
1086 "\"no wait\" to \"wait\".");
1087 }
1088 set_predicate_for_result(ice, q, condition);
1089 }
1090 }
1091
1092 void
1093 iris_init_query_functions(struct pipe_context *ctx)
1094 {
1095 ctx->create_query = iris_create_query;
1096 ctx->destroy_query = iris_destroy_query;
1097 ctx->begin_query = iris_begin_query;
1098 ctx->end_query = iris_end_query;
1099 ctx->get_query_result = iris_get_query_result;
1100 ctx->get_query_result_resource = iris_get_query_result_resource;
1101 ctx->set_active_query_state = iris_set_active_query_state;
1102 ctx->render_condition = iris_render_condition;
1103 }