/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * @file iris_query.c
 *
 * Query object support.  This allows measuring various simple statistics
 * via counters on the GPU.
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "util/fast_idiv_by_const.h"
#include "util/u_inlines.h"
#include "iris_context.h"
#include "iris_defines.h"
#include "iris_resource.h"
#include "iris_screen.h"
#include "vulkan/util/vk_util.h"

#define IA_VERTICES_COUNT          0x2310
#define IA_PRIMITIVES_COUNT        0x2318
#define VS_INVOCATION_COUNT        0x2320
#define HS_INVOCATION_COUNT        0x2300
#define DS_INVOCATION_COUNT        0x2308
#define GS_INVOCATION_COUNT        0x2328
#define GS_PRIMITIVES_COUNT        0x2330
#define CL_INVOCATION_COUNT        0x2338
#define CL_PRIMITIVES_COUNT        0x2340
#define PS_INVOCATION_COUNT        0x2348
#define CS_INVOCATION_COUNT        0x2290
#define PS_DEPTH_COUNT             0x2350

#define SO_PRIM_STORAGE_NEEDED(n)  (0x5240 + (n) * 8)

#define SO_NUM_PRIMS_WRITTEN(n)    (0x5200 + (n) * 8)

#define MI_MATH (0x1a << 23)

#define MI_ALU_LOAD      0x080
#define MI_ALU_LOADINV   0x480
#define MI_ALU_LOAD0     0x081
#define MI_ALU_LOAD1     0x481
#define MI_ALU_ADD       0x100
#define MI_ALU_SUB       0x101
#define MI_ALU_AND       0x102
#define MI_ALU_OR        0x103
#define MI_ALU_XOR       0x104
#define MI_ALU_STORE     0x180
#define MI_ALU_STOREINV  0x580

#define MI_ALU_R0        0x00
#define MI_ALU_R1        0x01
#define MI_ALU_R2        0x02
#define MI_ALU_R3        0x03
#define MI_ALU_R4        0x04
#define MI_ALU_SRCA      0x20
#define MI_ALU_SRCB     0x21
#define MI_ALU_ACCU      0x31
#define MI_ALU_ZF        0x32
#define MI_ALU_CF        0x33

#define _MI_ALU(op, x, y)  (((op) << 20) | ((x) << 10) | (y))

#define _MI_ALU0(op)       _MI_ALU(MI_ALU_##op, 0, 0)
#define _MI_ALU1(op, x)    _MI_ALU(MI_ALU_##op, x, 0)
#define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)

#define MI_ALU0(op)        _MI_ALU0(op)
#define MI_ALU1(op, x)     _MI_ALU1(op, MI_ALU_##x)
#define MI_ALU2(op, x, y)  _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)

#define emit_lri32 ice->vtbl.load_register_imm32
#define emit_lri64 ice->vtbl.load_register_imm64
#define emit_lrr32 ice->vtbl.load_register_reg32

struct iris_query {
   enum pipe_query_type type;
   int index;

   bool ready;

   bool stalled;

   uint64_t result;

   struct iris_bo *bo;
   struct iris_query_snapshots *map;

   int batch_idx;
};

struct iris_query_snapshots {
   /** iris_render_condition's saved MI_PREDICATE_DATA value. */
   uint64_t predicate_data;

   /** Have the start/end snapshots landed? */
   uint64_t snapshots_landed;

   /** Starting and ending counter snapshots */
   uint64_t start;
   uint64_t end;
};

struct iris_query_so_overflow {
   uint64_t predicate_data;
   uint64_t snapshots_landed;

   struct {
      uint64_t prim_storage_needed[2];
      uint64_t num_prims[2];
   } stream[4];
};

/**
 * Is this type of query written by PIPE_CONTROL?
 */
static bool
iris_is_query_pipelined(struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_TIME_ELAPSED:
      return true;

   default:
      return false;
   }
}

static void
mark_available(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
   unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);

   if (!iris_is_query_pipelined(q)) {
      ice->vtbl.store_data_imm64(batch, q->bo, offset, true);
   } else {
      /* Order available *after* the query results. */
      flags |= PIPE_CONTROL_FLUSH_ENABLE;
      iris_emit_pipe_control_write(batch, flags, q->bo, offset, true);
   }
}

/**
 * Write a counter snapshot (e.g. PS_DEPTH_COUNT or a timestamp) to q->bo
 * at the given offset via a PIPE_CONTROL.
 */
static void
iris_pipelined_write(struct iris_batch *batch,
                     struct iris_query *q,
                     enum pipe_control_flags flags,
                     unsigned offset)
{
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   const unsigned optional_cs_stall =
      devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;

   iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
                                q->bo, offset, 0ull);
}

static void
write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;

   if (!iris_is_query_pipelined(q)) {
      iris_emit_pipe_control_flush(batch,
                                   PIPE_CONTROL_CS_STALL |
                                   PIPE_CONTROL_STALL_AT_SCOREBOARD);
      q->stalled = true;
   }

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      if (devinfo->gen >= 10) {
         /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
          *  bit set prior to programming a PIPE_CONTROL with Write PS Depth
          *  Count sync operation."
          */
         iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
      }
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_DEPTH_COUNT |
                           PIPE_CONTROL_DEPTH_STALL,
                           offset);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_TIMESTAMP,
                           offset);
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      ice->vtbl.store_register_mem64(batch,
                                     q->index == 0 ? CL_INVOCATION_COUNT :
                                     SO_PRIM_STORAGE_NEEDED(q->index),
                                     q->bo, offset, false);
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      ice->vtbl.store_register_mem64(batch,
                                     SO_NUM_PRIMS_WRITTEN(q->index),
                                     q->bo, offset, false);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
      static const uint32_t index_to_reg[] = {
         IA_VERTICES_COUNT,
         IA_PRIMITIVES_COUNT,
         VS_INVOCATION_COUNT,
         GS_INVOCATION_COUNT,
         GS_PRIMITIVES_COUNT,
         CL_INVOCATION_COUNT,
         CL_PRIMITIVES_COUNT,
         PS_INVOCATION_COUNT,
         HS_INVOCATION_COUNT,
         DS_INVOCATION_COUNT,
         CS_INVOCATION_COUNT,
      };
      const uint32_t reg = index_to_reg[q->index];

      ice->vtbl.store_register_mem64(batch, reg, q->bo, offset, false);
      break;
   }
   default:
      assert(false);
   }
}

static void
write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;

   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
   for (uint32_t i = 0; i < count; i++) {
      int s = q->index + i;
      int g_idx = offsetof(struct iris_query_so_overflow,
                           stream[s].num_prims[end]);
      int w_idx = offsetof(struct iris_query_so_overflow,
                           stream[s].prim_storage_needed[end]);
      ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
                                     q->bo, g_idx, false);
      ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
                                     q->bo, w_idx, false);
   }
}

uint64_t
iris_timebase_scale(const struct gen_device_info *devinfo,
                    uint64_t gpu_timestamp)
{
   return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
}

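/* Worked example (illustrative values, not taken from any particular part):
 * with a command streamer timestamp frequency of 12 MHz
 * (devinfo->timestamp_frequency == 12000000), a raw delta of 24,000,000
 * ticks scales to (1000000000ull * 24000000) / 12000000 == 2,000,000,000 ns,
 * i.e. two seconds of GPU time.
 */
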
static uint64_t
iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
{
   if (time0 > time1) {
      return (1ULL << TIMESTAMP_BITS) + time1 - time0;
   } else {
      return time1 - time0;
   }
}

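/* Illustration of the wraparound handling above: the raw timestamp counter
 * is only TIMESTAMP_BITS wide, so it can wrap between the two snapshots.
 * If time0 == (1ull << TIMESTAMP_BITS) - 100 and time1 == 50, the true
 * elapsed tick count is 150, which is exactly what the time0 > time1 branch
 * computes: (1ull << TIMESTAMP_BITS) + 50 - ((1ull << TIMESTAMP_BITS) - 100).
 */
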
static bool
stream_overflowed(struct iris_query_so_overflow *so, int s)
{
   return (so->stream[s].prim_storage_needed[1] -
           so->stream[s].prim_storage_needed[0]) !=
          (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
}

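/* Rationale for the comparison above: while a stream-output buffer has room,
 * SO_NUM_PRIMS_WRITTEN and SO_PRIM_STORAGE_NEEDED advance in lockstep; once
 * the buffer overflows, only SO_PRIM_STORAGE_NEEDED keeps counting.  So the
 * two deltas over the query interval differ exactly when at least one
 * primitive could not be written.
 */
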
static void
calculate_result_on_cpu(const struct gen_device_info *devinfo,
                        struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      q->result = q->map->end != q->map->start;
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* The timestamp is the single starting snapshot. */
      q->result = iris_timebase_scale(devinfo, q->map->start);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
      q->result = iris_timebase_scale(devinfo, q->result);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      q->result = stream_overflowed((void *) q->map, q->index);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      q->result = false;
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
         q->result |= stream_overflowed((void *) q->map, i);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
      q->result = q->map->end - q->map->start;

      /* WaDividePSInvocationCountBy4:HSW,BDW */
      if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
         q->result /= 4;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   default:
      q->result = q->map->end - q->map->start;
      break;
   }

   q->ready = true;
}

static void
emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
             unsigned reg_a, unsigned reg_b)
{
   uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));

   math[0] = MI_MATH | (5 - 2);
   math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
   math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
   math[3] = _MI_ALU0(ADD);
   math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
}

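/* For reference, each MI_MATH ALU dword above packs an opcode and two
 * operands via the _MI_ALU() macro at the top of this file, so the emitted
 * sequence is effectively:
 *
 *    LOAD  SRCA, reg_a       ((0x080 << 20) | (0x20 << 10) | reg_a)
 *    LOAD  SRCB, reg_b       ((0x080 << 20) | (0x21 << 10) | reg_b)
 *    ADD                     ( 0x100 << 20 )
 *    STORE dst_reg, ACCU     ((0x180 << 20) | (dst_reg << 10) | 0x31)
 *
 * i.e. load both sources, add them into the accumulator, and store the
 * accumulator back into the destination GPR.
 */
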
static void
emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
             unsigned src_reg, unsigned shift)
{
   assert(shift > 0);

   int dwords = 1 + 4 * shift;

   uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);

   math[0] = MI_MATH | ((1 + 4 * shift) - 2);

   for (unsigned i = 0; i < shift; i++) {
      unsigned add_src = (i == 0) ? src_reg : dst_reg;
      math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
      math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
      math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
      math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
   }
}

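/* The MI ALU has no shift instruction, so emit_alu_shl() implements a left
 * shift by N as N successive doublings (x + x == x << 1).  For example,
 * shift == 3 emits three LOAD/LOAD/ADD/STORE groups and computes
 * ((x * 2) * 2) * 2 == x << 3.
 */
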
/* Emit dwords to multiply GPR0 by N */
static void
build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
{
   VK_OUTARRAY_MAKE(out, dw, dw_count);

#define APPEND_ALU(op, x, y) \
   vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)

   assert(N > 0);
   unsigned top_bit = 31 - __builtin_clz(N);
   for (int i = top_bit - 1; i >= 0; i--) {
      /* We get our initial data in GPR0 and we write the final data out to
       * GPR0 but we use GPR1 as our scratch register.
       */
      unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
      unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;

      /* Shift the current value left by 1 */
      APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
      APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
      APPEND_ALU(ADD, 0, 0);

      if (N & (1 << i)) {
         /* Store ACCU to R1 and add R0 to R1 */
         APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
         APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
         APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
         APPEND_ALU(ADD, 0, 0);
      }

      APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
   }

#undef APPEND_ALU
}

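/* Worked example (illustrative): for N == 5 (binary 101, top_bit == 2), the
 * loop above runs for i == 1 and i == 0 and emits, in effect:
 *
 *    i == 1:  ACCU = R0 + R0;  R1 = ACCU        (2 * GPR0; bit 1 is clear)
 *    i == 0:  ACCU = R1 + R1                    (4 * GPR0)
 *             R1 = ACCU;  ACCU = R0 + R1        (bit 0 is set: add original)
 *             R0 = ACCU                         (5 * GPR0)
 *
 * which is ordinary shift-and-add binary multiplication, using R1 as the
 * scratch register.
 */
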
static void
emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
{
   uint32_t num_dwords;
   build_alu_multiply_gpr0(NULL, &num_dwords, N);

   uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
   math[0] = MI_MATH | (num_dwords - 2);
   build_alu_multiply_gpr0(&math[1], &num_dwords, N);
}

void
iris_math_div32_gpr0(struct iris_context *ice,
                     struct iris_batch *batch,
                     uint32_t D)
{
   /* Zero out the top of GPR0 */
   emit_lri32(batch, CS_GPR(0) + 4, 0);

   if (D == 0) {
      /* Dividing by zero is invalid, but we should do something, so we
       * set GPR0 to 0.
       */
      emit_lri32(batch, CS_GPR(0), 0);
   } else if (util_is_power_of_two_or_zero(D)) {
      unsigned log2_D = util_logbase2(D);
      assert(log2_D < 32);
      /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and
       * taking the top 32 bits of the result.
       */
      emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);
   } else {
      struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
      assert(m.multiplier <= UINT32_MAX);

      if (m.pre_shift) {
         /* We right-shift by pre_shift by left-shifting by 32 - pre_shift
          * and taking the top 32 bits of the result.
          */
         if (m.pre_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }

      /* Do the 32x32 multiply into gpr0 */
      emit_mul_gpr0(batch, m.multiplier);

      if (m.increment) {
         /* Add one extra multiplier into GPR0, which is equivalent to
          * having multiplied by (n + 1).
          */
         emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
         emit_lri32(batch, CS_GPR(1) + 4, 0);
         emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
      }

      /* Shift by 32 */
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);

      if (m.post_shift) {
         /* We right-shift by post_shift by left-shifting by 32 - post_shift
          * and taking the top 32 bits of the result.
          */
         if (m.post_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }
   }
}

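/* The non-power-of-two path above is the usual fixed-point-reciprocal trick
 * for dividing by a constant, using the parameters chosen by
 * util_compute_fast_udiv_info().  Conceptually:
 *
 *    n / D == (((n >> pre_shift) + increment) * multiplier) >> (32 + post_shift)
 *
 * (adding one extra multiplier to the product, as done above, is the same as
 * multiplying n + 1).  Because the MI ALU cannot shift right, every right
 * shift is performed by shifting left into the upper half of the 64-bit GPR
 * and then copying the upper 32 bits back down into the lower half.
 */
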
/*
 * GPR0 = (GPR0 == 0) ? 0 : 1;
 */
static void
gpr0_to_bool(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);

   static const uint32_t math[] = {
      MI_MATH | (9 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU1(LOAD0, SRCB),
      MI_ALU0(ADD),
      MI_ALU2(STOREINV, R0, ZF),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}

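/* A note on the MI_MATH sequence above: adding GPR0 to zero updates the ALU
 * zero flag, and reading the ZF register yields all ones when that flag is
 * set (GPR0 == 0).  STOREINV writes the inverted value, so R0 becomes all
 * ones for a nonzero input and zero otherwise; the final AND with GPR1
 * (preloaded with 1) masks that down to exactly 0 or 1.
 */
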
static void
load_overflow_data_to_cs_gprs(struct iris_context *ice,
                              struct iris_query *q,
                              int idx)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[1]));

   ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[1]));
}

/*
 * R3 = R4 - R3;
 * R1 = R2 - R1;
 * R1 = R3 - R1;
 * R0 = R0 | R1;
 */
static void
calc_overflow_for_stream(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   static const uint32_t maths[] = {
      MI_MATH | (17 - 2),
      MI_ALU2(LOAD, SRCA, R4),
      MI_ALU2(LOAD, SRCB, R3),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R3, ACCU),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R3),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R1),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(OR),
      MI_ALU2(STORE, R0, ACCU),
   };

   iris_batch_emit(batch, maths, sizeof(maths));
}

static void
overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
      load_overflow_data_to_cs_gprs(ice, q, q->index);
      calc_overflow_for_stream(ice);
   } else {
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
         load_overflow_data_to_cs_gprs(ice, q, i);
         calc_overflow_for_stream(ice);
      }
   }

   gpr0_to_bool(ice);
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(ice, 34);

   static const uint32_t shl_math[] = {
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(ADD),
      MI_ALU2(STORE, R0, ACCU),
   };

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
   const uint32_t batch_len = cmd_len * outer_count;
   uint32_t *map = iris_get_command_space(batch, batch_len * 4);
   uint32_t offset = 0;
   for (int o = 0; o < outer_count; o++) {
      map[offset++] = MI_MATH | (cmd_len - 2);
      for (int i = 0; i < inner_count; i++) {
         memcpy(&map[offset], shl_math, sizeof(shl_math));
         offset += 4;
      }
   }
}

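/* The loop above emits outer_count * inner_count == 5 * 6 == 30 doublings
 * in total, i.e. GPR0 <<= 30, split across five MI_MATH packets.  Masking to
 * the low 34 bits beforehand keeps the shifted value within the 64-bit GPR
 * (34 + 30 == 64).
 */
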
/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   shl_gpr0_by_30_bits(ice);
   ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
}

/**
 * Calculate the result and store it to CS_GPR0.
 */
static void
calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &batch->screen->devinfo;

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
      overflow_result_to_gpr0(ice, q);
      return;
   }

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      ice->vtbl.load_register_mem64(batch, CS_GPR(0), q->bo,
                                    offsetof(struct iris_query_snapshots, start));
      /* TODO: This discards any fractional bits of the timebase scale.
       * We would need to do a bit of fixed point math on the CS ALU, or
       * launch an actual shader to calculate this with full precision.
       */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
      keep_gpr0_lower_n_bits(ice, 36);
      return;
   }

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
                                 offsetof(struct iris_query_snapshots, start));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
                                 offsetof(struct iris_query_snapshots, end));

   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));

   /* WaDividePSInvocationCountBy4:HSW,BDW */
   if (devinfo->gen == 8 &&
       q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
       q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
      shr_gpr0_by_2_bits(ice);

   if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
      gpr0_to_bool(ice);

   if (q->type == PIPE_QUERY_TIME_ELAPSED) {
      /* TODO: This discards fractional bits (see above). */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
   }
}

static struct pipe_query *
iris_create_query(struct pipe_context *ctx,
                  unsigned query_type,
                  unsigned index)
{
   struct iris_query *q = calloc(1, sizeof(struct iris_query));

   q->type = query_type;
   q->index = index;

   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
       q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
      q->batch_idx = IRIS_BATCH_COMPUTE;
   else
      q->batch_idx = IRIS_BATCH_RENDER;
   return (struct pipe_query *) q;
}

static void
iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
{
   struct iris_query *query = (void *) p_query;
   iris_bo_unreference(query->bo);
   free(query);
}


static boolean
iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_screen *screen = (void *) ctx->screen;
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   iris_bo_unreference(q->bo);
   q->bo = iris_bo_alloc(screen->bufmgr, "query object", 4096,
                         IRIS_MEMZONE_OTHER);
   if (!q->bo)
      return false;

   q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_WRITE | MAP_ASYNC);
   if (!q->map)
      return false;

   q->result = 0ull;
   q->ready = false;
   q->map->snapshots_landed = false;

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = true;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, false);
   else
      write_value(ice, q, offsetof(struct iris_query_snapshots, start));

   return true;
}

static bool
iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      iris_begin_query(ctx, query);
      mark_available(ice, q);
      return true;
   }

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = false;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, true);
   else
      write_value(ice, q, offsetof(struct iris_query_snapshots, end));
   mark_available(ice, q);

   return true;
}

/**
 * See if the snapshots have landed for a query, and if so, compute the
 * result and mark it ready.  Does not flush (unlike iris_get_query_result).
 */
static void
iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
{
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &screen->devinfo;

   if (!q->ready && q->map->snapshots_landed) {
      calculate_result_on_cpu(devinfo, q);
   }
}

static boolean
iris_get_query_result(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean wait,
                      union pipe_query_result *result)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_screen *screen = (void *) ctx->screen;
   const struct gen_device_info *devinfo = &screen->devinfo;

   if (!q->ready) {
      if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
         iris_batch_flush(&ice->batches[q->batch_idx]);

      if (!q->map->snapshots_landed) {
         if (wait)
            iris_bo_wait_rendering(q->bo);
         else
            return false;
      }

      assert(q->map->snapshots_landed);
      calculate_result_on_cpu(devinfo, q);
   }

   assert(q->ready);

   result->u64 = q->result;

   return true;
}

static void
iris_get_query_result_resource(struct pipe_context *ctx,
                               struct pipe_query *query,
                               boolean wait,
                               enum pipe_query_value_type result_type,
                               int index,
                               struct pipe_resource *p_res,
                               unsigned offset)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_resource *res = (void *) p_res;
   unsigned snapshots_landed_offset =
      offsetof(struct iris_query_snapshots, snapshots_landed);

   res->bind_history |= PIPE_BIND_QUERY_BUFFER;

   if (index == -1) {
      /* They're asking for the availability of the result.  If we still
       * have commands queued up which produce the result, submit them
       * now so that progress happens.  Either way, copy the snapshots
       * landed field to the destination resource.
       */
      if (iris_batch_references(batch, q->bo))
         iris_batch_flush(batch);

      ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
                             q->bo, snapshots_landed_offset,
                             result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
      return;
   }

   if (!q->ready && q->map->snapshots_landed) {
      /* The final snapshots happen to have landed, so let's just compute
       * the result on the CPU now...
       */
      calculate_result_on_cpu(devinfo, q);
   }

   if (q->ready) {
      /* We happen to have the result on the CPU, so just copy it. */
      if (result_type <= PIPE_QUERY_TYPE_U32) {
         ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      } else {
         ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      }

      /* Make sure the result lands before they bind the QBO elsewhere
       * and use the result.
       */
      // XXX: Why?  i965 doesn't do this.
      iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
      return;
   }

   /* Calculate the result to CS_GPR0 */
   calculate_result_on_gpu(ice, q);

   bool predicated = !wait && !q->stalled;

   if (predicated) {
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
                                    snapshots_landed_offset);
      uint32_t predicate = MI_PREDICATE |
                           MI_PREDICATE_LOADOP_LOADINV |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
      iris_batch_emit(batch, &predicate, sizeof(uint32_t));
   }

   if (result_type <= PIPE_QUERY_TYPE_U32) {
      ice->vtbl.store_register_mem32(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   } else {
      ice->vtbl.store_register_mem64(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   }
}

static void
iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
{
   struct iris_context *ice = (void *) ctx;

   if (ice->state.statistics_counters_enabled == enable)
      return;

   // XXX: most packets aren't paying attention to this yet, because it'd
   // have to be done dynamically at draw time, which is a pain
   ice->state.statistics_counters_enabled = enable;
   ice->state.dirty |= IRIS_DIRTY_CLIP |
                       IRIS_DIRTY_GS |
                       IRIS_DIRTY_RASTER |
                       IRIS_DIRTY_STREAMOUT |
                       IRIS_DIRTY_TCS |
                       IRIS_DIRTY_TES |
                       IRIS_DIRTY_VS |
                       IRIS_DIRTY_WM;
}

static void
set_predicate_enable(struct iris_context *ice, bool value)
{
   if (value)
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
   else
      ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
}

static void
set_predicate_for_result(struct iris_context *ice,
                         struct iris_query *q,
                         bool inverted)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   /* The CPU doesn't have the query result yet; use hardware predication */
   ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;

   /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
   iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
   q->stalled = true;

   switch (q->type) {
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      overflow_result_to_gpr0(ice, q);

      ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      break;
   default:
      /* PIPE_QUERY_OCCLUSION_* */
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
                                    offsetof(struct iris_query_snapshots, start));
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
                                    offsetof(struct iris_query_snapshots, end));
      break;
   }

   uint32_t mi_predicate = MI_PREDICATE |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
                           (inverted ? MI_PREDICATE_LOADOP_LOAD
                                     : MI_PREDICATE_LOADOP_LOADINV);
   iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));

   /* We immediately set the predicate on the render batch, as all the
    * counters come from 3D operations.  However, we may need to predicate
    * a compute dispatch, which executes in a different GEM context and has
    * a different MI_PREDICATE_DATA register.  So, we save the result to
    * memory and reload it in iris_launch_grid.
    */
   unsigned offset = offsetof(struct iris_query_snapshots, predicate_data);
   ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
                                  q->bo, offset, false);
   ice->state.compute_predicate = q->bo;
}

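/* A note on the predicate sources chosen above: for the overflow predicates,
 * GPR0 ends up holding 0 or 1 and is compared against an immediate 0 in
 * MI_PREDICATE_SRC1, while for the occlusion queries the raw start/end
 * snapshots are compared directly, since start == end means no samples
 * passed between them.  The LOADOP_LOAD vs. LOADOP_LOADINV choice folds the
 * `inverted` flag into the predicate instead of requiring extra ALU math.
 */
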
static void
iris_render_condition(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean condition,
                      enum pipe_render_cond_flag mode)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   if (!q) {
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
      return;
   }

   iris_check_query_no_flush(ice, q);

   if (q->result || q->ready) {
      set_predicate_enable(ice, (q->result != 0) ^ condition);
   } else {
      if (mode == PIPE_RENDER_COND_NO_WAIT ||
          mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
         perf_debug(&ice->dbg, "Conditional rendering demoted from "
                    "\"no wait\" to \"wait\".");
      }
      set_predicate_for_result(ice, q, condition);
   }
}

void
iris_init_query_functions(struct pipe_context *ctx)
{
   ctx->create_query = iris_create_query;
   ctx->destroy_query = iris_destroy_query;
   ctx->begin_query = iris_begin_query;
   ctx->end_query = iris_end_query;
   ctx->get_query_result = iris_get_query_result;
   ctx->get_query_result_resource = iris_get_query_result_resource;
   ctx->set_active_query_state = iris_set_active_query_state;
   ctx->render_condition = iris_render_condition;
}