iris: Fix Broadwell WaDividePSInvocationCountBy4
[mesa.git] src/gallium/drivers/iris/iris_query.c
1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_query.c
25 *
26 * Query object support. This allows measuring various simple statistics
27 * via counters on the GPU.
28 */
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "iris_context.h"
39 #include "iris_defines.h"
40 #include "iris_resource.h"
41 #include "iris_screen.h"
42 #include "vulkan/util/vk_util.h"
43
44 #define IA_VERTICES_COUNT 0x2310
45 #define IA_PRIMITIVES_COUNT 0x2318
46 #define VS_INVOCATION_COUNT 0x2320
47 #define HS_INVOCATION_COUNT 0x2300
48 #define DS_INVOCATION_COUNT 0x2308
49 #define GS_INVOCATION_COUNT 0x2328
50 #define GS_PRIMITIVES_COUNT 0x2330
51 #define CL_INVOCATION_COUNT 0x2338
52 #define CL_PRIMITIVES_COUNT 0x2340
53 #define PS_INVOCATION_COUNT 0x2348
54 #define CS_INVOCATION_COUNT 0x2290
55 #define PS_DEPTH_COUNT 0x2350
56
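/* Per-stream transform feedback statistics registers.  Each stream <n> has
 * a 64-bit counter of primitives that needed storage and a 64-bit counter
 * of primitives actually written, with consecutive streams 8 bytes apart.
 */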
57 #define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
58
59 #define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
60
61 #define MI_MATH (0x1a << 23)
62
63 #define MI_ALU_LOAD 0x080
64 #define MI_ALU_LOADINV 0x480
65 #define MI_ALU_LOAD0 0x081
66 #define MI_ALU_LOAD1 0x481
67 #define MI_ALU_ADD 0x100
68 #define MI_ALU_SUB 0x101
69 #define MI_ALU_AND 0x102
70 #define MI_ALU_OR 0x103
71 #define MI_ALU_XOR 0x104
72 #define MI_ALU_STORE 0x180
73 #define MI_ALU_STOREINV 0x580
74
75 #define MI_ALU_R0 0x00
76 #define MI_ALU_R1 0x01
77 #define MI_ALU_R2 0x02
78 #define MI_ALU_R3 0x03
79 #define MI_ALU_R4 0x04
80 #define MI_ALU_SRCA 0x20
81 #define MI_ALU_SRCB 0x21
82 #define MI_ALU_ACCU 0x31
83 #define MI_ALU_ZF 0x32
84 #define MI_ALU_CF 0x33
85
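/* Each MI_MATH ALU dword packs an opcode in bits 31:20 and two operand
 * encodings in bits 19:10 and 9:0 (only LOAD/STORE-style opcodes use the
 * operand fields; arithmetic opcodes such as ADD leave them zero).
 */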
86 #define _MI_ALU(op, x, y) (((op) << 20) | ((x) << 10) | (y))
87
88 #define _MI_ALU0(op) _MI_ALU(MI_ALU_##op, 0, 0)
89 #define _MI_ALU1(op, x) _MI_ALU(MI_ALU_##op, x, 0)
90 #define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)
91
92 #define MI_ALU0(op) _MI_ALU0(op)
93 #define MI_ALU1(op, x) _MI_ALU1(op, MI_ALU_##x)
94 #define MI_ALU2(op, x, y) _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
95
96 #define emit_lri32 ice->vtbl.load_register_imm32
97 #define emit_lri64 ice->vtbl.load_register_imm64
98 #define emit_lrr32 ice->vtbl.load_register_reg32
99
100 struct iris_query {
101 enum pipe_query_type type;
102 int index;
103
104 bool ready;
105
106 bool stalled;
107
108 uint64_t result;
109
110 struct iris_bo *bo;
111 struct iris_query_snapshots *map;
112
113 int batch_idx;
114 };
115
116 struct iris_query_snapshots {
117 /** iris_render_condition's saved MI_PREDICATE_DATA value. */
118 uint64_t predicate_data;
119
120 /** Have the start/end snapshots landed? */
121 uint64_t snapshots_landed;
122
123 /** Starting and ending counter snapshots */
124 uint64_t start;
125 uint64_t end;
126 };
127
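/* Snapshot layout for SO overflow queries.  The leading predicate_data and
 * snapshots_landed fields must stay at the same offsets as in struct
 * iris_query_snapshots, since both layouts share the same query BO mapping
 * (mark_available(), for example, uses the iris_query_snapshots offsets for
 * all query types).
 */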
128 struct iris_query_so_overflow {
129 uint64_t predicate_data;
130 uint64_t snapshots_landed;
131
132 struct {
133 uint64_t prim_storage_needed[2];
134 uint64_t num_prims[2];
135 } stream[4];
136 };
137
138 /**
139 * Is this type of query written by PIPE_CONTROL?
140 */
141 static bool
142 iris_is_query_pipelined(struct iris_query *q)
143 {
144 switch (q->type) {
145 case PIPE_QUERY_OCCLUSION_COUNTER:
146 case PIPE_QUERY_OCCLUSION_PREDICATE:
147 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
148 case PIPE_QUERY_TIMESTAMP:
149 case PIPE_QUERY_TIMESTAMP_DISJOINT:
150 case PIPE_QUERY_TIME_ELAPSED:
151 return true;
152
153 default:
154 return false;
155 }
156 }
157
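/**
 * Record that a query's snapshots have landed by writing the
 * snapshots_landed flag from the GPU, so the CPU can poll for availability
 * without stalling.
 */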
158 static void
159 mark_available(struct iris_context *ice, struct iris_query *q)
160 {
161 struct iris_batch *batch = &ice->batches[q->batch_idx];
162 unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
163 unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
164
165 if (!iris_is_query_pipelined(q)) {
166 ice->vtbl.store_data_imm64(batch, q->bo, offset, true);
167 } else {
168 /* Order available *after* the query results. */
169 flags |= PIPE_CONTROL_FLUSH_ENABLE;
170 iris_emit_pipe_control_write(batch, flags, q->bo, offset, true);
171 }
172 }
173
174 /**
175 * Write a pipelined snapshot (PS_DEPTH_COUNT or TIMESTAMP) to q->bo at the given offset via a PIPE_CONTROL.
176 */
177 static void
178 iris_pipelined_write(struct iris_batch *batch,
179 struct iris_query *q,
180 enum pipe_control_flags flags,
181 unsigned offset)
182 {
183 const struct gen_device_info *devinfo = &batch->screen->devinfo;
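/* Gen9 GT4 appears to want an extra CS stall around these pipelined
 * post-sync writes; treat this as a hardware workaround.
 */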
184 const unsigned optional_cs_stall =
185 devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
186
187 iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
188 q->bo, offset, 0ull);
189 }
190
191 static void
192 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
193 {
194 struct iris_batch *batch = &ice->batches[q->batch_idx];
195 const struct gen_device_info *devinfo = &batch->screen->devinfo;
196
197 if (!iris_is_query_pipelined(q)) {
198 iris_emit_pipe_control_flush(batch,
199 PIPE_CONTROL_CS_STALL |
200 PIPE_CONTROL_STALL_AT_SCOREBOARD);
201 q->stalled = true;
202 }
203
204 switch (q->type) {
205 case PIPE_QUERY_OCCLUSION_COUNTER:
206 case PIPE_QUERY_OCCLUSION_PREDICATE:
207 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
208 if (devinfo->gen >= 10) {
209 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
210 * bit set prior to programming a PIPE_CONTROL with Write PS Depth
211 * Count sync operation."
212 */
213 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
214 }
215 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
216 PIPE_CONTROL_WRITE_DEPTH_COUNT |
217 PIPE_CONTROL_DEPTH_STALL,
218 offset);
219 break;
220 case PIPE_QUERY_TIME_ELAPSED:
221 case PIPE_QUERY_TIMESTAMP:
222 case PIPE_QUERY_TIMESTAMP_DISJOINT:
223 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
224 PIPE_CONTROL_WRITE_TIMESTAMP,
225 offset);
226 break;
227 case PIPE_QUERY_PRIMITIVES_GENERATED:
228 ice->vtbl.store_register_mem64(batch,
229 q->index == 0 ? CL_INVOCATION_COUNT :
230 SO_PRIM_STORAGE_NEEDED(q->index),
231 q->bo, offset, false);
232 break;
233 case PIPE_QUERY_PRIMITIVES_EMITTED:
234 ice->vtbl.store_register_mem64(batch,
235 SO_NUM_PRIMS_WRITTEN(q->index),
236 q->bo, offset, false);
237 break;
238 case PIPE_QUERY_PIPELINE_STATISTICS: {
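/* q->index is a PIPE_STAT_QUERY_* value; map it to the matching MMIO
 * statistics counter register.
 */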
239 static const uint32_t index_to_reg[] = {
240 IA_VERTICES_COUNT,
241 IA_PRIMITIVES_COUNT,
242 VS_INVOCATION_COUNT,
243 GS_INVOCATION_COUNT,
244 GS_PRIMITIVES_COUNT,
245 CL_INVOCATION_COUNT,
246 CL_PRIMITIVES_COUNT,
247 PS_INVOCATION_COUNT,
248 HS_INVOCATION_COUNT,
249 DS_INVOCATION_COUNT,
250 CS_INVOCATION_COUNT,
251 };
252 const uint32_t reg = index_to_reg[q->index];
253
254 ice->vtbl.store_register_mem64(batch, reg, q->bo, offset, false);
255 break;
256 }
257 default:
258 assert(false);
259 }
260 }
261
262 static void
263 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
264 {
265 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
266 uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
267
268 iris_emit_pipe_control_flush(batch,
269 PIPE_CONTROL_CS_STALL |
270 PIPE_CONTROL_STALL_AT_SCOREBOARD);
271 for (uint32_t i = 0; i < count; i++) {
272 int s = q->index + i;
273 int g_idx = offsetof(struct iris_query_so_overflow,
274 stream[s].num_prims[end]);
275 int w_idx = offsetof(struct iris_query_so_overflow,
276 stream[s].prim_storage_needed[end]);
277 ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
278 q->bo, g_idx, false);
279 ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
280 q->bo, w_idx, false);
281 }
282 }
283
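/**
 * Convert a raw GPU timestamp (in clock ticks) to nanoseconds.
 *
 * For example, with a hypothetical timestamp_frequency of 12,500,000 Hz
 * (80 ns per tick), a delta of 1,250,000 ticks scales to 100,000,000 ns.
 */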
284 uint64_t
285 iris_timebase_scale(const struct gen_device_info *devinfo,
286 uint64_t gpu_timestamp)
287 {
288 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
289 }
290
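/**
 * Return the number of ticks between two raw timestamps, accounting for the
 * counter being only TIMESTAMP_BITS wide: if the end value is smaller than
 * the start value, the counter wrapped, so add back 2^TIMESTAMP_BITS.
 */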
291 static uint64_t
292 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
293 {
294 if (time0 > time1) {
295 return (1ULL << TIMESTAMP_BITS) + time1 - time0;
296 } else {
297 return time1 - time0;
298 }
299 }
300
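/**
 * Did transform feedback overflow on stream s?  It did if the number of
 * primitives that needed storage and the number of primitives actually
 * written diverged between the start and end snapshots, i.e. some
 * primitives were dropped because the SO buffer ran out of space.
 */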
301 static bool
302 stream_overflowed(struct iris_query_so_overflow *so, int s)
303 {
304 return (so->stream[s].prim_storage_needed[1] -
305 so->stream[s].prim_storage_needed[0]) !=
306 (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
307 }
308
309 static void
310 calculate_result_on_cpu(const struct gen_device_info *devinfo,
311 struct iris_query *q)
312 {
313 switch (q->type) {
314 case PIPE_QUERY_OCCLUSION_PREDICATE:
315 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
316 q->result = q->map->end != q->map->start;
317 break;
318 case PIPE_QUERY_TIMESTAMP:
319 case PIPE_QUERY_TIMESTAMP_DISJOINT:
320 /* The timestamp is the single starting snapshot. */
321 q->result = iris_timebase_scale(devinfo, q->map->start);
322 q->result &= (1ull << TIMESTAMP_BITS) - 1;
323 break;
324 case PIPE_QUERY_TIME_ELAPSED:
325 q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
326 q->result = iris_timebase_scale(devinfo, q->result);
327 q->result &= (1ull << TIMESTAMP_BITS) - 1;
328 break;
329 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
330 q->result = stream_overflowed((void *) q->map, q->index);
331 break;
332 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
333 q->result = false;
334 for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
335 q->result |= stream_overflowed((void *) q->map, i);
336 break;
337 case PIPE_QUERY_PIPELINE_STATISTICS:
338 q->result = q->map->end - q->map->start;
339
340 /* WaDividePSInvocationCountBy4:HSW,BDW */
341 if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
342 q->result /= 4;
343 break;
344 case PIPE_QUERY_OCCLUSION_COUNTER:
345 case PIPE_QUERY_PRIMITIVES_GENERATED:
346 case PIPE_QUERY_PRIMITIVES_EMITTED:
347 default:
348 q->result = q->map->end - q->map->start;
349 break;
350 }
351
352 q->ready = true;
353 }
354
355 static void
356 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
357 unsigned reg_a, unsigned reg_b)
358 {
359 uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
360
361 math[0] = MI_MATH | (5 - 2);
362 math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
363 math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
364 math[3] = _MI_ALU0(ADD);
365 math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
366 }
367
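/**
 * Shift src_reg left by <shift> bits into dst_reg using repeated doubling:
 * each iteration is a 4-dword LOAD/LOAD/ADD/STORE group that adds the value
 * to itself, so the MI_MATH packet is 1 + 4 * shift dwords long.
 */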
368 static void
369 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
370 unsigned src_reg, unsigned shift)
371 {
372 assert(shift > 0);
373
374 int dwords = 1 + 4 * shift;
375
376 uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
377
378 math[0] = MI_MATH | ((1 + 4 * shift) - 2);
379
380 for (unsigned i = 0; i < shift; i++) {
381 unsigned add_src = (i == 0) ? src_reg : dst_reg;
382 math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
383 math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
384 math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
385 math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
386 }
387 }
388
389 /* Emit dwords to multiply GPR0 by N */
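/* The multiply walks the binary expansion of N below its top set bit,
 * doubling the running value each step and adding the original GPR0 value
 * whenever the bit is set.  For example, N = 6 (0b110): i = 1 doubles R0,
 * adds R0 (bit set), and stores 3*R0 to R1; i = 0 doubles R1 and stores
 * 6*R0 back to R0.
 */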
390 static void
391 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
392 {
393 VK_OUTARRAY_MAKE(out, dw, dw_count);
394
395 #define APPEND_ALU(op, x, y) \
396 vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
397
398 assert(N > 0);
399 unsigned top_bit = 31 - __builtin_clz(N);
400 for (int i = top_bit - 1; i >= 0; i--) {
401 /* We get our initial data in GPR0, write the final data out to
402 * GPR0, and use GPR1 as a scratch register.
403 */
404 unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
405 unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
406
407 /* Shift the current value left by 1 */
408 APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
409 APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
410 APPEND_ALU(ADD, 0, 0);
411
412 if (N & (1 << i)) {
413 /* Store ACCU to R1 and add R0 to R1 */
414 APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
415 APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
416 APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
417 APPEND_ALU(ADD, 0, 0);
418 }
419
420 APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
421 }
422
423 #undef APPEND_ALU
424 }
425
426 static void
427 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
428 {
429 uint32_t num_dwords;
430 build_alu_multiply_gpr0(NULL, &num_dwords, N);
431
432 uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
433 math[0] = MI_MATH | (num_dwords - 2);
434 build_alu_multiply_gpr0(&math[1], &num_dwords, N);
435 }
436
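/**
 * Emit commands so that GPR0 ends up holding GPR0 / D (unsigned).
 *
 * Powers of two reduce to a shift.  Arbitrary divisors use the
 * multiply-by-reciprocal scheme from util_compute_fast_udiv_info(): an
 * optional pre-shift, a 32x32 multiply plus an optional increment, then the
 * top 32 bits of the product and an optional post-shift.
 */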
437 void
438 iris_math_div32_gpr0(struct iris_context *ice,
439 struct iris_batch *batch,
440 uint32_t D)
441 {
442 /* Zero out the top of GPR0 */
443 emit_lri32(batch, CS_GPR(0) + 4, 0);
444
445 if (D == 0) {
446 /* Division by zero is invalid, but do something deterministic: set GPR0 to 0. */
447 emit_lri32(batch, CS_GPR(0), 0);
448 } else if (util_is_power_of_two_or_zero(D)) {
449 unsigned log2_D = util_logbase2(D);
450 assert(log2_D < 32);
451 /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
452 * the top 32 bits of the result.
453 */
454 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
455 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
456 emit_lri32(batch, CS_GPR(0) + 4, 0);
457 } else {
458 struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
459 assert(m.multiplier <= UINT32_MAX);
460
461 if (m.pre_shift) {
462 /* We right-shift by pre_shift by left-shifting by 32 - pre_shift and taking the top
463 * 32 bits of the result.
464 */
465 if (m.pre_shift < 32)
466 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
467 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
468 emit_lri32(batch, CS_GPR(0) + 4, 0);
469 }
470
471 /* Do the 32x32 multiply into gpr0 */
472 emit_mul_gpr0(batch, m.multiplier);
473
474 if (m.increment) {
475 /* Account for the increment: (x + 1) * m == x * m + m, so add the multiplier to the product already in GPR0. */
476 emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
477 emit_lri32(batch, CS_GPR(1) + 4, 0);
478 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
479 }
480
481 /* Shift by 32 */
482 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
483 emit_lri32(batch, CS_GPR(0) + 4, 0);
484
485 if (m.post_shift) {
486 /* We right-shift by post_shift by left-shifting by 32 - post_shift and taking the top
487 * 32 bits of the result.
488 */
489 if (m.post_shift < 32)
490 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
491 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
492 emit_lri32(batch, CS_GPR(0) + 4, 0);
493 }
494 }
495 }
496
497 /*
498 * GPR0 = (GPR0 == 0) ? 0 : 1;
499 */
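/* Adding zero to GPR0 sets the ALU zero flag; storing the inverted ZF into
 * GPR0 and ANDing with GPR1 (preloaded with 1) masks that down to a clean
 * 0 or 1 value.
 */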
500 static void
501 gpr0_to_bool(struct iris_context *ice)
502 {
503 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
504
505 ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
506
507 static const uint32_t math[] = {
508 MI_MATH | (9 - 2),
509 MI_ALU2(LOAD, SRCA, R0),
510 MI_ALU1(LOAD0, SRCB),
511 MI_ALU0(ADD),
512 MI_ALU2(STOREINV, R0, ZF),
513 MI_ALU2(LOAD, SRCA, R0),
514 MI_ALU2(LOAD, SRCB, R1),
515 MI_ALU0(AND),
516 MI_ALU2(STORE, R0, ACCU),
517 };
518 iris_batch_emit(batch, math, sizeof(math));
519 }
520
521 static void
522 load_overflow_data_to_cs_gprs(struct iris_context *ice,
523 struct iris_query *q,
524 int idx)
525 {
526 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
527
528 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
529 offsetof(struct iris_query_so_overflow,
530 stream[idx].prim_storage_needed[0]));
531 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
532 offsetof(struct iris_query_so_overflow,
533 stream[idx].prim_storage_needed[1]));
534
535 ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
536 offsetof(struct iris_query_so_overflow,
537 stream[idx].num_prims[0]));
538 ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
539 offsetof(struct iris_query_so_overflow,
540 stream[idx].num_prims[1]));
541 }
542
543 /*
544 * R3 = R4 - R3;
545 * R1 = R2 - R1;
546 * R1 = R3 - R1;
547 * R0 = R0 | R1;
548 */
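/* In other words: compute (num_prims delta) - (prim_storage_needed delta)
 * for this stream and OR it into the running result in R0; any non-zero
 * value means the stream overflowed.
 */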
549 static void
550 calc_overflow_for_stream(struct iris_context *ice)
551 {
552 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
553 static const uint32_t maths[] = {
554 MI_MATH | (17 - 2),
555 MI_ALU2(LOAD, SRCA, R4),
556 MI_ALU2(LOAD, SRCB, R3),
557 MI_ALU0(SUB),
558 MI_ALU2(STORE, R3, ACCU),
559 MI_ALU2(LOAD, SRCA, R2),
560 MI_ALU2(LOAD, SRCB, R1),
561 MI_ALU0(SUB),
562 MI_ALU2(STORE, R1, ACCU),
563 MI_ALU2(LOAD, SRCA, R3),
564 MI_ALU2(LOAD, SRCB, R1),
565 MI_ALU0(SUB),
566 MI_ALU2(STORE, R1, ACCU),
567 MI_ALU2(LOAD, SRCA, R1),
568 MI_ALU2(LOAD, SRCB, R0),
569 MI_ALU0(OR),
570 MI_ALU2(STORE, R0, ACCU),
571 };
572
573 iris_batch_emit(batch, maths, sizeof(maths));
574 }
575
576 static void
577 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
578 {
579 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
580
581 ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
582
583 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
584 load_overflow_data_to_cs_gprs(ice, q, q->index);
585 calc_overflow_for_stream(ice);
586 } else {
587 for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
588 load_overflow_data_to_cs_gprs(ice, q, i);
589 calc_overflow_for_stream(ice);
590 }
591 }
592
593 gpr0_to_bool(ice);
594 }
595
596 /*
597 * GPR0 = GPR0 & ((1ull << n) -1);
598 */
599 static void
600 keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
601 {
602 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
603
604 ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
605 static const uint32_t math[] = {
606 MI_MATH | (5 - 2),
607 MI_ALU2(LOAD, SRCA, R0),
608 MI_ALU2(LOAD, SRCB, R1),
609 MI_ALU0(AND),
610 MI_ALU2(STORE, R0, ACCU),
611 };
612 iris_batch_emit(batch, math, sizeof(math));
613 }
614
615 /*
616 * GPR0 = GPR0 << 30;
617 */
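/* The shift is implemented as 30 successive doublings, emitted as 5 MI_MATH
 * packets of 6 add sequences each.  GPR0 is first masked to its low 34 bits
 * so the shifted result still fits in 64 bits.
 */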
618 static void
619 shl_gpr0_by_30_bits(struct iris_context *ice)
620 {
621 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
622 /* First, keep only the low 34 bits of GPR0 so the 30-bit shift cannot overflow 64 bits. */
623 keep_gpr0_lower_n_bits(ice, 34);
624
625 static const uint32_t shl_math[] = {
626 MI_ALU2(LOAD, SRCA, R0),
627 MI_ALU2(LOAD, SRCB, R0),
628 MI_ALU0(ADD),
629 MI_ALU2(STORE, R0, ACCU),
630 };
631
632 const uint32_t outer_count = 5;
633 const uint32_t inner_count = 6;
634 const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
635 const uint32_t batch_len = cmd_len * outer_count;
636 uint32_t *map = iris_get_command_space(batch, batch_len * 4);
637 uint32_t offset = 0;
638 for (int o = 0; o < outer_count; o++) {
639 map[offset++] = MI_MATH | (cmd_len - 2);
640 for (int i = 0; i < inner_count; i++) {
641 memcpy(&map[offset], shl_math, sizeof(shl_math));
642 offset += 4;
643 }
644 }
645 }
646
647 /*
648 * GPR0 = GPR0 >> 2;
649 *
650 * Note that the upper 30 bits of GPR0 are lost!
651 */
652 static void
653 shr_gpr0_by_2_bits(struct iris_context *ice)
654 {
655 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
656 shl_gpr0_by_30_bits(ice);
657 ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
658 ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
659 }
660
661 /**
662 * Calculate the result and store it to CS_GPR0.
663 */
664 static void
665 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
666 {
667 struct iris_batch *batch = &ice->batches[q->batch_idx];
668 struct iris_screen *screen = (void *) ice->ctx.screen;
669 const struct gen_device_info *devinfo = &batch->screen->devinfo;
670
671 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
672 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
673 overflow_result_to_gpr0(ice, q);
674 return;
675 }
676
677 if (q->type == PIPE_QUERY_TIMESTAMP) {
678 ice->vtbl.load_register_mem64(batch, CS_GPR(0), q->bo,
679 offsetof(struct iris_query_snapshots, start));
680 /* TODO: This discards any fractional bits of the timebase scale.
681 * We would need to do a bit of fixed point math on the CS ALU, or
682 * launch an actual shader to calculate this with full precision.
683 */
684 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
685 keep_gpr0_lower_n_bits(ice, 36);
686 return;
687 }
688
689 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
690 offsetof(struct iris_query_snapshots, start));
691 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
692 offsetof(struct iris_query_snapshots, end));
693
694 static const uint32_t math[] = {
695 MI_MATH | (5 - 2),
696 MI_ALU2(LOAD, SRCA, R2),
697 MI_ALU2(LOAD, SRCB, R1),
698 MI_ALU0(SUB),
699 MI_ALU2(STORE, R0, ACCU),
700 };
701 iris_batch_emit(batch, math, sizeof(math));
702
703 /* WaDividePSInvocationCountBy4:HSW,BDW */
704 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS && devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
705 shr_gpr0_by_2_bits(ice);
706
707 if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
708 q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
709 gpr0_to_bool(ice);
710
711 if (q->type == PIPE_QUERY_TIME_ELAPSED) {
712 /* TODO: This discards fractional bits (see above). */
713 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
714 }
715 }
716
717 static struct pipe_query *
718 iris_create_query(struct pipe_context *ctx,
719 unsigned query_type,
720 unsigned index)
721 {
722 struct iris_query *q = calloc(1, sizeof(struct iris_query));
723
724 q->type = query_type;
725 q->index = index;
726
727 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS && q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
728 q->batch_idx = IRIS_BATCH_COMPUTE;
729 else
730 q->batch_idx = IRIS_BATCH_RENDER;
731 return (struct pipe_query *) q;
732 }
733
734 static void
735 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
736 {
737 struct iris_query *query = (void *) p_query;
738 iris_bo_unreference(query->bo);
739 free(query);
740 }
741
742
743 static boolean
744 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
745 {
746 struct iris_screen *screen = (void *) ctx->screen;
747 struct iris_context *ice = (void *) ctx;
748 struct iris_query *q = (void *) query;
749
750 iris_bo_unreference(q->bo);
751 q->bo = iris_bo_alloc(screen->bufmgr, "query object", 4096,
752 IRIS_MEMZONE_OTHER);
753 if (!q->bo)
754 return false;
755
756 q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_WRITE | MAP_ASYNC);
757 if (!q->map)
758 return false;
759
760 q->result = 0ull;
761 q->ready = false;
762 q->map->snapshots_landed = false;
763
764 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
765 ice->state.prims_generated_query_active = true;
766 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
767 }
768
769 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
770 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
771 write_overflow_values(ice, q, false);
772 else
773 write_value(ice, q, offsetof(struct iris_query_snapshots, start));
774
775 return true;
776 }
777
778 static bool
779 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
780 {
781 struct iris_context *ice = (void *) ctx;
782 struct iris_query *q = (void *) query;
783
784 if (q->type == PIPE_QUERY_TIMESTAMP) {
785 iris_begin_query(ctx, query);
786 mark_available(ice, q);
787 return true;
788 }
789
790 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
791 ice->state.prims_generated_query_active = false;
792 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
793 }
794
795 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
796 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
797 write_overflow_values(ice, q, true);
798 else
799 write_value(ice, q, offsetof(struct iris_query_snapshots, end));
800 mark_available(ice, q);
801
802 return true;
803 }
804
805 /**
806 * See if the snapshots have landed for a query, and if so, compute the
807 * result and mark it ready. Does not flush (unlike iris_get_query_result).
808 */
809 static void
810 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
811 {
812 struct iris_screen *screen = (void *) ice->ctx.screen;
813 const struct gen_device_info *devinfo = &screen->devinfo;
814
815 if (!q->ready && q->map->snapshots_landed) {
816 calculate_result_on_cpu(devinfo, q);
817 }
818 }
819
820 static boolean
821 iris_get_query_result(struct pipe_context *ctx,
822 struct pipe_query *query,
823 boolean wait,
824 union pipe_query_result *result)
825 {
826 struct iris_context *ice = (void *) ctx;
827 struct iris_query *q = (void *) query;
828 struct iris_screen *screen = (void *) ctx->screen;
829 const struct gen_device_info *devinfo = &screen->devinfo;
830
831 if (!q->ready) {
832 if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
833 iris_batch_flush(&ice->batches[q->batch_idx]);
834
835 if (!q->map->snapshots_landed) {
836 if (wait)
837 iris_bo_wait_rendering(q->bo);
838 else
839 return false;
840 }
841
842 assert(q->map->snapshots_landed);
843 calculate_result_on_cpu(devinfo, q);
844 }
845
846 assert(q->ready);
847
848 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS) {
849 switch (q->index) {
850 case 0:
851 result->pipeline_statistics.ia_vertices = q->result;
852 break;
853 case 1:
854 result->pipeline_statistics.ia_primitives = q->result;
855 break;
856 case 2:
857 result->pipeline_statistics.vs_invocations = q->result;
858 break;
859 case 3:
860 result->pipeline_statistics.gs_invocations = q->result;
861 break;
862 case 4:
863 result->pipeline_statistics.gs_primitives = q->result;
864 break;
865 case 5:
866 result->pipeline_statistics.c_invocations = q->result;
867 break;
868 case 6:
869 result->pipeline_statistics.c_primitives = q->result;
870 break;
871 case 7:
872 result->pipeline_statistics.ps_invocations = q->result;
873 break;
874 case 8:
875 result->pipeline_statistics.hs_invocations = q->result;
876 break;
877 case 9:
878 result->pipeline_statistics.ds_invocations = q->result;
879 break;
880 case 10:
881 result->pipeline_statistics.cs_invocations = q->result;
882 break;
883 }
884 } else {
885 result->u64 = q->result;
886 }
887
888 return true;
889 }
890
891 static void
892 iris_get_query_result_resource(struct pipe_context *ctx,
893 struct pipe_query *query,
894 boolean wait,
895 enum pipe_query_value_type result_type,
896 int index,
897 struct pipe_resource *p_res,
898 unsigned offset)
899 {
900 struct iris_context *ice = (void *) ctx;
901 struct iris_query *q = (void *) query;
902 struct iris_batch *batch = &ice->batches[q->batch_idx];
903 const struct gen_device_info *devinfo = &batch->screen->devinfo;
904 struct iris_resource *res = (void *) p_res;
905 unsigned snapshots_landed_offset =
906 offsetof(struct iris_query_snapshots, snapshots_landed);
907
908 res->bind_history |= PIPE_BIND_QUERY_BUFFER;
909
910 if (index == -1) {
911 /* They're asking for the availability of the result. If we still
912 * have commands queued up which produce the result, submit them
913 * now so that progress happens. Either way, copy the snapshots
914 * landed field to the destination resource.
915 */
916 if (iris_batch_references(batch, q->bo))
917 iris_batch_flush(batch);
918
919 ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
920 q->bo, snapshots_landed_offset,
921 result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
922 return;
923 }
924
925 if (!q->ready && q->map->snapshots_landed) {
926 /* The final snapshots happen to have landed, so let's just compute
927 * the result on the CPU now...
928 */
929 calculate_result_on_cpu(devinfo, q);
930 }
931
932 if (q->ready) {
933 /* We happen to have the result on the CPU, so just copy it. */
934 if (result_type <= PIPE_QUERY_TYPE_U32) {
935 ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
936 q->result);
937 } else {
938 ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
939 q->result);
940 }
941
942 /* Make sure the result lands before they bind the QBO elsewhere
943 * and use the result.
944 */
945 // XXX: Why? i965 doesn't do this.
946 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
947 return;
948 }
949
950 /* Calculate the result to CS_GPR0 */
951 calculate_result_on_gpu(ice, q);
952
953 bool predicated = !wait && !q->stalled;
954
955 if (predicated) {
956 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
957 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
958 snapshots_landed_offset);
959 uint32_t predicate = MI_PREDICATE |
960 MI_PREDICATE_LOADOP_LOADINV |
961 MI_PREDICATE_COMBINEOP_SET |
962 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
963 iris_batch_emit(batch, &predicate, sizeof(uint32_t));
964 }
965
966 if (result_type <= PIPE_QUERY_TYPE_U32) {
967 ice->vtbl.store_register_mem32(batch, CS_GPR(0),
968 iris_resource_bo(p_res),
969 offset, predicated);
970 } else {
971 ice->vtbl.store_register_mem64(batch, CS_GPR(0),
972 iris_resource_bo(p_res),
973 offset, predicated);
974 }
975 }
976
977 static void
978 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
979 {
980 struct iris_context *ice = (void *) ctx;
981
982 if (ice->state.statistics_counters_enabled == enable)
983 return;
984
985 // XXX: most packets aren't paying attention to this yet, because it'd
986 // have to be done dynamically at draw time, which is a pain
987 ice->state.statistics_counters_enabled = enable;
988 ice->state.dirty |= IRIS_DIRTY_CLIP |
989 IRIS_DIRTY_GS |
990 IRIS_DIRTY_RASTER |
991 IRIS_DIRTY_STREAMOUT |
992 IRIS_DIRTY_TCS |
993 IRIS_DIRTY_TES |
994 IRIS_DIRTY_VS |
995 IRIS_DIRTY_WM;
996 }
997
998 static void
999 set_predicate_enable(struct iris_context *ice, bool value)
1000 {
1001 if (value)
1002 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1003 else
1004 ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
1005 }
1006
1007 static void
1008 set_predicate_for_result(struct iris_context *ice,
1009 struct iris_query *q,
1010 bool inverted)
1011 {
1012 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
1013
1014 /* The CPU doesn't have the query result yet; use hardware predication */
1015 ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
1016
1017 /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
1018 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
1019 q->stalled = true;
1020
1021 switch (q->type) {
1022 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1023 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1024 overflow_result_to_gpr0(ice, q);
1025
1026 ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
1027 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
1028 break;
1029 default:
1030 /* PIPE_QUERY_OCCLUSION_* */
1031 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
1032 offsetof(struct iris_query_snapshots, start));
1033 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
1034 offsetof(struct iris_query_snapshots, end));
1035 break;
1036 }
1037
1038 uint32_t mi_predicate = MI_PREDICATE |
1039 MI_PREDICATE_COMBINEOP_SET |
1040 MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
1041 (inverted ? MI_PREDICATE_LOADOP_LOAD
1042 : MI_PREDICATE_LOADOP_LOADINV);
1043 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
1044
1045 /* We immediately set the predicate on the render batch, as all the
1046 * counters come from 3D operations. However, we may need to predicate
1047 * a compute dispatch, which executes in a different GEM context and has
1048 * a different MI_PREDICATE_DATA register. So, we save the result to
1049 * memory and reload it in iris_launch_grid.
1050 */
1051 unsigned offset = offsetof(struct iris_query_snapshots, predicate_data);
1052 ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
1053 q->bo, offset, false);
1054 ice->state.compute_predicate = q->bo;
1055 }
1056
1057 static void
1058 iris_render_condition(struct pipe_context *ctx,
1059 struct pipe_query *query,
1060 boolean condition,
1061 enum pipe_render_cond_flag mode)
1062 {
1063 struct iris_context *ice = (void *) ctx;
1064 struct iris_query *q = (void *) query;
1065
1066 if (!q) {
1067 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1068 return;
1069 }
1070
1071 iris_check_query_no_flush(ice, q);
1072
1073 if (q->result || q->ready) {
1074 set_predicate_enable(ice, (q->result != 0) ^ condition);
1075 } else {
1076 if (mode == PIPE_RENDER_COND_NO_WAIT ||
1077 mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
1078 perf_debug(&ice->dbg, "Conditional rendering demoted from "
1079 "\"no wait\" to \"wait\".");
1080 }
1081 set_predicate_for_result(ice, q, condition);
1082 }
1083 }
1084
1085 void
1086 iris_init_query_functions(struct pipe_context *ctx)
1087 {
1088 ctx->create_query = iris_create_query;
1089 ctx->destroy_query = iris_destroy_query;
1090 ctx->begin_query = iris_begin_query;
1091 ctx->end_query = iris_end_query;
1092 ctx->get_query_result = iris_get_query_result;
1093 ctx->get_query_result_resource = iris_get_query_result_resource;
1094 ctx->set_active_query_state = iris_set_active_query_state;
1095 ctx->render_condition = iris_render_condition;
1096 }
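/*
 * For reference, a gallium frontend drives these hooks roughly as follows
 * (an illustrative sketch, not code from this driver):
 *
 *    struct pipe_query *q =
 *       ctx->create_query(ctx, PIPE_QUERY_PRIMITIVES_GENERATED, 0);
 *    ctx->begin_query(ctx, q);
 *    ...draw calls...
 *    ctx->end_query(ctx, q);
 *
 *    union pipe_query_result result;
 *    if (ctx->get_query_result(ctx, q, true, &result))
 *       printf("primitives generated: %" PRIu64 "\n", result.u64);
 *    ctx->destroy_query(ctx, q);
 */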