/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * Query object support.  This allows measuring various simple statistics
 * via counters on the GPU.
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "util/fast_idiv_by_const.h"
#include "util/u_inlines.h"
#include "iris_context.h"
#include "iris_defines.h"
#include "iris_resource.h"
#include "iris_screen.h"
#include "vulkan/util/vk_util.h"

#define IA_VERTICES_COUNT          0x2310
#define IA_PRIMITIVES_COUNT        0x2318
#define VS_INVOCATION_COUNT        0x2320
#define HS_INVOCATION_COUNT        0x2300
#define DS_INVOCATION_COUNT        0x2308
#define GS_INVOCATION_COUNT        0x2328
#define GS_PRIMITIVES_COUNT        0x2330
#define CL_INVOCATION_COUNT        0x2338
#define CL_PRIMITIVES_COUNT        0x2340
#define PS_INVOCATION_COUNT        0x2348
#define CS_INVOCATION_COUNT        0x2290
#define PS_DEPTH_COUNT             0x2350

#define SO_PRIM_STORAGE_NEEDED(n)  (0x5240 + (n) * 8)

#define SO_NUM_PRIMS_WRITTEN(n)    (0x5200 + (n) * 8)

#define MI_MATH (0x1a << 23)

#define MI_ALU_LOAD      0x080
#define MI_ALU_LOADINV   0x480
#define MI_ALU_LOAD0     0x081
#define MI_ALU_LOAD1     0x481
#define MI_ALU_ADD       0x100
#define MI_ALU_SUB       0x101
#define MI_ALU_AND       0x102
#define MI_ALU_OR        0x103
#define MI_ALU_XOR       0x104
#define MI_ALU_STORE     0x180
#define MI_ALU_STOREINV  0x580

#define MI_ALU_R0        0x00
#define MI_ALU_R1        0x01
#define MI_ALU_R2        0x02
#define MI_ALU_R3        0x03
#define MI_ALU_R4        0x04
#define MI_ALU_SRCA      0x20
#define MI_ALU_SRCB      0x21
#define MI_ALU_ACCU      0x31
#define MI_ALU_ZF        0x32
#define MI_ALU_CF        0x33

#define _MI_ALU(op, x, y)  (((op) << 20) | ((x) << 10) | (y))

#define _MI_ALU0(op)       _MI_ALU(MI_ALU_##op, 0, 0)
#define _MI_ALU1(op, x)    _MI_ALU(MI_ALU_##op, x, 0)
#define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)

#define MI_ALU0(op)        _MI_ALU0(op)
#define MI_ALU1(op, x)     _MI_ALU1(op, MI_ALU_##x)
#define MI_ALU2(op, x, y)  _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
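
/* As a worked example, MI_ALU2(LOAD, SRCA, R0) packs to
 * (0x080 << 20) | (0x20 << 10) | 0x00 == 0x08008000: a single ALU dword
 * that loads GPR0 into operand A of the command streamer's ALU.
 */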

#define emit_lri32 ice->vtbl.load_register_imm32
#define emit_lri64 ice->vtbl.load_register_imm64
#define emit_lrr32 ice->vtbl.load_register_reg32

struct iris_query {
   enum pipe_query_type type;
   int index;

   bool ready;
   bool stalled;

   uint64_t result;

   struct iris_bo *bo;
   struct iris_query_snapshots *map;

   int batch_idx;
};

struct iris_query_snapshots {
   /** iris_render_condition's saved MI_PREDICATE_DATA value. */
   uint64_t predicate_data;

   /** Have the start/end snapshots landed? */
   uint64_t snapshots_landed;

   /** Starting and ending counter snapshots */
   uint64_t start;
   uint64_t end;
};

struct iris_query_so_overflow {
   uint64_t predicate_data;
   uint64_t snapshots_landed;

   struct {
      uint64_t prim_storage_needed[2];
      uint64_t num_prims[2];
   } stream[4];
};
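
/* Note that predicate_data and snapshots_landed occupy the same offsets as
 * in struct iris_query_snapshots, so code that only touches those fields
 * can treat q->map as either layout (see the (void *) casts in
 * stream_overflowed and calculate_result_on_cpu).
 */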

/**
 * Is this type of query written by PIPE_CONTROL?
 */
static bool
iris_is_query_pipelined(struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_TIME_ELAPSED:
      return true;

   default:
      return false;
   }
}

static void
mark_available(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
   unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);

   if (!iris_is_query_pipelined(q)) {
      ice->vtbl.store_data_imm64(batch, q->bo, offset, true);
   } else {
      /* Order available *after* the query results. */
      flags |= PIPE_CONTROL_FLUSH_ENABLE;
      iris_emit_pipe_control_write(batch, flags, q->bo, offset, true);
   }
}

/**
 * Write PS_DEPTH_COUNT to q->(dest) via a PIPE_CONTROL.
 */
static void
iris_pipelined_write(struct iris_batch *batch,
                     struct iris_query *q,
                     enum pipe_control_flags flags,
                     unsigned offset)
{
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   const unsigned optional_cs_stall =
      devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;

   iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
                                q->bo, offset, 0ull);
}

static void
write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;

   if (!iris_is_query_pipelined(q)) {
      iris_emit_pipe_control_flush(batch,
                                   PIPE_CONTROL_CS_STALL |
                                   PIPE_CONTROL_STALL_AT_SCOREBOARD);
      q->stalled = true;
   }

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      if (devinfo->gen >= 10) {
         /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
          *  bit set prior to programming a PIPE_CONTROL with Write PS Depth
          *  Count sync operation."
          */
         iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
      }
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_DEPTH_COUNT |
                           PIPE_CONTROL_DEPTH_STALL,
                           offset);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_TIMESTAMP,
                           offset);
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      ice->vtbl.store_register_mem64(batch,
                                     q->index == 0 ? CL_INVOCATION_COUNT :
                                     SO_PRIM_STORAGE_NEEDED(q->index),
                                     q->bo, offset, false);
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      ice->vtbl.store_register_mem64(batch,
                                     SO_NUM_PRIMS_WRITTEN(q->index),
                                     q->bo, offset, false);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS: {
      static const uint32_t index_to_reg[] = {
         IA_VERTICES_COUNT,
         IA_PRIMITIVES_COUNT,
         VS_INVOCATION_COUNT,
         GS_INVOCATION_COUNT,
         GS_PRIMITIVES_COUNT,
         CL_INVOCATION_COUNT,
         CL_PRIMITIVES_COUNT,
         PS_INVOCATION_COUNT,
         HS_INVOCATION_COUNT,
         DS_INVOCATION_COUNT,
         CS_INVOCATION_COUNT,
      };
      const uint32_t reg = index_to_reg[q->index];

      ice->vtbl.store_register_mem64(batch, reg, q->bo, offset, false);
      break;
   }
   default:
      assert(false);
   }
}

static void
write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;

   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
   for (uint32_t i = 0; i < count; i++) {
      int s = q->index + i;
      int g_idx = offsetof(struct iris_query_so_overflow,
                           stream[s].num_prims[end]);
      int w_idx = offsetof(struct iris_query_so_overflow,
                           stream[s].prim_storage_needed[end]);
      ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
                                     q->bo, g_idx, false);
      ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
                                     q->bo, w_idx, false);
   }
}

static uint64_t
iris_timebase_scale(const struct gen_device_info *devinfo,
                    uint64_t gpu_timestamp)
{
   return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
}
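
/* For example, a GPU with an 80ns tick (a 12.5 MHz timestamp frequency)
 * scales a raw delta of 1250 ticks to 1250 * 1000000000 / 12500000 ==
 * 100000ns.  Multiplying before dividing preserves precision, at the cost
 * that raw values above ~2^34 would overflow the 64-bit intermediate.
 */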

static uint64_t
iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
{
   if (time0 > time1) {
      return (1ULL << TIMESTAMP_BITS) + time1 - time0;
   } else {
      return time1 - time0;
   }
}

static bool
stream_overflowed(struct iris_query_so_overflow *so, int s)
{
   return (so->stream[s].prim_storage_needed[1] -
           so->stream[s].prim_storage_needed[0]) !=
          (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
}
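
/* SO_PRIM_STORAGE_NEEDED counts primitives a stream wanted to write, while
 * SO_NUM_PRIMS_WRITTEN counts those actually written; the two deltas over
 * the query interval differ exactly when some primitive was dropped for
 * lack of transform feedback buffer space, i.e. the stream overflowed.
 */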

static void
calculate_result_on_cpu(const struct gen_device_info *devinfo,
                        struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      q->result = q->map->end != q->map->start;
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* The timestamp is the single starting snapshot. */
      q->result = iris_timebase_scale(devinfo, q->map->start);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
      q->result = iris_timebase_scale(devinfo, q->result);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      q->result = stream_overflowed((void *) q->map, q->index);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      q->result = false;
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
         q->result |= stream_overflowed((void *) q->map, i);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      q->result = q->map->end - q->map->start;

      /* WaDividePSInvocationCountBy4:HSW,BDW */
      if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
         q->result /= 4;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   default:
      q->result = q->map->end - q->map->start;
      break;
   }

   q->ready = true;
}

static void
emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
             unsigned reg_a, unsigned reg_b)
{
   uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));

   math[0] = MI_MATH | (5 - 2);
   math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
   math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
   math[3] = _MI_ALU0(ADD);
   math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
}

static void
emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
             unsigned src_reg, unsigned shift)
{
   assert(shift > 0);

   int dwords = 1 + 4 * shift;

   uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);

   math[0] = MI_MATH | ((1 + 4 * shift) - 2);

   for (unsigned i = 0; i < shift; i++) {
      unsigned add_src = (i == 0) ? src_reg : dst_reg;
      math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
      math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
      math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
      math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
   }
}
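
/* The command streamer ALU has no shift instruction, so a left shift by
 * 'shift' is emitted as that many self-additions (x + x == x << 1).
 * A shift of 3, for example, costs an MI_MATH header plus 12 ALU dwords.
 */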

/* Emit dwords to multiply GPR0 by N */
static void
build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
{
   VK_OUTARRAY_MAKE(out, dw, dw_count);

#define APPEND_ALU(op, x, y) \
   vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)

   assert(N > 0);
   unsigned top_bit = 31 - __builtin_clz(N);
   for (int i = top_bit - 1; i >= 0; i--) {
      /* We get our initial data in GPR0 and we write the final data out to
       * GPR0 but we use GPR1 as our scratch register.
       */
      unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
      unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;

      /* Shift the current value left by 1 */
      APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
      APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
      APPEND_ALU(ADD, 0, 0);

      if (N & (1 << i)) {
         /* Store ACCU to R1 and add R0 to R1 */
         APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
         APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
         APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
         APPEND_ALU(ADD, 0, 0);
      }

      APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
   }

#undef APPEND_ALU
}
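
/* This is classic shift-and-add multiplication, walking N's bits from the
 * top.  For N == 6 (0b110), top_bit == 2 and the loop runs for i = 1, 0:
 *
 *    i = 1: R1 = R0 + R0  (2x); bit 1 is set, so R1 = R1 + R0  (3x)
 *    i = 0: R0 = R1 + R1  (6x); bit 0 is clear, so no extra add
 */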

static void
emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
{
   uint32_t num_dwords;
   build_alu_multiply_gpr0(NULL, &num_dwords, N);

   uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
   math[0] = MI_MATH | (num_dwords - 2);
   build_alu_multiply_gpr0(&math[1], &num_dwords, N);
}

void
iris_math_div32_gpr0(struct iris_context *ice,
                     struct iris_batch *batch,
                     uint32_t D)
{
   /* Zero out the top of GPR0 */
   emit_lri32(batch, CS_GPR(0) + 4, 0);

   if (D == 0) {
      /* This is invalid, but we should do something so we set GPR0 to 0. */
      emit_lri32(batch, CS_GPR(0), 0);
   } else if (util_is_power_of_two_or_zero(D)) {
      unsigned log2_D = util_logbase2(D);
      assert(log2_D < 32);
      /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and
       * taking the top 32 bits of the result.
       */
      emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);
   } else {
      struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
      assert(m.multiplier <= UINT32_MAX);

      if (m.pre_shift) {
         /* We right-shift by L by left-shifting by 32 - l and taking the top
          * 32 bits of the result.
          */
         if (m.pre_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }

      /* Do the 32x32 multiply into gpr0 */
      emit_mul_gpr0(batch, m.multiplier);

      if (m.increment) {
         /* Add the multiplier to the result: n * M + M == (n + 1) * M. */
         emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
         emit_lri32(batch, CS_GPR(1) + 4, 0);
         emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
      }

      /* Shift by 32 */
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);

      if (m.post_shift) {
         /* We right-shift by L by left-shifting by 32 - l and taking the top
          * 32 bits of the result.
          */
         if (m.post_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }
   }
}
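
/* The final 'else' branch above is the standard fast-division-by-constant
 * technique: n / D == (((n >> pre_shift) + increment) * M) >>
 * (32 + post_shift), using the magic multiplier M derived by
 * util_compute_fast_udiv_info.  For D == 3, for instance, it yields
 * M == 0xAAAAAAAB with no pre-shift and post_shift == 1, so
 * n / 3 == (n * 0xAAAAAAAB) >> 33.
 */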

/*
 * GPR0 = (GPR0 == 0) ? 0 : 1;
 */
static void
gpr0_to_bool(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);

   static const uint32_t math[] = {
      MI_MATH | (9 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU1(LOAD0, SRCB),
      MI_ALU0(ADD),
      MI_ALU2(STOREINV, R0, ZF),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}
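
/* The zero-flag trick above: adding zero to GPR0 makes the ALU update ZF,
 * STOREINV then writes the inverted flag (all ones iff GPR0 was nonzero)
 * back to GPR0, and ANDing with the 1 preloaded into GPR1 leaves exactly
 * 0 or 1 behind.
 */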

static void
load_overflow_data_to_cs_gprs(struct iris_context *ice,
                              struct iris_query *q,
                              int idx)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[1]));

   ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[1]));
}

/*
 * R3 = R4 - R3;   (num_prims delta)
 * R1 = R2 - R1;   (prim_storage_needed delta)
 * R1 = R3 - R1;   (zero iff the deltas match)
 * R0 = R0 | R1;
 */
static void
calc_overflow_for_stream(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   static const uint32_t maths[] = {
      MI_MATH | (17 - 2),
      MI_ALU2(LOAD, SRCA, R4),
      MI_ALU2(LOAD, SRCB, R3),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R3, ACCU),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R3),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R1),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(OR),
      MI_ALU2(STORE, R0, ACCU),
   };

   iris_batch_emit(batch, maths, sizeof(maths));
}

static void
overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
      load_overflow_data_to_cs_gprs(ice, q, q->index);
      calc_overflow_for_stream(ice);
   } else {
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
         load_overflow_data_to_cs_gprs(ice, q, i);
         calc_overflow_for_stream(ice);
      }
   }

   gpr0_to_bool(ice);
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}

/*
 * Emit MI_MATH commands to shift GPR0 left by 30 bits, as five MI_MATH
 * packets of six one-bit (self-add) shifts each.
 */
static void
shl_gpr0_by_30_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(ice, 34);

   static const uint32_t shl_math[] = {
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(ADD),
      MI_ALU2(STORE, R0, ACCU),
   };

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
   const uint32_t batch_len = cmd_len * outer_count;
   uint32_t *map = iris_get_command_space(batch, batch_len * 4);
   uint32_t offset = 0;
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      map[offset++] = MI_MATH | (cmd_len - 2);
      for (int i = 0; i < inner_count; i++) {
         memcpy(&map[offset], shl_math, sizeof(shl_math));
         offset += 4;
      }
   }
}

/*
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   shl_gpr0_by_30_bits(ice);
   ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
}
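
/* Shifting right by 2 is implemented as a left shift by 30: GPR0 is first
 * masked to its low 34 bits (34 + 30 == 64, so nothing significant spills),
 * and after the shift the desired value is the high dword, which the
 * register moves above transfer into the low dword before clearing the top.
 */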

/**
 * Calculate the result and store it to CS_GPR0.
 */
static void
calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &batch->screen->devinfo;

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
      overflow_result_to_gpr0(ice, q);
      return;
   }

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      ice->vtbl.load_register_mem64(batch, CS_GPR(0), q->bo,
                                    offsetof(struct iris_query_snapshots, start));
      /* TODO: This discards any fractional bits of the timebase scale.
       * We would need to do a bit of fixed point math on the CS ALU, or
       * launch an actual shader to calculate this with full precision.
       */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
      keep_gpr0_lower_n_bits(ice, 36);
      return;
   }

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
                                 offsetof(struct iris_query_snapshots, start));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
                                 offsetof(struct iris_query_snapshots, end));

   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));

   /* WaDividePSInvocationCountBy4:HSW,BDW */
   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS &&
       q->index == PIPE_STAT_QUERY_PS_INVOCATIONS && devinfo->gen == 8)
      shr_gpr0_by_2_bits(ice);

   if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
      gpr0_to_bool(ice);

   if (q->type == PIPE_QUERY_TIME_ELAPSED) {
      /* TODO: This discards fractional bits (see above). */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
   }
}

static struct pipe_query *
iris_create_query(struct pipe_context *ctx,
                  unsigned query_type,
                  unsigned index)
{
   struct iris_query *q = calloc(1, sizeof(struct iris_query));

   q->type = query_type;
   q->index = index;

   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS &&
       q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
      q->batch_idx = IRIS_BATCH_COMPUTE;
   else
      q->batch_idx = IRIS_BATCH_RENDER;
   return (struct pipe_query *) q;
}

static void
iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
{
   struct iris_query *query = (void *) p_query;
   iris_bo_unreference(query->bo);
   free(query);
}

static bool
iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_screen *screen = (void *) ctx->screen;
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   iris_bo_unreference(q->bo);
   q->bo = iris_bo_alloc(screen->bufmgr, "query object", 4096,
                         IRIS_MEMZONE_OTHER);
   if (!q->bo)
      return false;

   q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_WRITE | MAP_ASYNC);
   if (!q->map)
      return false;

   q->result = 0ull;
   q->ready = false;
   q->map->snapshots_landed = false;

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = true;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, false);
   else
      write_value(ice, q, offsetof(struct iris_query_snapshots, start));

   return true;
}

static bool
iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      iris_begin_query(ctx, query);
      mark_available(ice, q);
      return true;
   }

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = false;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, true);
   else
      write_value(ice, q, offsetof(struct iris_query_snapshots, end));
   mark_available(ice, q);

   return true;
}

/**
 * See if the snapshots have landed for a query, and if so, compute the
 * result and mark it ready.  Does not flush (unlike iris_get_query_result).
 */
static void
iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
{
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &screen->devinfo;

   if (!q->ready && q->map->snapshots_landed) {
      calculate_result_on_cpu(devinfo, q);
   }
}

static boolean
iris_get_query_result(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean wait,
                      union pipe_query_result *result)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_screen *screen = (void *) ctx->screen;
   const struct gen_device_info *devinfo = &screen->devinfo;

   if (!q->ready) {
      if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
         iris_batch_flush(&ice->batches[q->batch_idx]);

      if (!q->map->snapshots_landed) {
         if (wait)
            iris_bo_wait_rendering(q->bo);
         else
            return false;
      }

      assert(q->map->snapshots_landed);
      calculate_result_on_cpu(devinfo, q);
   }

   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS) {
      switch (q->index) {
      case PIPE_STAT_QUERY_IA_VERTICES:
         result->pipeline_statistics.ia_vertices = q->result;
         break;
      case PIPE_STAT_QUERY_IA_PRIMITIVES:
         result->pipeline_statistics.ia_primitives = q->result;
         break;
      case PIPE_STAT_QUERY_VS_INVOCATIONS:
         result->pipeline_statistics.vs_invocations = q->result;
         break;
      case PIPE_STAT_QUERY_GS_INVOCATIONS:
         result->pipeline_statistics.gs_invocations = q->result;
         break;
      case PIPE_STAT_QUERY_GS_PRIMITIVES:
         result->pipeline_statistics.gs_primitives = q->result;
         break;
      case PIPE_STAT_QUERY_C_INVOCATIONS:
         result->pipeline_statistics.c_invocations = q->result;
         break;
      case PIPE_STAT_QUERY_C_PRIMITIVES:
         result->pipeline_statistics.c_primitives = q->result;
         break;
      case PIPE_STAT_QUERY_PS_INVOCATIONS:
         result->pipeline_statistics.ps_invocations = q->result;
         break;
      case PIPE_STAT_QUERY_HS_INVOCATIONS:
         result->pipeline_statistics.hs_invocations = q->result;
         break;
      case PIPE_STAT_QUERY_DS_INVOCATIONS:
         result->pipeline_statistics.ds_invocations = q->result;
         break;
      case PIPE_STAT_QUERY_CS_INVOCATIONS:
         result->pipeline_statistics.cs_invocations = q->result;
         break;
      }
   } else {
      result->u64 = q->result;
   }

   return true;
}

static void
iris_get_query_result_resource(struct pipe_context *ctx,
                               struct pipe_query *query,
                               boolean wait,
                               enum pipe_query_value_type result_type,
                               int index,
                               struct pipe_resource *p_res,
                               unsigned offset)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_resource *res = (void *) p_res;
   unsigned snapshots_landed_offset =
      offsetof(struct iris_query_snapshots, snapshots_landed);

   res->bind_history |= PIPE_BIND_QUERY_BUFFER;

   if (index == -1) {
      /* They're asking for the availability of the result.  If we still
       * have commands queued up which produce the result, submit them
       * now so that progress happens.  Either way, copy the snapshots
       * landed field to the destination resource.
       */
      if (iris_batch_references(batch, q->bo))
         iris_batch_flush(batch);

      ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
                             q->bo, snapshots_landed_offset,
                             result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
      return;
   }

   if (!q->ready && q->map->snapshots_landed) {
      /* The final snapshots happen to have landed, so let's just compute
       * the result on the CPU now...
       */
      calculate_result_on_cpu(devinfo, q);
   }

   if (q->ready) {
      /* We happen to have the result on the CPU, so just copy it. */
      if (result_type <= PIPE_QUERY_TYPE_U32) {
         ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      } else {
         ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      }

      /* Make sure the result lands before they bind the QBO elsewhere
       * and use the result.
       */
      // XXX: Why?  i965 doesn't do this.
      iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
      return;
   }

   /* Calculate the result to CS_GPR0 */
   calculate_result_on_gpu(ice, q);

   bool predicated = !wait && !q->stalled;
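
   /* If we couldn't guarantee above that the snapshots have landed (no
    * wait and no stall), guard the result store with MI_PREDICATE: SRC0 is
    * loaded from snapshots_landed and compared against a zero SRC1 with
    * LOADINV, so the predicated store below only takes effect once the
    * snapshots are actually present in memory.
    */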

   if (predicated) {
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
                                    snapshots_landed_offset);
      uint32_t predicate = MI_PREDICATE |
                           MI_PREDICATE_LOADOP_LOADINV |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
      iris_batch_emit(batch, &predicate, sizeof(uint32_t));
   }

   if (result_type <= PIPE_QUERY_TYPE_U32) {
      ice->vtbl.store_register_mem32(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   } else {
      ice->vtbl.store_register_mem64(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   }
}

static void
iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
{
   struct iris_context *ice = (void *) ctx;

   if (ice->state.statistics_counters_enabled == enable)
      return;

   // XXX: most packets aren't paying attention to this yet, because it'd
   // have to be done dynamically at draw time, which is a pain
   ice->state.statistics_counters_enabled = enable;
   ice->state.dirty |= IRIS_DIRTY_CLIP |
                       IRIS_DIRTY_GS |
                       IRIS_DIRTY_RASTER |
                       IRIS_DIRTY_STREAMOUT |
                       IRIS_DIRTY_TCS |
                       IRIS_DIRTY_TES |
                       IRIS_DIRTY_VS |
                       IRIS_DIRTY_WM;
}

static void
set_predicate_enable(struct iris_context *ice, bool value)
{
   if (value)
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
   else
      ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
}

static void
set_predicate_for_result(struct iris_context *ice,
                         struct iris_query *q,
                         bool inverted)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   /* The CPU doesn't have the query result yet; use hardware predication */
   ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;

   /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
   iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
   q->stalled = true;

   switch (q->type) {
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      overflow_result_to_gpr0(ice, q);

      ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      break;
   default:
      /* PIPE_QUERY_OCCLUSION_* */
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
                                    offsetof(struct iris_query_snapshots, start));
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
                                    offsetof(struct iris_query_snapshots, end));
      break;
   }

   uint32_t mi_predicate = MI_PREDICATE |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
                           (inverted ? MI_PREDICATE_LOADOP_LOAD
                                     : MI_PREDICATE_LOADOP_LOADINV);
   iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));

   /* We immediately set the predicate on the render batch, as all the
    * counters come from 3D operations.  However, we may need to predicate
    * a compute dispatch, which executes in a different GEM context and has
    * a different MI_PREDICATE_DATA register.  So, we save the result to
    * memory and reload it in iris_launch_grid.
    */
   unsigned offset = offsetof(struct iris_query_snapshots, predicate_data);
   ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
                                  q->bo, offset, false);
   ice->state.compute_predicate = q->bo;
}

static void
iris_render_condition(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean condition,
                      enum pipe_render_cond_flag mode)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   if (!q) {
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
      return;
   }

   iris_check_query_no_flush(ice, q);

   if (q->result || q->ready) {
      set_predicate_enable(ice, (q->result != 0) ^ condition);
   } else {
      if (mode == PIPE_RENDER_COND_NO_WAIT ||
          mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
         perf_debug(&ice->dbg, "Conditional rendering demoted from "
                    "\"no wait\" to \"wait\".");
      }
      set_predicate_for_result(ice, q, condition);
   }
}

void
iris_init_query_functions(struct pipe_context *ctx)
{
   ctx->create_query = iris_create_query;
   ctx->destroy_query = iris_destroy_query;
   ctx->begin_query = iris_begin_query;
   ctx->end_query = iris_end_query;
   ctx->get_query_result = iris_get_query_result;
   ctx->get_query_result_resource = iris_get_query_result_resource;
   ctx->set_active_query_state = iris_set_active_query_state;
   ctx->render_condition = iris_render_condition;
}