/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * Query object support.  This allows measuring various simple statistics
 * via counters on the GPU.
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "util/fast_idiv_by_const.h"
#include "util/u_inlines.h"
#include "util/u_upload_mgr.h"
#include "iris_context.h"
#include "iris_defines.h"
#include "iris_fence.h"
#include "iris_resource.h"
#include "iris_screen.h"
#include "vulkan/util/vk_util.h"

#define IA_VERTICES_COUNT          0x2310
#define IA_PRIMITIVES_COUNT        0x2318
#define VS_INVOCATION_COUNT        0x2320
#define HS_INVOCATION_COUNT        0x2300
#define DS_INVOCATION_COUNT        0x2308
#define GS_INVOCATION_COUNT        0x2328
#define GS_PRIMITIVES_COUNT        0x2330
#define CL_INVOCATION_COUNT        0x2338
#define CL_PRIMITIVES_COUNT        0x2340
#define PS_INVOCATION_COUNT        0x2348
#define CS_INVOCATION_COUNT        0x2290
#define PS_DEPTH_COUNT             0x2350

#define SO_PRIM_STORAGE_NEEDED(n)  (0x5240 + (n) * 8)
#define SO_NUM_PRIMS_WRITTEN(n)    (0x5200 + (n) * 8)

#define MI_MATH (0x1a << 23)

#define MI_ALU_LOAD      0x080
#define MI_ALU_LOADINV   0x480
#define MI_ALU_LOAD0     0x081
#define MI_ALU_LOAD1     0x481
#define MI_ALU_ADD       0x100
#define MI_ALU_SUB       0x101
#define MI_ALU_AND       0x102
#define MI_ALU_OR        0x103
#define MI_ALU_XOR       0x104
#define MI_ALU_STORE     0x180
#define MI_ALU_STOREINV  0x580

#define MI_ALU_SRCA      0x20
#define MI_ALU_SRCB      0x21
#define MI_ALU_ACCU      0x31
#define MI_ALU_ZF        0x32
#define MI_ALU_CF        0x33

#define emit_lri32 ice->vtbl.load_register_imm32
#define emit_lri64 ice->vtbl.load_register_imm64
#define emit_lrr32 ice->vtbl.load_register_reg32

struct iris_query {
   enum pipe_query_type type;
   int index;

   bool ready;

   bool stalled;

   uint64_t result;

   struct iris_state_ref query_state_ref;
   struct iris_query_snapshots *map;
   struct iris_syncpt *syncpt;

   int batch_idx;
};

struct iris_query_snapshots {
   /** iris_render_condition's saved MI_PREDICATE_RESULT value. */
   uint64_t predicate_result;

   /** Have the start/end snapshots landed? */
   uint64_t snapshots_landed;

   /** Starting and ending counter snapshots */
   uint64_t start;
   uint64_t end;
};

struct iris_query_so_overflow {
   uint64_t predicate_result;
   uint64_t snapshots_landed;

   struct {
      uint64_t prim_storage_needed[2];
      uint64_t num_prims[2];
   } stream[4];
};

/**
 * Is this type of query written by PIPE_CONTROL?
 */
static bool
iris_is_query_pipelined(struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_TIME_ELAPSED:
      return true;

   default:
      return false;
   }
}

static void
mark_available(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
   unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   offset += q->query_state_ref.offset;

   if (!iris_is_query_pipelined(q)) {
      ice->vtbl.store_data_imm64(batch, bo, offset, true);
   } else {
      /* Order available *after* the query results. */
      flags |= PIPE_CONTROL_FLUSH_ENABLE;
      iris_emit_pipe_control_write(batch, flags, bo, offset, true);
   }
}

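/* Ordering note: for non-pipelined queries the snapshot writes stalled the
 * pipe, so an immediate store of the available bit is safe.  For pipelined
 * queries, PIPE_CONTROL_FLUSH_ENABLE is what keeps this write from landing
 * before the query results themselves.
 */
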
/**
 * Write PS_DEPTH_COUNT to q->(dest) via a PIPE_CONTROL.
 */
static void
iris_pipelined_write(struct iris_batch *batch,
                     struct iris_query *q,
                     enum pipe_control_flags flags,
                     unsigned offset)
{
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   const unsigned optional_cs_stall =
      devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
                                bo, offset, 0ull);
}

static void
write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   if (!iris_is_query_pipelined(q)) {
      iris_emit_pipe_control_flush(batch,
                                   PIPE_CONTROL_CS_STALL |
                                   PIPE_CONTROL_STALL_AT_SCOREBOARD);
      q->stalled = true;
   }

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      if (devinfo->gen >= 10) {
         /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
          *  bit set prior to programming a PIPE_CONTROL with Write PS Depth
          *  Count sync operation."
          */
         iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
      }
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_DEPTH_COUNT |
                           PIPE_CONTROL_DEPTH_STALL,
                           offset);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_TIMESTAMP,
                           offset);
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      ice->vtbl.store_register_mem64(batch,
                                     q->index == 0 ? CL_INVOCATION_COUNT :
                                     SO_PRIM_STORAGE_NEEDED(q->index),
                                     bo, offset, false);
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      ice->vtbl.store_register_mem64(batch,
                                     SO_NUM_PRIMS_WRITTEN(q->index),
                                     bo, offset, false);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
      /* Indexed to match the PIPE_STAT_QUERY_* order. */
      static const uint32_t index_to_reg[] = {
         IA_VERTICES_COUNT,
         IA_PRIMITIVES_COUNT,
         VS_INVOCATION_COUNT,
         GS_INVOCATION_COUNT,
         GS_PRIMITIVES_COUNT,
         CL_INVOCATION_COUNT,
         CL_PRIMITIVES_COUNT,
         PS_INVOCATION_COUNT,
         HS_INVOCATION_COUNT,
         DS_INVOCATION_COUNT,
         CS_INVOCATION_COUNT,
      };
      const uint32_t reg = index_to_reg[q->index];

      ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
      break;
   }
   default:
      assert(false);
   }
}

static void
write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   uint32_t offset = q->query_state_ref.offset;

   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
   for (uint32_t i = 0; i < count; i++) {
      int s = q->index + i;
      int g_idx = offset + offsetof(struct iris_query_so_overflow,
                                    stream[s].num_prims[end]);
      int w_idx = offset + offsetof(struct iris_query_so_overflow,
                                    stream[s].prim_storage_needed[end]);
      ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
                                     bo, g_idx, false);
      ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
                                     bo, w_idx, false);
   }
}

static uint64_t
iris_timebase_scale(const struct gen_device_info *devinfo,
                    uint64_t gpu_timestamp)
{
   return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
}

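/* Worked example for the scale above: at a 12 MHz timestamp_frequency
 * (83.333 ns per tick), a raw delta of 120 ticks becomes
 * (1000000000 * 120) / 12000000 = 10000 ns.  Multiplying first, in 64
 * bits, avoids losing fractional precision to the integer division.
 */
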
static uint64_t
iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
{
   if (time0 > time1) {
      return (1ULL << TIMESTAMP_BITS) + time1 - time0;
   } else {
      return time1 - time0;
   }
}

static bool
stream_overflowed(struct iris_query_so_overflow *so, int s)
{
   return (so->stream[s].prim_storage_needed[1] -
           so->stream[s].prim_storage_needed[0]) !=
          (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
}

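/* A stream overflowed if, between the start and end snapshots, the number
 * of primitives needing storage diverged from the number actually written,
 * i.e. some primitives could not be recorded into the streamout buffers.
 */
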
static void
calculate_result_on_cpu(const struct gen_device_info *devinfo,
                        struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      q->result = q->map->end != q->map->start;
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* The timestamp is the single starting snapshot. */
      q->result = iris_timebase_scale(devinfo, q->map->start);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
      q->result = iris_timebase_scale(devinfo, q->result);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      q->result = stream_overflowed((void *) q->map, q->index);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      q->result = false;
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
         q->result |= stream_overflowed((void *) q->map, i);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
      q->result = q->map->end - q->map->start;

      /* WaDividePSInvocationCountBy4:HSW,BDW */
      if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
         q->result /= 4;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   default:
      q->result = q->map->end - q->map->start;
      break;
   }

   q->ready = true;
}

static void
emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
             unsigned reg_a, unsigned reg_b)
{
   uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));

   math[0] = MI_MATH | (5 - 2);
   math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
   math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
   math[3] = _MI_ALU0(ADD);
   math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
}

static void
emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
             unsigned src_reg, unsigned shift)
{
   assert(shift > 0);

   int dwords = 1 + 4 * shift;

   uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);

   math[0] = MI_MATH | ((1 + 4 * shift) - 2);

   for (unsigned i = 0; i < shift; i++) {
      unsigned add_src = (i == 0) ? src_reg : dst_reg;
      math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
      math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
      math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
      math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
   }
}

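/* The MI_MATH ALU has no shift instruction, so x << shift is built above
 * from `shift` consecutive doublings (x + x).  E.g. shift == 3 emits three
 * ADD rounds: x -> 2x -> 4x -> 8x.
 */
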
/* Emit dwords to multiply GPR0 by N */
static void
build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
{
   VK_OUTARRAY_MAKE(out, dw, dw_count);

#define APPEND_ALU(op, x, y) \
   vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)

   assert(N > 0);
   unsigned top_bit = 31 - __builtin_clz(N);
   for (int i = top_bit - 1; i >= 0; i--) {
      /* We get our initial data in GPR0 and we write the final data out to
       * GPR0 but we use GPR1 as our scratch register.
       */
      unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
      unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;

      /* Shift the current value left by 1 */
      APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
      APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
      APPEND_ALU(ADD, 0, 0);

      if (N & (1 << i)) {
         /* Store ACCU to R1 and add R0 to R1 */
         APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
         APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
         APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
         APPEND_ALU(ADD, 0, 0);
      }

      APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
   }

#undef APPEND_ALU
}

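/* The builder above is classic shift-and-add binary multiplication: walk
 * N's bits from just below the top bit down to bit 0, doubling the running
 * value each step and adding the original GPR0 back in whenever the current
 * bit of N is set.  E.g. for N = 5 (0b101): double (2x), bit 1 clear;
 * double again (4x), bit 0 set, add x -> 5x.
 */
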
static void
emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
{
   uint32_t num_dwords;
   build_alu_multiply_gpr0(NULL, &num_dwords, N);

   uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
   math[0] = MI_MATH | (num_dwords - 2);
   build_alu_multiply_gpr0(&math[1], &num_dwords, N);
}

void
iris_math_div32_gpr0(struct iris_context *ice,
                     struct iris_batch *batch,
                     uint32_t D)
{
   /* Zero out the top of GPR0 */
   emit_lri32(batch, CS_GPR(0) + 4, 0);

   if (D == 0) {
      /* This is invalid, but we should do something so we set GPR0 to 0. */
      emit_lri32(batch, CS_GPR(0), 0);
   } else if (util_is_power_of_two_or_zero(D)) {
      unsigned log2_D = util_logbase2(D);
      assert(log2_D < 32);
      /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
       * the top 32 bits of the result.
       */
      emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);
   } else {
      struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
      assert(m.multiplier <= UINT32_MAX);

      if (m.pre_shift) {
         /* We right-shift by L by left-shifting by 32 - L and taking the top
          * 32 bits of the result.
          */
         if (m.pre_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }

      /* Do the 32x32 multiply into gpr0 */
      emit_mul_gpr0(batch, m.multiplier);

      if (m.increment) {
         /* If we need to increment, add the multiplier into GPR0. */
         emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
         emit_lri32(batch, CS_GPR(1) + 4, 0);
         emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
      }

      /* Shift by 32 */
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);

      if (m.post_shift) {
         /* We right-shift by L by left-shifting by 32 - L and taking the top
          * 32 bits of the result.
          */
         if (m.post_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }
   }
}

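/* The non-power-of-two path above uses the standard fixed-point-reciprocal
 * division trick from util_fast_idiv_by_const: roughly,
 *
 *    n / D == (((n >> pre_shift) * multiplier
 *               + (increment ? multiplier : 0)) >> 32) >> post_shift
 *
 * Each ">> 32" step is cheap on the command streamer: copy the high dword
 * of GPR0 into the low dword and zero the high dword.
 */
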
void
iris_math_add32_gpr0(struct iris_context *ice,
                     struct iris_batch *batch,
                     uint32_t x)
{
   emit_lri32(batch, CS_GPR(1), x);
   emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
}

/*
 * GPR0 = (GPR0 == 0) ? 0 : 1;
 */
static void
gpr0_to_bool(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);

   static const uint32_t math[] = {
      MI_MATH | (9 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU1(LOAD0, SRCB),
      MI_ALU0(ADD),
      MI_ALU2(STOREINV, R0, ZF),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}

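/* How the MI_MATH above turns GPR0 into a boolean: adding R0 to zero
 * (LOAD0) makes the ALU zero flag reflect whether R0 was 0; STOREINV of
 * ZF then writes all-ones exactly when R0 was nonzero, and ANDing with
 * the 1 preloaded into R1 squashes that to 0 or 1.
 */
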
static void
load_overflow_data_to_cs_gprs(struct iris_context *ice,
                              struct iris_query *q,
                              int idx)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   uint32_t offset = q->query_state_ref.offset;

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[1]));

   ice->vtbl.load_register_mem64(batch, CS_GPR(3), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(4), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[1]));
}

static void
calc_overflow_for_stream(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   static const uint32_t maths[] = {
      MI_MATH | (17 - 2),
      MI_ALU2(LOAD, SRCA, R4),
      MI_ALU2(LOAD, SRCB, R3),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R3, ACCU),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R3),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(XOR),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R1),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(OR),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, maths, sizeof(maths));
}

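/* With the GPR layout set up by load_overflow_data_to_cs_gprs
 * (R1/R2 = prim_storage_needed[0/1], R3/R4 = num_prims[0/1]), the
 * sequence above computes, per stream:
 *
 *    R3 = R4 - R3;    primitives actually written
 *    R1 = R2 - R1;    primitives that needed storage
 *    R1 = R3 ^ R1;    nonzero iff the two deltas differ (overflow)
 *    R0 = R0 | R1;    accumulate across streams
 */
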
static void
overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
      load_overflow_data_to_cs_gprs(ice, q, q->index);
      calc_overflow_for_stream(ice);
   } else {
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
         load_overflow_data_to_cs_gprs(ice, q, i);
         calc_overflow_for_stream(ice);
      }
   }

   gpr0_to_bool(ice);
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(ice, 34);

   static const uint32_t shl_math[] = {
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(ADD),
      MI_ALU2(STORE, R0, ACCU),
   };

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
   const uint32_t batch_len = cmd_len * outer_count;
   uint32_t *map = iris_get_command_space(batch, batch_len * 4);
   uint32_t offset = 0;
   for (int o = 0; o < outer_count; o++) {
      /* Emit inner_count copies of the doubling sequence per MI_MATH. */
      map[offset++] = MI_MATH | (cmd_len - 2);
      for (int i = 0; i < inner_count; i++) {
         memcpy(&map[offset], shl_math, sizeof(shl_math));
         offset += ARRAY_SIZE(shl_math);
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   shl_gpr0_by_30_bits(ice);
   ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
}

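/* Since the ALU cannot shift right either, ">> 2" is implemented as
 * "<< 30" followed by keeping only the upper half of the 64-bit GPR,
 * which is why bits above the low 34 of the original value are lost.
 */
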
/*
 * Calculate the result and store it to CS_GPR0.
 */
static void
calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   uint32_t offset = q->query_state_ref.offset;

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
      overflow_result_to_gpr0(ice, q);
      return;
   }

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      ice->vtbl.load_register_mem64(batch, CS_GPR(0), bo,
                                    offset +
                                    offsetof(struct iris_query_snapshots, start));
      /* TODO: This discards any fractional bits of the timebase scale.
       * We would need to do a bit of fixed point math on the CS ALU, or
       * launch an actual shader to calculate this with full precision.
       */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
      keep_gpr0_lower_n_bits(ice, 36);
      return;
   }

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo,
                                 offset +
                                 offsetof(struct iris_query_snapshots, start));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo,
                                 offset +
                                 offsetof(struct iris_query_snapshots, end));

   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));

   /* WaDividePSInvocationCountBy4:HSW,BDW */
   if (devinfo->gen == 8 &&
       q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
       q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
      shr_gpr0_by_2_bits(ice);

   if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
      gpr0_to_bool(ice);

   if (q->type == PIPE_QUERY_TIME_ELAPSED) {
      /* TODO: This discards fractional bits (see above). */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
   }
}

static struct pipe_query *
iris_create_query(struct pipe_context *ctx,
                  unsigned query_type,
                  unsigned index)
{
   struct iris_query *q = calloc(1, sizeof(struct iris_query));

   q->type = query_type;
   q->index = index;

   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
       q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
      q->batch_idx = IRIS_BATCH_COMPUTE;
   else
      q->batch_idx = IRIS_BATCH_RENDER;
   return (struct pipe_query *) q;
}

static void
iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
{
   struct iris_query *query = (void *) p_query;
   struct iris_screen *screen = (void *) ctx->screen;
   iris_syncpt_reference(screen, &query->syncpt, NULL);
   free(query);
}

static bool
iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   void *ptr = NULL;
   uint32_t size;

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      size = sizeof(struct iris_query_so_overflow);
   else
      size = sizeof(struct iris_query_snapshots);

   u_upload_alloc(ice->query_buffer_uploader, 0,
                  size, size, &q->query_state_ref.offset,
                  &q->query_state_ref.res, &ptr);

   if (!iris_resource_bo(q->query_state_ref.res))
      return false;

   q->map = ptr;
   q->result = 0ull;
   q->ready = false;
   WRITE_ONCE(q->map->snapshots_landed, false);

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = true;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, false);
   else
      write_value(ice, q,
                  q->query_state_ref.offset +
                  offsetof(struct iris_query_snapshots, start));

   return true;
}

static bool
iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      iris_begin_query(ctx, query);
      iris_batch_reference_signal_syncpt(batch, &q->syncpt);
      mark_available(ice, q);
      return true;
   }

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = false;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, true);
   else
      write_value(ice, q,
                  q->query_state_ref.offset +
                  offsetof(struct iris_query_snapshots, end));

   iris_batch_reference_signal_syncpt(batch, &q->syncpt);
   mark_available(ice, q);

   return true;
}

/**
 * See if the snapshots have landed for a query, and if so, compute the
 * result and mark it ready.  Does not flush (unlike iris_get_query_result).
 */
static void
iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
{
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &screen->devinfo;

   if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
      calculate_result_on_cpu(devinfo, q);
   }
}

static bool
iris_get_query_result(struct pipe_context *ctx,
                      struct pipe_query *query,
                      bool wait,
                      union pipe_query_result *result)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_screen *screen = (void *) ctx->screen;
   const struct gen_device_info *devinfo = &screen->devinfo;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   if (unlikely(screen->no_hw)) {
      result->u64 = 0;
      return true;
   }

   if (!q->ready) {
      if (iris_batch_references(&ice->batches[q->batch_idx], bo))
         iris_batch_flush(&ice->batches[q->batch_idx]);

      while (!READ_ONCE(q->map->snapshots_landed)) {
         if (wait)
            iris_wait_syncpt(ctx->screen, q->syncpt, INT64_MAX);
         else
            return false;
      }

      assert(READ_ONCE(q->map->snapshots_landed));
      calculate_result_on_cpu(devinfo, q);
   }

   assert(q->ready);

   result->u64 = q->result;

   return true;
}

static void
iris_get_query_result_resource(struct pipe_context *ctx,
                               struct pipe_query *query,
                               boolean wait,
                               enum pipe_query_value_type result_type,
                               int index,
                               struct pipe_resource *p_res,
                               unsigned offset)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_resource *res = (void *) p_res;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   unsigned snapshots_landed_offset =
      offsetof(struct iris_query_snapshots, snapshots_landed);

   res->bind_history |= PIPE_BIND_QUERY_BUFFER;

   if (index == -1) {
      /* They're asking for the availability of the result.  If we still
       * have commands queued up which produce the result, submit them
       * now so that progress happens.  Either way, copy the snapshots
       * landed field to the destination resource.
       */
      if (iris_batch_references(batch, bo))
         iris_batch_flush(batch);

      ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
                             bo, snapshots_landed_offset,
                             result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
      return;
   }

   if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
      /* The final snapshots happen to have landed, so let's just compute
       * the result on the CPU now...
       */
      calculate_result_on_cpu(devinfo, q);
   }

   if (q->ready) {
      /* We happen to have the result on the CPU, so just copy it. */
      if (result_type <= PIPE_QUERY_TYPE_U32) {
         ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      } else {
         ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      }

      /* Make sure the result lands before they bind the QBO elsewhere
       * and use the result.
       */
      // XXX: Why?  i965 doesn't do this.
      iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
      return;
   }

   /* Calculate the result to CS_GPR0 */
   calculate_result_on_gpu(ice, q);

   bool predicated = !wait && !q->stalled;

   if (predicated) {
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
                                    snapshots_landed_offset);
      uint32_t predicate = MI_PREDICATE |
                           MI_PREDICATE_LOADOP_LOADINV |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
      iris_batch_emit(batch, &predicate, sizeof(uint32_t));
   }

   if (result_type <= PIPE_QUERY_TYPE_U32) {
      ice->vtbl.store_register_mem32(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   } else {
      ice->vtbl.store_register_mem64(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   }
}

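/* In the predicated path above, MI_PREDICATE compares snapshots_landed
 * against zero, and the final CS_GPR0 store carries its predicate-enable
 * bit, so when the snapshots have not landed yet the write to the
 * destination buffer is skipped instead of storing a bogus value.
 */
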
static void
iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
{
   struct iris_context *ice = (void *) ctx;

   if (ice->state.statistics_counters_enabled == enable)
      return;

   // XXX: most packets aren't paying attention to this yet, because it'd
   // have to be done dynamically at draw time, which is a pain
   ice->state.statistics_counters_enabled = enable;
   ice->state.dirty |= IRIS_DIRTY_CLIP |
                       IRIS_DIRTY_GS |
                       IRIS_DIRTY_RASTER |
                       IRIS_DIRTY_STREAMOUT |
                       IRIS_DIRTY_TCS |
                       IRIS_DIRTY_TES |
                       IRIS_DIRTY_VS |
                       IRIS_DIRTY_WM;
}

static void
set_predicate_enable(struct iris_context *ice, bool value)
{
   if (value)
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
   else
      ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
}

static void
set_predicate_for_result(struct iris_context *ice,
                         struct iris_query *q,
                         bool inverted)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   /* The CPU doesn't have the query result yet; use hardware predication */
   ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;

   /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
   iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
   q->stalled = true;

   switch (q->type) {
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      overflow_result_to_gpr0(ice, q);

      ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      break;
   default:
      /* PIPE_QUERY_OCCLUSION_* */
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
                                    offsetof(struct iris_query_snapshots, start) +
                                    q->query_state_ref.offset);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
                                    offsetof(struct iris_query_snapshots, end) +
                                    q->query_state_ref.offset);
      break;
   }

   uint32_t mi_predicate = MI_PREDICATE |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
                           (inverted ? MI_PREDICATE_LOADOP_LOAD
                                     : MI_PREDICATE_LOADOP_LOADINV);
   iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));

   /* We immediately set the predicate on the render batch, as all the
    * counters come from 3D operations.  However, we may need to predicate
    * a compute dispatch, which executes in a different GEM context and has
    * a different MI_PREDICATE_RESULT register.  So, we save the result to
    * memory and reload it in iris_launch_grid.
    */
   unsigned offset = q->query_state_ref.offset +
                     offsetof(struct iris_query_snapshots, predicate_result);
   ice->vtbl.store_register_mem64(batch, MI_PREDICATE_RESULT,
                                  bo, offset, false);
   ice->state.compute_predicate = bo;
}

static void
iris_render_condition(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean condition,
                      enum pipe_render_cond_flag mode)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   /* The old condition isn't relevant; we'll update it if necessary */
   ice->state.compute_predicate = NULL;
   ice->condition.query = q;
   ice->condition.condition = condition;

   if (!q) {
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
      return;
   }

   iris_check_query_no_flush(ice, q);

   if (q->result || q->ready) {
      set_predicate_enable(ice, (q->result != 0) ^ condition);
   } else {
      if (mode == PIPE_RENDER_COND_NO_WAIT ||
          mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
         perf_debug(&ice->dbg, "Conditional rendering demoted from "
                    "\"no wait\" to \"wait\".");
      }
      set_predicate_for_result(ice, q, condition);
   }
}

void
iris_resolve_conditional_render(struct iris_context *ice)
{
   struct pipe_context *ctx = (void *) ice;
   struct iris_query *q = ice->condition.query;
   struct pipe_query *query = (void *) q;
   union pipe_query_result result;

   if (ice->state.predicate != IRIS_PREDICATE_STATE_USE_BIT)
      return;

   assert(q);

   iris_get_query_result(ctx, query, true, &result);
   set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);
}

void
iris_init_query_functions(struct pipe_context *ctx)
{
   ctx->create_query = iris_create_query;
   ctx->destroy_query = iris_destroy_query;
   ctx->begin_query = iris_begin_query;
   ctx->end_query = iris_end_query;
   ctx->get_query_result = iris_get_query_result;
   ctx->get_query_result_resource = iris_get_query_result_resource;
   ctx->set_active_query_state = iris_set_active_query_state;
   ctx->render_condition = iris_render_condition;
}