/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
/**
 * @file iris_query.c
 *
 * Query object support.  This allows measuring various simple statistics
 * via counters on the GPU.
 */
#include <stdlib.h>
#include <string.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "util/fast_idiv_by_const.h"
#include "util/u_inlines.h"
#include "util/u_upload_mgr.h"
#include "iris_context.h"
#include "iris_defines.h"
#include "iris_fence.h"
#include "iris_resource.h"
#include "iris_screen.h"
#include "vulkan/util/vk_util.h"
#define IA_VERTICES_COUNT          0x2310
#define IA_PRIMITIVES_COUNT        0x2318
#define VS_INVOCATION_COUNT        0x2320
#define HS_INVOCATION_COUNT        0x2300
#define DS_INVOCATION_COUNT        0x2308
#define GS_INVOCATION_COUNT        0x2328
#define GS_PRIMITIVES_COUNT        0x2330
#define CL_INVOCATION_COUNT        0x2338
#define CL_PRIMITIVES_COUNT        0x2340
#define PS_INVOCATION_COUNT        0x2348
#define CS_INVOCATION_COUNT        0x2290
#define PS_DEPTH_COUNT             0x2350

#define SO_PRIM_STORAGE_NEEDED(n)  (0x5240 + (n) * 8)

#define SO_NUM_PRIMS_WRITTEN(n)    (0x5200 + (n) * 8)

#define MI_MATH (0x1a << 23)

#define MI_ALU_LOAD      0x080
#define MI_ALU_LOADINV   0x480
#define MI_ALU_LOAD0     0x081
#define MI_ALU_LOAD1     0x481
#define MI_ALU_ADD       0x100
#define MI_ALU_SUB       0x101
#define MI_ALU_AND       0x102
#define MI_ALU_OR        0x103
#define MI_ALU_XOR       0x104
#define MI_ALU_STORE     0x180
#define MI_ALU_STOREINV  0x580

#define MI_ALU_R0        0x00
#define MI_ALU_R1        0x01
#define MI_ALU_R2        0x02
#define MI_ALU_R3        0x03
#define MI_ALU_R4        0x04
#define MI_ALU_SRCA      0x20
#define MI_ALU_SRCB      0x21
#define MI_ALU_ACCU      0x31
#define MI_ALU_ZF        0x32
#define MI_ALU_CF        0x33

#define _MI_ALU(op, x, y)  (((op) << 20) | ((x) << 10) | (y))

#define _MI_ALU0(op)       _MI_ALU(MI_ALU_##op, 0, 0)
#define _MI_ALU1(op, x)    _MI_ALU(MI_ALU_##op, x, 0)
#define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)

#define MI_ALU0(op)        _MI_ALU0(op)
#define MI_ALU1(op, x)     _MI_ALU1(op, MI_ALU_##x)
#define MI_ALU2(op, x, y)  _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
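/* An MI_MATH ALU instruction DWord packs the opcode in bits 31:20 and two
 * operand encodings in bits 19:10 and 9:0.  For example,
 * _MI_ALU(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_R0) is
 * (0x080 << 20) | (0x20 << 10) | 0x00 == 0x08008000, which loads GPR0 into
 * the ALU's SRCA operand.  The MI_ALU1/MI_ALU2 wrappers just add the
 * MI_ALU_ prefix to their operand names.
 */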
#define emit_lri32 ice->vtbl.load_register_imm32
#define emit_lri64 ice->vtbl.load_register_imm64
#define emit_lrr32 ice->vtbl.load_register_reg32
struct iris_query {
   enum pipe_query_type type;
   int index;

   bool ready;

   bool stalled;

   uint64_t result;

   struct iris_state_ref query_state_ref;
   struct iris_query_snapshots *map;
   struct iris_syncpt *syncpt;

   int batch_idx;
};
struct iris_query_snapshots {
   /** iris_render_condition's saved MI_PREDICATE_RESULT value. */
   uint64_t predicate_result;

   /** Have the start/end snapshots landed? */
   uint64_t snapshots_landed;

   /** Starting and ending counter snapshots */
   uint64_t start;
   uint64_t end;
};
struct iris_query_so_overflow {
   uint64_t predicate_result;
   uint64_t snapshots_landed;

   struct {
      uint64_t prim_storage_needed[2];
      uint64_t num_prims[2];
   } stream[4];
};
/**
 * Is this type of query written by PIPE_CONTROL?
 */
static bool
iris_is_query_pipelined(struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_TIME_ELAPSED:
      return true;

   default:
      return false;
   }
}
static void
mark_available(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
   unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   offset += q->query_state_ref.offset;

   if (!iris_is_query_pipelined(q)) {
      ice->vtbl.store_data_imm64(batch, bo, offset, true);
   } else {
      /* Order available *after* the query results. */
      flags |= PIPE_CONTROL_FLUSH_ENABLE;
      iris_emit_pipe_control_write(batch, flags, bo, offset, true);
   }
}
/**
 * Write PS_DEPTH_COUNT to q->(dest) via a PIPE_CONTROL.
 */
static void
iris_pipelined_write(struct iris_batch *batch,
                     struct iris_query *q,
                     enum pipe_control_flags flags,
                     unsigned offset)
{
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   const unsigned optional_cs_stall =
      devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
                                bo, offset, 0ull);
}
static void
write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   if (!iris_is_query_pipelined(q)) {
      iris_emit_pipe_control_flush(batch,
                                   PIPE_CONTROL_CS_STALL |
                                   PIPE_CONTROL_STALL_AT_SCOREBOARD);
      q->stalled = true;
   }

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      if (devinfo->gen >= 10) {
         /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
          *  bit set prior to programming a PIPE_CONTROL with Write PS Depth
          *  Count sync operation."
          */
         iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
      }
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_DEPTH_COUNT |
                           PIPE_CONTROL_DEPTH_STALL,
                           offset);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_TIMESTAMP,
                           offset);
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      ice->vtbl.store_register_mem64(batch,
                                     q->index == 0 ? CL_INVOCATION_COUNT :
                                     SO_PRIM_STORAGE_NEEDED(q->index),
                                     bo, offset, false);
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      ice->vtbl.store_register_mem64(batch,
                                     SO_NUM_PRIMS_WRITTEN(q->index),
                                     bo, offset, false);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
      static const uint32_t index_to_reg[] = {
         IA_VERTICES_COUNT,
         IA_PRIMITIVES_COUNT,
         VS_INVOCATION_COUNT,
         GS_INVOCATION_COUNT,
         GS_PRIMITIVES_COUNT,
         CL_INVOCATION_COUNT,
         CL_PRIMITIVES_COUNT,
         PS_INVOCATION_COUNT,
         HS_INVOCATION_COUNT,
         DS_INVOCATION_COUNT,
         CS_INVOCATION_COUNT,
      };
      const uint32_t reg = index_to_reg[q->index];

      ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
      break;
   }
   default:
      assert(false);
   }
}
static void
write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   uint32_t offset = q->query_state_ref.offset;

   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
   for (uint32_t i = 0; i < count; i++) {
      int s = q->index + i;
      int g_idx = offset + offsetof(struct iris_query_so_overflow,
                                    stream[s].num_prims[end]);
      int w_idx = offset + offsetof(struct iris_query_so_overflow,
                                    stream[s].prim_storage_needed[end]);
      ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
                                     bo, g_idx, false);
      ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
                                     bo, w_idx, false);
   }
}
static uint64_t
iris_timebase_scale(const struct gen_device_info *devinfo,
                    uint64_t gpu_timestamp)
{
   return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
}
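/* Example: on a part with a 12 MHz timestamp timer (timestamp_frequency ==
 * 12000000), one tick is 1000000000 / 12000000 ≈ 83.3 ns, so a raw delta of
 * 120 ticks scales to (1000000000ull * 120) / 12000000 == 10000 ns.
 */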
static int64_t
iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
{
   if (time0 > time1) {
      return (1ULL << TIMESTAMP_BITS) + time1 - time0;
   } else {
      return time1 - time0;
   }
}
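/* The raw counter is only TIMESTAMP_BITS wide, so time1 < time0 means it
 * wrapped; assuming the query spanned less than one full period, adding
 * 1 << TIMESTAMP_BITS recovers the true delta.  This matches the 36-bit
 * mask used on the GPU path (keep_gpr0_lower_n_bits(ice, 36)).
 */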
static bool
stream_overflowed(struct iris_query_so_overflow *so, int s)
{
   return (so->stream[s].prim_storage_needed[1] -
           so->stream[s].prim_storage_needed[0]) !=
          (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
}
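/* The two counters advance in lockstep while there is room in the SO
 * buffer: a primitive that needed storage but was not written indicates
 * overflow, so differing deltas over the begin/end window mean the stream
 * overflowed at some point during the query.
 */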
static void
calculate_result_on_cpu(const struct gen_device_info *devinfo,
                        struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      q->result = q->map->end != q->map->start;
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* The timestamp is the single starting snapshot. */
      q->result = iris_timebase_scale(devinfo, q->map->start);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
      q->result = iris_timebase_scale(devinfo, q->result);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      q->result = stream_overflowed((void *) q->map, q->index);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      q->result = false;
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
         q->result |= stream_overflowed((void *) q->map, i);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
      q->result = q->map->end - q->map->start;

      /* WaDividePSInvocationCountBy4:HSW,BDW */
      if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
         q->result /= 4;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   default:
      q->result = q->map->end - q->map->start;
      break;
   }

   q->ready = true;
}
static void
emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
             unsigned reg_a, unsigned reg_b)
{
   uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));

   math[0] = MI_MATH | (5 - 2);
   math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
   math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
   math[3] = _MI_ALU0(ADD);
   math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
}
static void
emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
             unsigned src_reg, unsigned shift)
{
   assert(shift > 0);

   int dwords = 1 + 4 * shift;

   uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);

   math[0] = MI_MATH | ((1 + 4 * shift) - 2);

   for (unsigned i = 0; i < shift; i++) {
      unsigned add_src = (i == 0) ? src_reg : dst_reg;
      math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
      math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
      math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
      math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
   }
}
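/* The ALU has no shifter, so a left shift is open-coded as repeated
 * doubling: emit_alu_shl(batch, R, R, 3) emits x+x = 2x, 2x+2x = 4x,
 * 4x+4x = 8x -- one ADD round per bit of shift.
 */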
/* Emit dwords to multiply GPR0 by N */
static void
build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
{
   VK_OUTARRAY_MAKE(out, dw, dw_count);

#define APPEND_ALU(op, x, y) \
   vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)

   assert(N > 0);
   unsigned top_bit = 31 - __builtin_clz(N);
   for (int i = top_bit - 1; i >= 0; i--) {
      /* We get our initial data in GPR0 and we write the final data out to
       * GPR0 but we use GPR1 as our scratch register.
       */
      unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
      unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;

      /* Shift the current value left by 1 */
      APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
      APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
      APPEND_ALU(ADD, 0, 0);

      if (N & (1 << i)) {
         /* Store ACCU to R1 and add R0 to R1 */
         APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
         APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
         APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
         APPEND_ALU(ADD, 0, 0);
      }

      APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
   }

#undef APPEND_ALU
}
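/* Worked example for N = 5 (0b101): top_bit == 2, so the loop runs i = 1, 0.
 * i == 1: double R0 into R1 (R1 = 2x); bit 1 of N is clear, so no add.
 * i == 0: double R1 (ACCU = 4x); bit 0 is set, so R1 = 4x and
 * ACCU = R0 + R1 = 5x, which is stored to R0.  This is plain binary
 * shift-and-add multiplication.
 */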
static void
emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
{
   uint32_t num_dwords;
   build_alu_multiply_gpr0(NULL, &num_dwords, N);

   uint32_t *math = iris_get_command_space(batch, 4 * (num_dwords + 1));
   math[0] = MI_MATH | ((num_dwords + 1) - 2);
   build_alu_multiply_gpr0(&math[1], &num_dwords, N);
}
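/* build_alu_multiply_gpr0() is called twice on purpose: the first call
 * passes dw == NULL so VK_OUTARRAY_MAKE only counts the ALU dwords, which
 * lets us size the command space (one extra dword for the MI_MATH header);
 * the second call fills in the reserved space after that header.
 */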
void
iris_math_div32_gpr0(struct iris_context *ice,
                     struct iris_batch *batch,
                     uint32_t D)
{
   /* Zero out the top of GPR0 */
   emit_lri32(batch, CS_GPR(0) + 4, 0);

   if (D == 0) {
      /* This is invalid, but we should do something, so we set GPR0 to 0. */
      emit_lri32(batch, CS_GPR(0), 0);
   } else if (util_is_power_of_two_or_zero(D)) {
      unsigned log2_D = util_logbase2(D);

      /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and
       * taking the top 32 bits of the result.
       */
      emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);
   } else {
      struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
      assert(m.multiplier <= UINT32_MAX);

      if (m.pre_shift) {
         /* We right-shift by the pre-shift count by left-shifting by
          * 32 - pre_shift and taking the top 32 bits of the result.
          */
         if (m.pre_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }

      /* Do the 32x32 multiply into gpr0 */
      emit_mul_gpr0(batch, m.multiplier);

      if (m.increment) {
         /* The increment adds 1 to n before the multiply; adding the
          * multiplier afterwards is equivalent (m * (n + 1) == m * n + m).
          */
         emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
         emit_lri32(batch, CS_GPR(1) + 4, 0);
         emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
      }

      /* Shift by 32 by moving the top half of GPR0 into the bottom half. */
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);

      if (m.post_shift) {
         /* We right-shift by the post-shift count by left-shifting by
          * 32 - post_shift and taking the top 32 bits of the result.
          */
         if (m.post_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }
   }
}
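/* Worked example: for D == 3, the round-up method gives multiplier
 * 0xAAAAAAAB (== ceil(2^33 / 3)) with pre_shift == 0 and post_shift == 1,
 * so GPR0 ends up as ((n * 0xAAAAAAAB) >> 32) >> 1 == n / 3 for any
 * 32-bit n.
 */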
/*
 * GPR0 = (GPR0 == 0) ? 0 : 1;
 */
static void
gpr0_to_bool(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);

   static const uint32_t math[] = {
      MI_MATH | (9 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU1(LOAD0, SRCB),
      MI_ALU0(ADD),
      MI_ALU2(STOREINV, R0, ZF),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}
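/* The ADD of GPR0 and the immediate zero exists purely to update the ALU
 * flags: STOREINV then writes the inverted zero flag to R0 (nonzero iff
 * GPR0 was nonzero), and the final AND with R1 == 1 masks that down to a
 * clean 0 or 1.
 */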
static void
load_overflow_data_to_cs_gprs(struct iris_context *ice,
                              struct iris_query *q,
                              int idx)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   uint32_t offset = q->query_state_ref.offset;

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[1]));

   ice->vtbl.load_register_mem64(batch, CS_GPR(3), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(4), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[1]));
}
/*
 * Given the GPR values loaded by load_overflow_data_to_cs_gprs():
 *
 *    GPR0 |= (GPR4 - GPR3) - (GPR2 - GPR1)
 *
 * which is nonzero iff the stream overflowed during the query.
 */
static void
calc_overflow_for_stream(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   static const uint32_t maths[] = {
      MI_MATH | (17 - 2),
      MI_ALU2(LOAD, SRCA, R4),
      MI_ALU2(LOAD, SRCB, R3),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R3, ACCU),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R3),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R1),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(OR),
      MI_ALU2(STORE, R0, ACCU),
   };

   iris_batch_emit(batch, maths, sizeof(maths));
}
static void
overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
      load_overflow_data_to_cs_gprs(ice, q, q->index);
      calc_overflow_for_stream(ice);
   } else {
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
         load_overflow_data_to_cs_gprs(ice, q, i);
         calc_overflow_for_stream(ice);
      }
   }

   gpr0_to_bool(ice);
}
/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}
/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(ice, 34);

   static const uint32_t shl_math[] = {
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(ADD),
      MI_ALU2(STORE, R0, ACCU),
   };

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
   const uint32_t batch_len = cmd_len * outer_count;
   uint32_t *map = iris_get_command_space(batch, batch_len * 4);
   uint32_t offset = 0;
   for (int o = 0; o < outer_count; o++) {
      map[offset++] = MI_MATH | (cmd_len - 2);
      for (int i = 0; i < inner_count; i++) {
         memcpy(&map[offset], shl_math, sizeof(shl_math));
         offset += 4;
      }
   }
}
/*
 * GPR0 = GPR0 >> 2
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   shl_gpr0_by_30_bits(ice);
   ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
}
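/* Without a native shifter, GPR0 >> 2 is done as GPR0 << 30 followed by
 * taking the upper 32 bits: (x << 30) >> 32 == x >> 2.  Masking to 34 bits
 * up front keeps the << 30 from overflowing 64 bits (34 + 30 == 64).
 */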
/**
 * Calculate the result and store it to CS_GPR0.
 */
static void
calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   uint32_t offset = q->query_state_ref.offset;

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
      overflow_result_to_gpr0(ice, q);
      return;
   }

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      ice->vtbl.load_register_mem64(batch, CS_GPR(0), bo,
                                    offset +
                                    offsetof(struct iris_query_snapshots, start));
      /* TODO: This discards any fractional bits of the timebase scale.
       * We would need to do a bit of fixed point math on the CS ALU, or
       * launch an actual shader to calculate this with full precision.
       */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
      keep_gpr0_lower_n_bits(ice, 36);
      return;
   }

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo,
                                 offset +
                                 offsetof(struct iris_query_snapshots, start));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo,
                                 offset +
                                 offsetof(struct iris_query_snapshots, end));

   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));

   /* WaDividePSInvocationCountBy4:HSW,BDW */
   if (devinfo->gen == 8 &&
       q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
       q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
      shr_gpr0_by_2_bits(ice);

   if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
      gpr0_to_bool(ice);

   if (q->type == PIPE_QUERY_TIME_ELAPSED) {
      /* TODO: This discards fractional bits (see above). */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
   }
}
static struct pipe_query *
iris_create_query(struct pipe_context *ctx,
                  unsigned query_type,
                  unsigned index)
{
   struct iris_query *q = calloc(1, sizeof(struct iris_query));

   q->type = query_type;
   q->index = index;

   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
       q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
      q->batch_idx = IRIS_BATCH_COMPUTE;
   else
      q->batch_idx = IRIS_BATCH_RENDER;
   return (struct pipe_query *) q;
}
static void
iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
{
   struct iris_query *query = (void *) p_query;
   struct iris_screen *screen = (void *) ctx->screen;
   iris_syncpt_reference(screen, &query->syncpt, NULL);
   free(query);
}
static bool
iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   void *ptr = NULL;
   uint32_t size;

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      size = sizeof(struct iris_query_so_overflow);
   else
      size = sizeof(struct iris_query_snapshots);

   u_upload_alloc(ice->query_buffer_uploader, 0,
                  size, size, &q->query_state_ref.offset,
                  &q->query_state_ref.res, &ptr);

   if (!iris_resource_bo(q->query_state_ref.res))
      return false;

   q->map = ptr;
   if (!q->map)
      return false;

   q->result = 0ull;
   q->ready = false;
   WRITE_ONCE(q->map->snapshots_landed, false);

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = true;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, false);
   else
      write_value(ice, q,
                  q->query_state_ref.offset +
                  offsetof(struct iris_query_snapshots, start));

   return true;
}
static bool
iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      iris_begin_query(ctx, query);
      iris_batch_reference_signal_syncpt(batch, &q->syncpt);
      mark_available(ice, q);
      return true;
   }

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = false;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, true);
   else
      write_value(ice, q,
                  q->query_state_ref.offset +
                  offsetof(struct iris_query_snapshots, end));

   iris_batch_reference_signal_syncpt(batch, &q->syncpt);
   mark_available(ice, q);

   return true;
}
/**
 * See if the snapshots have landed for a query, and if so, compute the
 * result and mark it ready.  Does not flush (unlike iris_get_query_result).
 */
static void
iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
{
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &screen->devinfo;

   if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
      calculate_result_on_cpu(devinfo, q);
   }
}
static bool
iris_get_query_result(struct pipe_context *ctx,
                      struct pipe_query *query,
                      bool wait,
                      union pipe_query_result *result)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_screen *screen = (void *) ctx->screen;
   const struct gen_device_info *devinfo = &screen->devinfo;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   if (!q->ready) {
      if (iris_batch_references(&ice->batches[q->batch_idx], bo))
         iris_batch_flush(&ice->batches[q->batch_idx]);

      while (!READ_ONCE(q->map->snapshots_landed)) {
         if (wait)
            iris_wait_syncpt(ctx->screen, q->syncpt, INT64_MAX);
         else
            return false;
      }

      assert(READ_ONCE(q->map->snapshots_landed));
      calculate_result_on_cpu(devinfo, q);
   }

   result->u64 = q->result;

   return true;
}
static void
iris_get_query_result_resource(struct pipe_context *ctx,
                               struct pipe_query *query,
                               bool wait,
                               enum pipe_query_value_type result_type,
                               int index,
                               struct pipe_resource *p_res,
                               unsigned offset)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_resource *res = (void *) p_res;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   unsigned snapshots_landed_offset =
      offsetof(struct iris_query_snapshots, snapshots_landed);

   res->bind_history |= PIPE_BIND_QUERY_BUFFER;

   if (index == -1) {
      /* They're asking for the availability of the result.  If we still
       * have commands queued up which produce the result, submit them
       * now so that progress happens.  Either way, copy the snapshots
       * landed field to the destination resource.
       */
      if (iris_batch_references(batch, bo))
         iris_batch_flush(batch);

      ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
                             bo, snapshots_landed_offset,
                             result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
      return;
   }

   if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
      /* The final snapshots happen to have landed, so let's just compute
       * the result on the CPU now...
       */
      calculate_result_on_cpu(devinfo, q);
   }

   if (q->ready) {
      /* We happen to have the result on the CPU, so just copy it. */
      if (result_type <= PIPE_QUERY_TYPE_U32) {
         ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      } else {
         ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      }

      /* Make sure the result lands before they bind the QBO elsewhere
       * and use the result.
       */
      // XXX: Why?  i965 doesn't do this.
      iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
      return;
   }

   /* Calculate the result to CS_GPR0 */
   calculate_result_on_gpu(ice, q);

   bool predicated = !wait && !q->stalled;

   if (predicated) {
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
                                    snapshots_landed_offset);
      uint32_t predicate = MI_PREDICATE |
                           MI_PREDICATE_LOADOP_LOADINV |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
      iris_batch_emit(batch, &predicate, sizeof(uint32_t));
   }

   if (result_type <= PIPE_QUERY_TYPE_U32) {
      ice->vtbl.store_register_mem32(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   } else {
      ice->vtbl.store_register_mem64(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   }
}
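/* In the predicated path above, MI_PREDICATE loads snapshots_landed into
 * SRC0 and compares it against SRC1 == 0; LOADOP_LOADINV inverts the
 * comparison, so the predicate is set only once the snapshots have landed,
 * and the predicated store then writes a valid result to the QBO.
 */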
static void
iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
{
   struct iris_context *ice = (void *) ctx;

   if (ice->state.statistics_counters_enabled == enable)
      return;

   // XXX: most packets aren't paying attention to this yet, because it'd
   //      have to be done dynamically at draw time, which is a pain
   ice->state.statistics_counters_enabled = enable;
   ice->state.dirty |= IRIS_DIRTY_CLIP |
                       IRIS_DIRTY_GS |
                       IRIS_DIRTY_RASTER |
                       IRIS_DIRTY_STREAMOUT |
                       IRIS_DIRTY_TCS |
                       IRIS_DIRTY_TES |
                       IRIS_DIRTY_VS |
                       IRIS_DIRTY_WM;
}
static void
set_predicate_enable(struct iris_context *ice, bool value)
{
   if (value)
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
   else
      ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
}
static void
set_predicate_for_result(struct iris_context *ice,
                         struct iris_query *q,
                         bool inverted)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   /* The CPU doesn't have the query result yet; use hardware predication */
   ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;

   /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
   iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
   q->stalled = true;

   switch (q->type) {
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      overflow_result_to_gpr0(ice, q);

      ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      break;
   default:
      /* PIPE_QUERY_OCCLUSION_* */
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
         offsetof(struct iris_query_snapshots, start) +
         q->query_state_ref.offset);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
         offsetof(struct iris_query_snapshots, end) +
         q->query_state_ref.offset);
      break;
   }

   uint32_t mi_predicate = MI_PREDICATE |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
                           (inverted ? MI_PREDICATE_LOADOP_LOAD
                                     : MI_PREDICATE_LOADOP_LOADINV);
   iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));

   /* We immediately set the predicate on the render batch, as all the
    * counters come from 3D operations.  However, we may need to predicate
    * a compute dispatch, which executes in a different GEM context and has
    * a different MI_PREDICATE_RESULT register.  So, we save the result to
    * memory and reload it in iris_launch_grid.
    */
   unsigned offset = q->query_state_ref.offset +
                     offsetof(struct iris_query_snapshots, predicate_result);
   ice->vtbl.store_register_mem64(batch, MI_PREDICATE_RESULT,
                                  bo, offset, false);
   ice->state.compute_predicate = bo;
}
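/* For the occlusion cases above, SRC0/SRC1 hold the start/end depth-count
 * snapshots, so SRCS_EQUAL tests "no samples passed"; LOADOP_LOADINV flips
 * that into "samples passed" for normal (non-inverted) render conditions.
 */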
static void
iris_render_condition(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean condition,
                      enum pipe_render_cond_flag mode)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   /* The old condition isn't relevant; we'll update it if necessary */
   ice->state.compute_predicate = NULL;
   ice->condition.query = q;
   ice->condition.condition = condition;

   if (!q) {
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
      return;
   }

   iris_check_query_no_flush(ice, q);

   if (q->result || q->ready) {
      set_predicate_enable(ice, (q->result != 0) ^ condition);
   } else {
      if (mode == PIPE_RENDER_COND_NO_WAIT ||
          mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
         perf_debug(&ice->dbg, "Conditional rendering demoted from "
                    "\"no wait\" to \"wait\".");
      }
      set_predicate_for_result(ice, q, condition);
   }
}
void
iris_resolve_conditional_render(struct iris_context *ice)
{
   struct pipe_context *ctx = (void *) ice;
   struct iris_query *q = ice->condition.query;
   struct pipe_query *query = (void *) q;
   union pipe_query_result result;

   if (ice->state.predicate != IRIS_PREDICATE_STATE_USE_BIT)
      return;

   assert(q);

   iris_get_query_result(ctx, query, true, &result);
   set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);
}
void
iris_init_query_functions(struct pipe_context *ctx)
{
   ctx->create_query = iris_create_query;
   ctx->destroy_query = iris_destroy_query;
   ctx->begin_query = iris_begin_query;
   ctx->end_query = iris_end_query;
   ctx->get_query_result = iris_get_query_result;
   ctx->get_query_result_resource = iris_get_query_result_resource;
   ctx->set_active_query_state = iris_set_active_query_state;
   ctx->render_condition = iris_render_condition;
}