2 * Copyright © 2017 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
26 * Query object support. This allows measuring various simple statistics
27 * via counters on the GPU.
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "iris_context.h"
39 #include "iris_defines.h"
40 #include "iris_resource.h"
41 #include "iris_screen.h"
42 #include "vulkan/util/vk_util.h"
/* Pipeline-statistics counter register addresses (MMIO, from the PRMs). */
#define IA_VERTICES_COUNT         0x2310
#define IA_PRIMITIVES_COUNT       0x2318
#define VS_INVOCATION_COUNT       0x2320
#define HS_INVOCATION_COUNT       0x2300
#define DS_INVOCATION_COUNT       0x2308
#define GS_INVOCATION_COUNT       0x2328
#define GS_PRIMITIVES_COUNT       0x2330
#define CL_INVOCATION_COUNT       0x2338
#define CL_PRIMITIVES_COUNT       0x2340
#define PS_INVOCATION_COUNT       0x2348
#define CS_INVOCATION_COUNT       0x2290
#define PS_DEPTH_COUNT            0x2350

/* Per-stream streamout counters; each stream's register pair is 8 bytes
 * apart.
 */
#define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
#define SO_NUM_PRIMS_WRITTEN(n)   (0x5200 + (n) * 8)

/* MI_MATH command opcode (DWord 0); the DWord-length field is OR'd in by
 * the emitters below.
 */
#define MI_MATH (0x1a << 23)

/* MI_MATH ALU opcodes. */
#define MI_ALU_LOAD      0x080
#define MI_ALU_LOADINV   0x480
#define MI_ALU_LOAD0     0x081
#define MI_ALU_LOAD1     0x481
#define MI_ALU_ADD       0x100
#define MI_ALU_SUB       0x101
#define MI_ALU_AND       0x102
#define MI_ALU_OR        0x103
#define MI_ALU_XOR       0x104
#define MI_ALU_STORE     0x180
#define MI_ALU_STOREINV  0x580

/* MI_MATH ALU operands: GPRs, the two ALU sources, the accumulator, and
 * the zero/carry flags.
 */
#define MI_ALU_R0     0x00
#define MI_ALU_R1     0x01
#define MI_ALU_R2     0x02
#define MI_ALU_R3     0x03
#define MI_ALU_R4     0x04
#define MI_ALU_SRCA   0x20
#define MI_ALU_SRCB   0x21
#define MI_ALU_ACCU   0x31
#define MI_ALU_ZF     0x32
#define MI_ALU_CF     0x33

/* Pack one MI_MATH ALU instruction DWord: opcode | operand1 | operand2. */
#define _MI_ALU(op, x, y)  (((op) << 20) | ((x) << 10) | (y))

/* Raw variants: operands are passed through unchanged. */
#define _MI_ALU0(op)       _MI_ALU(MI_ALU_##op, 0, 0)
#define _MI_ALU1(op, x)    _MI_ALU(MI_ALU_##op, x, 0)
#define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)

/* Convenience variants: operands are short names (R0, SRCA, ACCU, ...). */
#define MI_ALU0(op)        _MI_ALU0(op)
#define MI_ALU1(op, x)     _MI_ALU1(op, MI_ALU_##x)
#define MI_ALU2(op, x, y)  _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)

/* Shorthands for the context vtable's register-load hooks; these expand
 * references to a local `ice` at the use site.
 */
#define emit_lri32 ice->vtbl.load_register_imm32
#define emit_lri64 ice->vtbl.load_register_imm64
#define emit_lrr32 ice->vtbl.load_register_reg32
/**
 * A query object.  NOTE(review): the extraction dropped most field lines;
 * fields other than `type` and `map` are reconstructed from the q->...
 * accesses visible elsewhere in this file — confirm against upstream.
 */
struct iris_query {
   enum pipe_query_type type;
   int index;                     /* stream / statistics index (q->index) */

   bool ready;                    /* result computed on the CPU? */
   bool stalled;                  /* have we emitted a stalling flush? */

   uint64_t result;               /* CPU-side accumulated result */

   struct iris_bo *bo;            /* buffer holding the snapshots */
   struct iris_query_snapshots *map; /* CPU mapping of bo */

   int batch_idx;                 /* IRIS_BATCH_RENDER or IRIS_BATCH_COMPUTE */
};
/**
 * The layout of q->bo for most query types: predication scratch, an
 * availability flag, and the start/end counter snapshots.
 */
struct iris_query_snapshots {
   /** iris_render_condition's saved MI_PREDICATE_DATA value. */
   uint64_t predicate_data;

   /** Have the start/end snapshots landed? */
   uint64_t snapshots_landed;

   /** Starting and ending counter snapshots */
   uint64_t start;
   uint64_t end;
};
/**
 * The layout of q->bo for streamout-overflow queries: begin/end snapshots
 * of both counters for each of the (up to 4) vertex streams.
 */
struct iris_query_so_overflow {
   uint64_t predicate_data;
   uint64_t snapshots_landed;

   struct {
      /* [0] = begin snapshot, [1] = end snapshot */
      uint64_t prim_storage_needed[2];
      uint64_t num_prims[2];
   } stream[4];
};
/**
 * Is this type of query written by PIPE_CONTROL?
 */
static bool
iris_is_query_pipelined(struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_TIME_ELAPSED:
      return true;

   default:
      return false;
   }
}
/**
 * Mark a query's snapshots as "landed" by writing the availability flag
 * in the query buffer.  Pipelined queries order the write after the
 * results via PIPE_CONTROL; others can use an immediate store.
 */
static void
mark_available(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
   unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);

   if (!iris_is_query_pipelined(q)) {
      ice->vtbl.store_data_imm64(batch, q->bo, offset, true);
   } else {
      /* Order available *after* the query results. */
      flags |= PIPE_CONTROL_FLUSH_ENABLE;
      iris_emit_pipe_control_write(batch, flags, q->bo, offset, true);
   }
}
175 * Write PS_DEPTH_COUNT to q->(dest) via a PIPE_CONTROL.
178 iris_pipelined_write(struct iris_batch
*batch
,
179 struct iris_query
*q
,
180 enum pipe_control_flags flags
,
183 const struct gen_device_info
*devinfo
= &batch
->screen
->devinfo
;
184 const unsigned optional_cs_stall
=
185 devinfo
->gen
== 9 && devinfo
->gt
== 4 ? PIPE_CONTROL_CS_STALL
: 0;
187 iris_emit_pipe_control_write(batch
, flags
| optional_cs_stall
,
188 q
->bo
, offset
, 0ull);
/**
 * Record the current value of the counter backing this query at the given
 * byte offset in q->bo (used for both the start and end snapshots).
 */
static void
write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;

   if (!iris_is_query_pipelined(q)) {
      /* Non-pipelined counters need the pipeline drained first. */
      iris_emit_pipe_control_flush(batch,
                                   PIPE_CONTROL_CS_STALL |
                                   PIPE_CONTROL_STALL_AT_SCOREBOARD);
      q->stalled = true;
   }

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      if (devinfo->gen >= 10) {
         /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
          *  bit set prior to programming a PIPE_CONTROL with Write PS Depth
          *  Count sync operation."
          */
         iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
      }
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_DEPTH_COUNT |
                           PIPE_CONTROL_DEPTH_STALL,
                           offset);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_TIMESTAMP,
                           offset);
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      /* Stream 0 counts clipper invocations; other streams use the SO
       * primitive-storage-needed counters.
       */
      ice->vtbl.store_register_mem64(batch,
                                     q->index == 0 ? CL_INVOCATION_COUNT :
                                     SO_PRIM_STORAGE_NEEDED(q->index),
                                     q->bo, offset, false);
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      ice->vtbl.store_register_mem64(batch,
                                     SO_NUM_PRIMS_WRITTEN(q->index),
                                     q->bo, offset, false);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS: {
      /* Order matches pipe_query_data_pipeline_statistics / q->index
       * (see the result-copy switch in iris_get_query_result).
       */
      static const uint32_t index_to_reg[] = {
         IA_VERTICES_COUNT,
         IA_PRIMITIVES_COUNT,
         VS_INVOCATION_COUNT,
         GS_INVOCATION_COUNT,
         GS_PRIMITIVES_COUNT,
         CL_INVOCATION_COUNT,
         CL_PRIMITIVES_COUNT,
         PS_INVOCATION_COUNT,
         HS_INVOCATION_COUNT,
         DS_INVOCATION_COUNT,
         CS_INVOCATION_COUNT,
      };
      const uint32_t reg = index_to_reg[q->index];

      ice->vtbl.store_register_mem64(batch, reg, q->bo, offset, false);
      break;
   }
   default:
      assert(false);
   }
}
/**
 * Snapshot the streamout counters for an overflow query.
 *
 * \param end  false for the begin snapshot ([0]), true for the end ([1]).
 */
static void
write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   /* Single-stream predicate looks at one stream; "any" looks at all 4. */
   uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;

   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
   for (uint32_t i = 0; i < count; i++) {
      int s = q->index + i;
      int g_idx = offsetof(struct iris_query_so_overflow,
                           stream[s].num_prims[end]);
      int w_idx = offsetof(struct iris_query_so_overflow,
                           stream[s].prim_storage_needed[end]);
      ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
                                     q->bo, g_idx, false);
      ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
                                     q->bo, w_idx, false);
   }
}
/**
 * Convert a raw GPU timestamp tick count to nanoseconds using the
 * device's timestamp frequency.
 */
static uint64_t
iris_timebase_scale(const struct gen_device_info *devinfo,
                    uint64_t gpu_timestamp)
{
   return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
}
/**
 * Difference between two raw timestamps, accounting for the counter
 * wrapping at TIMESTAMP_BITS.  NOTE(review): the wrap guard line was lost
 * in extraction; reconstructed as time0 > time1 — confirm against upstream.
 */
static uint64_t
iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
{
   if (time0 > time1) {
      /* Counter wrapped between the two snapshots. */
      return (1ULL << TIMESTAMP_BITS) + time1 - time0;
   } else {
      return time1 - time0;
   }
}
/**
 * Did stream `s` overflow?  It did iff the primitives-storage-needed
 * counter advanced by a different amount than primitives actually written.
 */
static bool
stream_overflowed(struct iris_query_so_overflow *so, int s)
{
   return (so->stream[s].prim_storage_needed[1] -
           so->stream[s].prim_storage_needed[0]) !=
          (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
}
/**
 * Compute q->result on the CPU from the landed snapshots and mark the
 * query ready.
 */
static void
calculate_result_on_cpu(const struct gen_device_info *devinfo,
                        struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      q->result = q->map->end != q->map->start;
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* The timestamp is the single starting snapshot. */
      q->result = iris_timebase_scale(devinfo, q->map->start);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
      q->result = iris_timebase_scale(devinfo, q->result);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      q->result = stream_overflowed((void *) q->map, q->index);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      q->result = false;
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
         q->result |= stream_overflowed((void *) q->map, i);
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   case PIPE_QUERY_PIPELINE_STATISTICS:
   default:
      /* Simple end-minus-start counter delta. */
      q->result = q->map->end - q->map->start;
      break;
   }

   q->ready = true;
}
/**
 * Emit an MI_MATH sequence computing dst_reg = reg_a + reg_b.
 */
static void
emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
             unsigned reg_a, unsigned reg_b)
{
   uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));

   math[0] = MI_MATH | (5 - 2);
   math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
   math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
   math[3] = _MI_ALU0(ADD);
   math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
}
/**
 * Emit an MI_MATH sequence computing dst_reg = src_reg << shift, built as
 * `shift` repeated self-additions (the ALU has no shifter).
 */
static void
emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
             unsigned src_reg, unsigned shift)
{
   assert(shift > 0);

   int dwords = 1 + 4 * shift;

   uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);

   math[0] = MI_MATH | ((1 + 4 * shift) - 2);
   for (unsigned i = 0; i < shift; i++) {
      /* First iteration doubles src_reg; later ones double dst_reg. */
      unsigned add_src = (i == 0) ? src_reg : dst_reg;
      math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
      math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
      math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
      math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
   }
}
/* Emit dwords to multiply GPR0 by N */
static void
build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
{
   VK_OUTARRAY_MAKE(out, dw, dw_count);

#define APPEND_ALU(op, x, y) \
   vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)

   assert(N > 0);
   unsigned top_bit = 31 - __builtin_clz(N);
   for (int i = top_bit - 1; i >= 0; i--) {
      /* We get our initial data in GPR0 and we write the final data out to
       * GPR0 but we use GPR1 as our scratch register.
       */
      unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
      unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;

      /* Shift the current value left by 1 */
      APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
      APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
      APPEND_ALU(ADD, 0, 0);

      /* NOTE(review): this bit-test guard was lost in extraction;
       * reconstructed from the shift-and-add structure — confirm upstream.
       */
      if (N & (1 << i)) {
         /* Store ACCU to R1 and add R0 to R1 */
         APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
         APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
         APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
         APPEND_ALU(ADD, 0, 0);
      }

      APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
   }

#undef APPEND_ALU
}
/**
 * Emit an MI_MATH command multiplying GPR0 by the constant N.  The ALU
 * program is sized with a dry-run (NULL destination) pass first.
 */
static void
emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
{
   uint32_t num_dwords;
   build_alu_multiply_gpr0(NULL, &num_dwords, N);

   uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
   math[0] = MI_MATH | (num_dwords - 2);
   build_alu_multiply_gpr0(&math[1], &num_dwords, N);
}
/**
 * Emit commands computing GPR0 = GPR0 / D (32-bit unsigned division by a
 * constant), using a fast-division multiplier/shift recipe since the MI
 * ALU has no divider.
 */
void
iris_math_div32_gpr0(struct iris_context *ice,
                     struct iris_batch *batch,
                     uint32_t D)
{
   /* Zero out the top of GPR0 */
   emit_lri32(batch, CS_GPR(0) + 4, 0);

   if (D == 0) {
      /* This invalid, but we should do something so we set GPR0 to 0. */
      emit_lri32(batch, CS_GPR(0), 0);
   } else if (util_is_power_of_two_or_zero(D)) {
      unsigned log2_D = util_logbase2(D);

      /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
       * the top 32 bits of the result.
       */
      emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);
   } else {
      struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
      assert(m.multiplier <= UINT32_MAX);

      if (m.pre_shift) {
         /* We right-shift by L by left-shifting by 32 - l and taking the top
          * 32 bits of the result.
          */
         if (m.pre_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }

      /* Do the 32x32 multiply into gpr0 */
      emit_mul_gpr0(batch, m.multiplier);

      if (m.increment) {
         /* If we need to increment, save off a copy of GPR0 */
         emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
         emit_lri32(batch, CS_GPR(1) + 4, 0);
         emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
      }

      /* Shift by 32 (take the high dword of the 64-bit product). */
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);

      if (m.post_shift) {
         /* We right-shift by L by left-shifting by 32 - l and taking the top
          * 32 bits of the result.
          */
         if (m.post_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }
   }
}
/*
 * GPR0 = (GPR0 == 0) ? 0 : 1;
 */
static void
gpr0_to_bool(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);

   static const uint32_t math[] = {
      MI_MATH | (9 - 2),
      /* Add GPR0 to 0 just to set the zero flag from its value. */
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU1(LOAD0, SRCB),
      MI_ALU0(ADD),
      /* R0 = all-ones if GPR0 was nonzero (inverted zero flag)... */
      MI_ALU2(STOREINV, R0, ZF),
      /* ...then mask it down to 0/1 with GPR1 == 1. */
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}
/**
 * Load one stream's overflow-query snapshots into CS GPRs 1-4:
 * R1/R2 = prim_storage_needed begin/end, R3/R4 = num_prims begin/end.
 */
static void
load_overflow_data_to_cs_gprs(struct iris_context *ice,
                              struct iris_query *q,
                              int idx)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[1]));

   ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[1]));
}
/**
 * Accumulate one stream's overflow test into GPR0, assuming GPRs 1-4 were
 * loaded by load_overflow_data_to_cs_gprs():
 *
 *    R3 = R4 - R3  (num_prims delta)
 *    R1 = R2 - R1  (prim_storage_needed delta)
 *    R1 = R3 - R1  (nonzero iff the deltas differ, i.e. overflow)
 *    R0 = R0 | R1
 *
 * NOTE(review): the SUB/OR rows were lost in extraction and reconstructed
 * from stream_overflowed()'s CPU equivalent — confirm against upstream.
 */
static void
calc_overflow_for_stream(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   static const uint32_t maths[] = {
      MI_MATH | (17 - 2),
      MI_ALU2(LOAD, SRCA, R4),
      MI_ALU2(LOAD, SRCB, R3),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R3, ACCU),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R3),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R1),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(OR),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, maths, sizeof(maths));
}
571 overflow_result_to_gpr0(struct iris_context
*ice
, struct iris_query
*q
)
573 struct iris_batch
*batch
= &ice
->batches
[IRIS_BATCH_RENDER
];
575 ice
->vtbl
.load_register_imm64(batch
, CS_GPR(0), 0ull);
577 if (q
->type
== PIPE_QUERY_SO_OVERFLOW_PREDICATE
) {
578 load_overflow_data_to_cs_gprs(ice
, q
, q
->index
);
579 calc_overflow_for_stream(ice
);
581 for (int i
= 0; i
< MAX_VERTEX_STREAMS
; i
++) {
582 load_overflow_data_to_cs_gprs(ice
, q
, i
);
583 calc_overflow_for_stream(ice
);
/**
 * Calculate the result and store it to CS_GPR0.
 */
static void
calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
      overflow_result_to_gpr0(ice, q);
      return;
   }

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
                                 offsetof(struct iris_query_snapshots, start));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
                                 offsetof(struct iris_query_snapshots, end));

   /* GPR0 = end - start */
   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));

   /* Predicates want a 0/1 value rather than a raw counter delta. */
   if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
      gpr0_to_bool(ice);
}
static struct pipe_query *
iris_create_query(struct pipe_context *ctx,
                  unsigned query_type,
                  unsigned index)
{
   struct iris_query *q = calloc(1, sizeof(struct iris_query));

   q->type = query_type;
   q->index = index;

   /* Statistics index 10 (cs_invocations) lives on the compute batch. */
   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS && q->index == 10)
      q->batch_idx = IRIS_BATCH_COMPUTE;
   else
      q->batch_idx = IRIS_BATCH_RENDER;
   return (struct pipe_query *) q;
}
static void
iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
{
   struct iris_query *query = (void *) p_query;
   iris_bo_unreference(query->bo);
   /* Without this free the query object itself would leak. */
   free(query);
}
static boolean
iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_screen *screen = (void *) ctx->screen;
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   /* Replace any buffer from a previous begin/end cycle. */
   iris_bo_unreference(q->bo);
   q->bo = iris_bo_alloc(screen->bufmgr, "query object", 4096,
                         IRIS_MEMZONE_OTHER);
   if (!q->bo)
      return false;

   q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_WRITE | MAP_ASYNC);
   if (!q->map)
      return false;

   q->result = 0ull;
   q->ready = false;
   q->map->snapshots_landed = false;

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = true;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, false);
   else
      write_value(ice, q, offsetof(struct iris_query_snapshots, start));

   return true;
}
static bool
iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      /* A timestamp query has no begin; take its single snapshot now. */
      iris_begin_query(ctx, query);
      mark_available(ice, q);
      return true;
   }

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = false;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, true);
   else
      write_value(ice, q, offsetof(struct iris_query_snapshots, end));
   mark_available(ice, q);

   return true;
}
712 * See if the snapshots have landed for a query, and if so, compute the
713 * result and mark it ready. Does not flush (unlike iris_get_query_result).
716 iris_check_query_no_flush(struct iris_context
*ice
, struct iris_query
*q
)
718 struct iris_screen
*screen
= (void *) ice
->ctx
.screen
;
719 const struct gen_device_info
*devinfo
= &screen
->devinfo
;
721 if (!q
->ready
&& q
->map
->snapshots_landed
) {
722 calculate_result_on_cpu(devinfo
, q
);
static boolean
iris_get_query_result(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean wait,
                      union pipe_query_result *result)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_screen *screen = (void *) ctx->screen;
   const struct gen_device_info *devinfo = &screen->devinfo;

   if (!q->ready) {
      /* Submit any batch still referencing the query buffer so the
       * snapshots can make progress.
       */
      if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
         iris_batch_flush(&ice->batches[q->batch_idx]);

      if (!q->map->snapshots_landed) {
         if (wait)
            iris_bo_wait_rendering(q->bo);
         else
            return false;
      }

      assert(q->map->snapshots_landed);
      calculate_result_on_cpu(devinfo, q);
   }

   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS) {
      /* q->index selects which statistics counter this query tracks. */
      switch (q->index) {
      case 0:
         result->pipeline_statistics.ia_vertices = q->result;
         break;
      case 1:
         result->pipeline_statistics.ia_primitives = q->result;
         break;
      case 2:
         result->pipeline_statistics.vs_invocations = q->result;
         break;
      case 3:
         result->pipeline_statistics.gs_invocations = q->result;
         break;
      case 4:
         result->pipeline_statistics.gs_primitives = q->result;
         break;
      case 5:
         result->pipeline_statistics.c_invocations = q->result;
         break;
      case 6:
         result->pipeline_statistics.c_primitives = q->result;
         break;
      case 7:
         result->pipeline_statistics.ps_invocations = q->result;
         break;
      case 8:
         result->pipeline_statistics.hs_invocations = q->result;
         break;
      case 9:
         result->pipeline_statistics.ds_invocations = q->result;
         break;
      case 10:
         result->pipeline_statistics.cs_invocations = q->result;
         break;
      }
   } else {
      result->u64 = q->result;
   }

   return true;
}
static void
iris_get_query_result_resource(struct pipe_context *ctx,
                               struct pipe_query *query,
                               boolean wait,
                               enum pipe_query_value_type result_type,
                               int index,
                               struct pipe_resource *p_res,
                               unsigned offset)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_resource *res = (void *) p_res;
   unsigned snapshots_landed_offset =
      offsetof(struct iris_query_snapshots, snapshots_landed);

   res->bind_history |= PIPE_BIND_QUERY_BUFFER;

   if (index == -1) {
      /* They're asking for the availability of the result.  If we still
       * have commands queued up which produce the result, submit them
       * now so that progress happens.  Either way, copy the snapshots
       * landed field to the destination resource.
       */
      if (iris_batch_references(batch, q->bo))
         iris_batch_flush(batch);

      ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
                             q->bo, snapshots_landed_offset,
                             result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
      return;
   }

   if (!q->ready && q->map->snapshots_landed) {
      /* The final snapshots happen to have landed, so let's just compute
       * the result on the CPU now...
       */
      calculate_result_on_cpu(devinfo, q);
   }

   if (q->ready) {
      /* We happen to have the result on the CPU, so just copy it. */
      if (result_type <= PIPE_QUERY_TYPE_U32) {
         ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      } else {
         ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      }

      /* Make sure the result lands before they use bind the QBO elsewhere
       * and use the result.
       */
      // XXX: Why?  i965 doesn't do this.
      iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
      return;
   }

   /* Calculate the result to CS_GPR0 */
   calculate_result_on_gpu(ice, q);

   bool predicated = !wait && !q->stalled;

   if (predicated) {
      /* Only store the result if the snapshots have actually landed. */
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
                                    snapshots_landed_offset);
      uint32_t predicate = MI_PREDICATE |
                           MI_PREDICATE_LOADOP_LOADINV |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
      iris_batch_emit(batch, &predicate, sizeof(uint32_t));
   }

   if (result_type <= PIPE_QUERY_TYPE_U32) {
      ice->vtbl.store_register_mem32(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   } else {
      ice->vtbl.store_register_mem64(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   }
}
884 iris_set_active_query_state(struct pipe_context
*ctx
, boolean enable
)
886 struct iris_context
*ice
= (void *) ctx
;
888 if (ice
->state
.statistics_counters_enabled
== enable
)
891 // XXX: most packets aren't paying attention to this yet, because it'd
892 // have to be done dynamically at draw time, which is a pain
893 ice
->state
.statistics_counters_enabled
= enable
;
894 ice
->state
.dirty
|= IRIS_DIRTY_CLIP
|
897 IRIS_DIRTY_STREAMOUT
|
/**
 * Set the CPU-known predicate state: render when `value`, skip otherwise.
 */
static void
set_predicate_enable(struct iris_context *ice, bool value)
{
   if (value)
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
   else
      ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
}
/**
 * Program MI_PREDICATE from the query's (not yet CPU-visible) result.
 */
static void
set_predicate_for_result(struct iris_context *ice,
                         struct iris_query *q,
                         bool inverted)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   /* The CPU doesn't have the query result yet; use hardware predication */
   ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;

   /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
   iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
   q->stalled = true;

   switch (q->type) {
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      overflow_result_to_gpr0(ice, q);

      /* Compare the 0/1 result in GPR0 against zero. */
      ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      break;
   default:
      /* PIPE_QUERY_OCCLUSION_* */
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
         offsetof(struct iris_query_snapshots, start));
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
         offsetof(struct iris_query_snapshots, end));
      break;
   }

   uint32_t mi_predicate = MI_PREDICATE |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
                           (inverted ? MI_PREDICATE_LOADOP_LOAD
                                     : MI_PREDICATE_LOADOP_LOADINV);
   iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));

   /* We immediately set the predicate on the render batch, as all the
    * counters come from 3D operations.  However, we may need to predicate
    * a compute dispatch, which executes in a different GEM context and has
    * a different MI_PREDICATE_DATA register.  So, we save the result to
    * memory and reload it in iris_launch_grid.
    */
   unsigned offset = offsetof(struct iris_query_snapshots, predicate_data);
   ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
                                  q->bo, offset, false);
   ice->state.compute_predicate = q->bo;
}
964 iris_render_condition(struct pipe_context
*ctx
,
965 struct pipe_query
*query
,
967 enum pipe_render_cond_flag mode
)
969 struct iris_context
*ice
= (void *) ctx
;
970 struct iris_query
*q
= (void *) query
;
973 ice
->state
.predicate
= IRIS_PREDICATE_STATE_RENDER
;
977 iris_check_query_no_flush(ice
, q
);
979 if (q
->result
|| q
->ready
) {
980 set_predicate_enable(ice
, (q
->result
!= 0) ^ condition
);
982 if (mode
== PIPE_RENDER_COND_NO_WAIT
||
983 mode
== PIPE_RENDER_COND_BY_REGION_NO_WAIT
) {
984 perf_debug(&ice
->dbg
, "Conditional rendering demoted from "
985 "\"no wait\" to \"wait\".");
987 set_predicate_for_result(ice
, q
, condition
);
992 iris_init_query_functions(struct pipe_context
*ctx
)
994 ctx
->create_query
= iris_create_query
;
995 ctx
->destroy_query
= iris_destroy_query
;
996 ctx
->begin_query
= iris_begin_query
;
997 ctx
->end_query
= iris_end_query
;
998 ctx
->get_query_result
= iris_get_query_result
;
999 ctx
->get_query_result_resource
= iris_get_query_result_resource
;
1000 ctx
->set_active_query_state
= iris_set_active_query_state
;
1001 ctx
->render_condition
= iris_render_condition
;