2 * Copyright 2015 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 #include "si_build_pm4.h"
27 #include "util/u_memory.h"
/* Bit flags describing how a hardware perf-counter block is exposed to
 * the driver-query interface.
 * NOTE(review): this extract is missing lines (the embedded source line
 * numbers are non-contiguous; the enum's braces are not visible) --
 * confirm structure against the original file. */
29 enum si_pc_block_flags
31 /* This block is part of the shader engine */
32 SI_PC_BLOCK_SE
= (1 << 0),
34 /* Expose per-instance groups instead of summing all instances (within
36 SI_PC_BLOCK_INSTANCE_GROUPS
= (1 << 1),
38 /* Expose per-SE groups instead of summing instances across SEs. */
39 SI_PC_BLOCK_SE_GROUPS
= (1 << 2),
/* NOTE(review): original descriptive comment for this flag is on a
 * missing line; usage below gates per-shader-type group expansion. */
42 SI_PC_BLOCK_SHADER
= (1 << 3),
44 /* Non-shader block with perfcounters windowed by shaders. */
45 SI_PC_BLOCK_SHADER_WINDOWED
= (1 << 4),
/* Layout of a block's counter-select registers. Values 0..3 describe
 * mutually exclusive "multi selector" arrangements; SI_PC_REG_REVERSE
 * is an additional flag.
 * NOTE(review): the SI_PC_MULTI_TAIL (used below) enumerator, the
 * SI_PC_MULTI_MASK value and the SI_PC_FAKE flag referenced later are
 * on lines missing from this extract -- confirm against the original. */
50 /* All secondary selector dwords follow as one block after the primary
51 * selector dwords for the counters that have secondary selectors.
53 SI_PC_MULTI_BLOCK
= 0,
55 /* Each secondary selector dword follows immediately after the
56 * corresponding primary.
58 SI_PC_MULTI_ALTERNATE
= 1,
60 /* All secondary selector dwords follow as one block after all primary
65 /* Free-form arrangement of selector registers. */
66 SI_PC_MULTI_CUSTOM
= 3,
70 /* Registers are laid out in decreasing rather than increasing order. */
71 SI_PC_REG_REVERSE
= 4,
/* Static, chip-independent description of one class of HW perf-counter
 * block (name, registers, layout flags, counter count).
 * NOTE(review): most fields (name, select0, counter0_lo, num_multi,
 * num_prelude, select/counters arrays, layout, flags, select_or) are on
 * lines missing from this extract; only num_counters is visible here,
 * but the cik_* initializers below reference the others. */
76 struct si_pc_block_base
{
78 unsigned num_counters
; /* number of physical counters in this block */
/* Per-gfx-generation descriptor: binds a si_pc_block_base to its
 * per-chip parameters (the groups_CIK/VI/gfx9 tables below initialize
 * a selector count and, optionally, an instance count).
 * NOTE(review): the selectors/instances fields are on lines missing
 * from this extract. */
91 struct si_pc_block_gfxdescr
{
92 struct si_pc_block_base
*b
;
/* Runtime per-block state.
 * NOTE(review): the enclosing "struct si_pc_block {" line, the
 * num_groups field and the group_names field (both used below) are on
 * lines missing from this extract. Name tables are built lazily by
 * si_init_block_names(). */
98 const struct si_pc_block_gfxdescr
*b
;
99 unsigned num_instances
;
103 unsigned group_name_stride
; /* bytes between consecutive group names */
105 char *selector_names
;
106 unsigned selector_name_stride
; /* bytes between consecutive selector names */
109 /* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
110 * performance counter group IDs.
112 static const char *const si_pc_shader_type_suffixes
[] = {"", "_ES", "_GS", "_VS",
113 "_PS", "_LS", "_HS", "_CS"};
/* Per-shader-type SQ_PERFCOUNTER_CTRL mask bits, parallel to the
 * suffix table above.
 * NOTE(review): the initializer values are on lines missing from this
 * extract. */
115 static const unsigned si_pc_shader_type_bits
[] = {
126 /* Max counters per HW block */
127 #define SI_QUERY_MAX_COUNTERS 16
/* Sentinel stored in query->shaders when only shader-windowed blocks
 * were selected; replaced by 0xffffffff when the query is created
 * (see si_create_batch_query). */
129 #define SI_PC_SHADERS_WINDOWING (1u << 31)
/* One group of selected counters within a single block (and SE/instance
 * combination); groups form a singly linked list on the query.
 * NOTE(review): the se/instance fields used throughout the code below
 * are on lines missing from this extract. */
131 struct si_query_group
{
132 struct si_query_group
*next
;
133 struct si_pc_block
*block
;
134 unsigned sub_gid
; /* only used during init */
135 unsigned result_base
; /* only used during init */
138 unsigned num_counters
;
139 unsigned selectors
[SI_QUERY_MAX_COUNTERS
];
/* Maps one user-requested counter to its slot(s) in the result buffer.
 * NOTE(review): the base and qwords fields (used by
 * si_pc_query_add_result below) are on lines missing from this
 * extract, as is the enclosing "struct si_query_pc" header for the
 * fields that follow it. */
142 struct si_query_counter
{
145 unsigned stride
; /* in uint64s */
150 struct si_query_buffer buffer
;
152 /* Size of the results in memory, in bytes. */
153 unsigned result_size
;
156 unsigned num_counters
;
157 struct si_query_counter
*counters
;
158 struct si_query_group
*groups
;
/* CB (color backend): per-SE block with per-instance groups.
 * NOTE(review): the .name/.num_counters/.num_multi initializers and
 * the closing "};" are on lines missing from this extract (same
 * applies to the other cik_* tables in this file). */
161 static struct si_pc_block_base cik_CB
= {
164 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
166 .select0
= R_037000_CB_PERFCOUNTER_FILTER
,
167 .counter0_lo
= R_035018_CB_PERFCOUNTER0_LO
,
170 .layout
= SI_PC_MULTI_ALTERNATE
,
/* CPC select registers are not contiguous, so the block uses the
 * free-form SI_PC_MULTI_CUSTOM layout with this explicit register
 * list (primary, secondary, primary ... alternating). */
173 static unsigned cik_CPC_select
[] = {
174 R_036024_CPC_PERFCOUNTER0_SELECT
,
175 R_036010_CPC_PERFCOUNTER0_SELECT1
,
176 R_03600C_CPC_PERFCOUNTER1_SELECT
,
/* NOTE(review): name/num_counters initializers and the closing "};"
 * of both definitions are on lines missing from this extract. */
178 static struct si_pc_block_base cik_CPC
= {
182 .select
= cik_CPC_select
,
183 .counter0_lo
= R_034018_CPC_PERFCOUNTER0_LO
,
185 .layout
= SI_PC_MULTI_CUSTOM
| SI_PC_REG_REVERSE
,
188 static struct si_pc_block_base cik_CPF
= {
192 .select0
= R_03601C_CPF_PERFCOUNTER0_SELECT
,
193 .counter0_lo
= R_034028_CPF_PERFCOUNTER0_LO
,
195 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
198 static struct si_pc_block_base cik_CPG
= {
202 .select0
= R_036008_CPG_PERFCOUNTER0_SELECT
,
203 .counter0_lo
= R_034008_CPG_PERFCOUNTER0_LO
,
205 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
208 static struct si_pc_block_base cik_DB
= {
211 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
213 .select0
= R_037100_DB_PERFCOUNTER0_SELECT
,
214 .counter0_lo
= R_035100_DB_PERFCOUNTER0_LO
,
215 .num_multi
= 3, // really only 2, but there's a gap between registers
216 .layout
= SI_PC_MULTI_ALTERNATE
,
219 static struct si_pc_block_base cik_GDS
= {
223 .select0
= R_036A00_GDS_PERFCOUNTER0_SELECT
,
224 .counter0_lo
= R_034A00_GDS_PERFCOUNTER0_LO
,
226 .layout
= SI_PC_MULTI_TAIL
,
229 static unsigned cik_GRBM_counters
[] = {
230 R_034100_GRBM_PERFCOUNTER0_LO
,
231 R_03410C_GRBM_PERFCOUNTER1_LO
,
233 static struct si_pc_block_base cik_GRBM
= {
237 .select0
= R_036100_GRBM_PERFCOUNTER0_SELECT
,
238 .counters
= cik_GRBM_counters
,
241 static struct si_pc_block_base cik_GRBMSE
= {
245 .select0
= R_036108_GRBM_SE0_PERFCOUNTER_SELECT
,
246 .counter0_lo
= R_034114_GRBM_SE0_PERFCOUNTER_LO
,
249 static struct si_pc_block_base cik_IA
= {
253 .select0
= R_036210_IA_PERFCOUNTER0_SELECT
,
254 .counter0_lo
= R_034220_IA_PERFCOUNTER0_LO
,
256 .layout
= SI_PC_MULTI_TAIL
,
259 static struct si_pc_block_base cik_PA_SC
= {
262 .flags
= SI_PC_BLOCK_SE
,
264 .select0
= R_036500_PA_SC_PERFCOUNTER0_SELECT
,
265 .counter0_lo
= R_034500_PA_SC_PERFCOUNTER0_LO
,
267 .layout
= SI_PC_MULTI_ALTERNATE
,
270 /* According to docs, PA_SU counters are only 48 bits wide. */
271 static struct si_pc_block_base cik_PA_SU
= {
274 .flags
= SI_PC_BLOCK_SE
,
276 .select0
= R_036400_PA_SU_PERFCOUNTER0_SELECT
,
277 .counter0_lo
= R_034400_PA_SU_PERFCOUNTER0_LO
,
279 .layout
= SI_PC_MULTI_ALTERNATE
,
282 static struct si_pc_block_base cik_SPI
= {
285 .flags
= SI_PC_BLOCK_SE
,
287 .select0
= R_036600_SPI_PERFCOUNTER0_SELECT
,
288 .counter0_lo
= R_034604_SPI_PERFCOUNTER0_LO
,
290 .layout
= SI_PC_MULTI_BLOCK
,
293 static struct si_pc_block_base cik_SQ
= {
296 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER
,
298 .select0
= R_036700_SQ_PERFCOUNTER0_SELECT
,
299 .select_or
= S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15),
300 .counter0_lo
= R_034700_SQ_PERFCOUNTER0_LO
,
303 static struct si_pc_block_base cik_SX
= {
306 .flags
= SI_PC_BLOCK_SE
,
308 .select0
= R_036900_SX_PERFCOUNTER0_SELECT
,
309 .counter0_lo
= R_034900_SX_PERFCOUNTER0_LO
,
311 .layout
= SI_PC_MULTI_TAIL
,
314 static struct si_pc_block_base cik_TA
= {
317 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
319 .select0
= R_036B00_TA_PERFCOUNTER0_SELECT
,
320 .counter0_lo
= R_034B00_TA_PERFCOUNTER0_LO
,
322 .layout
= SI_PC_MULTI_ALTERNATE
,
325 static struct si_pc_block_base cik_TD
= {
328 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
330 .select0
= R_036C00_TD_PERFCOUNTER0_SELECT
,
331 .counter0_lo
= R_034C00_TD_PERFCOUNTER0_LO
,
333 .layout
= SI_PC_MULTI_ALTERNATE
,
336 static struct si_pc_block_base cik_TCA
= {
339 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
341 .select0
= R_036E40_TCA_PERFCOUNTER0_SELECT
,
342 .counter0_lo
= R_034E40_TCA_PERFCOUNTER0_LO
,
344 .layout
= SI_PC_MULTI_ALTERNATE
,
347 static struct si_pc_block_base cik_TCC
= {
350 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
352 .select0
= R_036E00_TCC_PERFCOUNTER0_SELECT
,
353 .counter0_lo
= R_034E00_TCC_PERFCOUNTER0_LO
,
355 .layout
= SI_PC_MULTI_ALTERNATE
,
358 static struct si_pc_block_base cik_TCP
= {
361 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
363 .select0
= R_036D00_TCP_PERFCOUNTER0_SELECT
,
364 .counter0_lo
= R_034D00_TCP_PERFCOUNTER0_LO
,
366 .layout
= SI_PC_MULTI_ALTERNATE
,
369 static struct si_pc_block_base cik_VGT
= {
372 .flags
= SI_PC_BLOCK_SE
,
374 .select0
= R_036230_VGT_PERFCOUNTER0_SELECT
,
375 .counter0_lo
= R_034240_VGT_PERFCOUNTER0_LO
,
377 .layout
= SI_PC_MULTI_TAIL
,
380 static struct si_pc_block_base cik_WD
= {
384 .select0
= R_036200_WD_PERFCOUNTER0_SELECT
,
385 .counter0_lo
= R_034200_WD_PERFCOUNTER0_LO
,
388 static struct si_pc_block_base cik_MC
= {
392 .layout
= SI_PC_FAKE
,
395 static struct si_pc_block_base cik_SRBM
= {
399 .layout
= SI_PC_FAKE
,
402 /* Both the number of instances and selectors varies between chips of the same
403 * class. We only differentiate by class here and simply expose the maximum
404 * number over all chips in a class.
406 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
407 * blindly once it believes it has identified the hardware, so the order of
408 * blocks here matters.
410 static struct si_pc_block_gfxdescr groups_CIK
[] = {
411 {&cik_CB
, 226}, {&cik_CPF
, 17}, {&cik_DB
, 257}, {&cik_GRBM
, 34}, {&cik_GRBMSE
, 15},
412 {&cik_PA_SU
, 153}, {&cik_PA_SC
, 395}, {&cik_SPI
, 186}, {&cik_SQ
, 252}, {&cik_SX
, 32},
413 {&cik_TA
, 111}, {&cik_TCA
, 39, 2}, {&cik_TCC
, 160}, {&cik_TD
, 55}, {&cik_TCP
, 154},
414 {&cik_GDS
, 121}, {&cik_VGT
, 140}, {&cik_IA
, 22}, {&cik_MC
, 22}, {&cik_SRBM
, 19},
415 {&cik_WD
, 22}, {&cik_CPG
, 46}, {&cik_CPC
, 22},
419 static struct si_pc_block_gfxdescr groups_VI
[] = {
420 {&cik_CB
, 405}, {&cik_CPF
, 19}, {&cik_DB
, 257}, {&cik_GRBM
, 34}, {&cik_GRBMSE
, 15},
421 {&cik_PA_SU
, 154}, {&cik_PA_SC
, 397}, {&cik_SPI
, 197}, {&cik_SQ
, 273}, {&cik_SX
, 34},
422 {&cik_TA
, 119}, {&cik_TCA
, 35, 2}, {&cik_TCC
, 192}, {&cik_TD
, 55}, {&cik_TCP
, 180},
423 {&cik_GDS
, 121}, {&cik_VGT
, 147}, {&cik_IA
, 24}, {&cik_MC
, 22}, {&cik_SRBM
, 27},
424 {&cik_WD
, 37}, {&cik_CPG
, 48}, {&cik_CPC
, 24},
428 static struct si_pc_block_gfxdescr groups_gfx9
[] = {
429 {&cik_CB
, 438}, {&cik_CPF
, 32}, {&cik_DB
, 328}, {&cik_GRBM
, 38}, {&cik_GRBMSE
, 16},
430 {&cik_PA_SU
, 292}, {&cik_PA_SC
, 491}, {&cik_SPI
, 196}, {&cik_SQ
, 374}, {&cik_SX
, 208},
431 {&cik_TA
, 119}, {&cik_TCA
, 35, 2}, {&cik_TCC
, 256}, {&cik_TD
, 57}, {&cik_TCP
, 85},
432 {&cik_GDS
, 121}, {&cik_VGT
, 148}, {&cik_IA
, 32}, {&cik_WD
, 58}, {&cik_CPG
, 59},
/* True if this block exposes one counter group per shader engine:
 * either the block requires it (SI_PC_BLOCK_SE_GROUPS) or it is an
 * SE-local block and per-SE reporting was requested via the
 * RADEON_PC_SEPARATE_SE debug option (see si_init_perfcounters).
 * NOTE(review): braces of the function body are on missing lines. */
436 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters
*pc
,
437 const struct si_pc_block
*block
)
439 return block
->b
->b
->flags
& SI_PC_BLOCK_SE_GROUPS
||
440 (block
->b
->b
->flags
& SI_PC_BLOCK_SE
&& pc
->separate_se
);
/* True if this block exposes one counter group per instance: either the
 * block requires it (SI_PC_BLOCK_INSTANCE_GROUPS) or it has multiple
 * instances and per-instance reporting was requested via the
 * RADEON_PC_SEPARATE_INSTANCE debug option.
 * NOTE(review): braces of the function body are on missing lines. */
443 static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters
*pc
,
444 const struct si_pc_block
*block
)
446 return block
->b
->b
->flags
& SI_PC_BLOCK_INSTANCE_GROUPS
||
447 (block
->num_instances
> 1 && pc
->separate_instance
);
/* Map a flat counter index to its block. On success *sub_index is the
 * counter's offset within the block and *base_gid the first group id of
 * that block.
 * NOTE(review): several interior lines are missing from this extract
 * (initialization of *base_gid, the "index < total" comparison with
 * its return, and the trailing return for the not-found case) --
 * confirm the full body in the original file. */
450 static struct si_pc_block
*lookup_counter(struct si_perfcounters
*pc
, unsigned index
,
451 unsigned *base_gid
, unsigned *sub_index
)
453 struct si_pc_block
*block
= pc
->blocks
;
457 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
/* total number of counters exposed by this block across all groups */
458 unsigned total
= block
->num_groups
* block
->b
->selectors
;
466 *base_gid
+= block
->num_groups
;
/* Find the block containing group *index; on return *index is reduced
 * to the group's offset within the returned block.
 * NOTE(review): the "return block;" inside the loop and the final
 * return for the out-of-range case are on lines missing from this
 * extract. */
472 static struct si_pc_block
*lookup_group(struct si_perfcounters
*pc
, unsigned *index
)
475 struct si_pc_block
*block
= pc
->blocks
;
477 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
478 if (*index
< block
->num_groups
)
480 *index
-= block
->num_groups
;
/* Program GRBM_GFX_INDEX to target a specific shader engine and block
 * instance; a negative se or instance selects broadcast to all.
 * NOTE(review): the "if (se >= 0)"/"else" and the matching
 * instance-test lines are missing from this extract, so only one arm
 * of each pair is visible. */
486 static void si_pc_emit_instance(struct si_context
*sctx
, int se
, int instance
)
488 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
489 unsigned value
= S_030800_SH_BROADCAST_WRITES(1);
492 value
|= S_030800_SE_INDEX(se
);
494 value
|= S_030800_SE_BROADCAST_WRITES(1);
498 value
|= S_030800_INSTANCE_INDEX(instance
);
500 value
|= S_030800_INSTANCE_BROADCAST_WRITES(1);
503 radeon_set_uconfig_reg(cs
, R_030800_GRBM_GFX_INDEX
, value
);
/* Program the shader-type window for windowed perf counters: writes
 * SQ_PERFCOUNTER_CTRL (low 7 bits = shader-type mask) and the
 * following register (all-ones).
 * NOTE(review): body braces are on lines missing from this extract. */
506 static void si_pc_emit_shaders(struct si_context
*sctx
, unsigned shaders
)
508 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
510 radeon_set_uconfig_reg_seq(cs
, R_036780_SQ_PERFCOUNTER_CTRL
, 2);
511 radeon_emit(cs
, shaders
& 0x7f);
512 radeon_emit(cs
, 0xffffffff);
515 static void si_pc_emit_select(struct si_context
*sctx
, struct si_pc_block
*block
, unsigned count
,
518 struct si_pc_block_base
*regs
= block
->b
->b
;
519 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
521 unsigned layout_multi
= regs
->layout
& SI_PC_MULTI_MASK
;
524 assert(count
<= regs
->num_counters
);
526 if (regs
->layout
& SI_PC_FAKE
)
529 if (layout_multi
== SI_PC_MULTI_BLOCK
) {
530 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
532 dw
= count
+ regs
->num_prelude
;
533 if (count
>= regs
->num_multi
)
534 dw
+= regs
->num_multi
;
535 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, dw
);
536 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
538 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
539 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
541 if (count
< regs
->num_multi
) {
542 unsigned select1
= regs
->select0
+ 4 * regs
->num_multi
;
543 radeon_set_uconfig_reg_seq(cs
, select1
, count
);
546 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
549 if (count
> regs
->num_multi
) {
550 for (idx
= regs
->num_multi
; idx
< count
; ++idx
)
551 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
553 } else if (layout_multi
== SI_PC_MULTI_TAIL
) {
554 unsigned select1
, select1_count
;
556 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
558 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, count
+ regs
->num_prelude
);
559 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
561 for (idx
= 0; idx
< count
; ++idx
)
562 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
564 select1
= regs
->select0
+ 4 * regs
->num_counters
;
565 select1_count
= MIN2(count
, regs
->num_multi
);
566 radeon_set_uconfig_reg_seq(cs
, select1
, select1_count
);
567 for (idx
= 0; idx
< select1_count
; ++idx
)
569 } else if (layout_multi
== SI_PC_MULTI_CUSTOM
) {
570 unsigned *reg
= regs
->select
;
571 for (idx
= 0; idx
< count
; ++idx
) {
572 radeon_set_uconfig_reg(cs
, *reg
++, selectors
[idx
] | regs
->select_or
);
573 if (idx
< regs
->num_multi
)
574 radeon_set_uconfig_reg(cs
, *reg
++, 0);
577 assert(layout_multi
== SI_PC_MULTI_ALTERNATE
);
579 unsigned reg_base
= regs
->select0
;
580 unsigned reg_count
= count
+ MIN2(count
, regs
->num_multi
);
581 reg_count
+= regs
->num_prelude
;
583 if (!(regs
->layout
& SI_PC_REG_REVERSE
)) {
584 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
586 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
588 for (idx
= 0; idx
< count
; ++idx
) {
589 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
590 if (idx
< regs
->num_multi
)
594 reg_base
-= (reg_count
- 1) * 4;
595 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
597 for (idx
= count
; idx
> 0; --idx
) {
598 if (idx
<= regs
->num_multi
)
600 radeon_emit(cs
, selectors
[idx
- 1] | regs
->select_or
);
602 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
/* Emit CS commands to start counting: clear the fence dword at va in
 * the results buffer (written by si_pc_emit_stop's release_mem),
 * reset the perfmon counters, emit the PERFCOUNTER_START event, then
 * switch CP_PERFMON_CNTL to START_COUNTING. */
608 static void si_pc_emit_start(struct si_context
*sctx
, struct si_resource
*buffer
, uint64_t va
)
610 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
/* write immediate 0 at offset (va - gpu_address) inside the buffer */
612 si_cp_copy_data(sctx
, sctx
->gfx_cs
, COPY_DATA_DST_MEM
, buffer
, va
- buffer
->gpu_address
,
613 COPY_DATA_IMM
, NULL
, 1);
615 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
616 S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET
));
617 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
618 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_START
) | EVENT_INDEX(0));
619 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
620 S_036020_PERFMON_STATE(V_036020_START_COUNTING
));
623 /* Note: The buffer was already added in si_pc_emit_start, so we don't have to
624 * do it again in here. */
/* Emit CS commands to stop counting: write a bottom-of-pipe fence
 * value at va, wait for it (so all prior work has drained), sample and
 * stop the counters, then move CP_PERFMON_CNTL to
 * STOP_COUNTING | SAMPLE_ENABLE so the values latch for readback. */
625 static void si_pc_emit_stop(struct si_context
*sctx
, struct si_resource
*buffer
, uint64_t va
)
627 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
629 si_cp_release_mem(sctx
, cs
, V_028A90_BOTTOM_OF_PIPE_TS
, 0, EOP_DST_SEL_MEM
, EOP_INT_SEL_NONE
,
630 EOP_DATA_SEL_VALUE_32BIT
, buffer
, va
, 0, SI_NOT_QUERY
);
/* stall the CP until the fence equals 0 -- NOTE(review): the release
 * above writes at va; confirm intended fence value in the original. */
631 si_cp_wait_mem(sctx
, cs
, va
, 0, 0xffffffff, WAIT_REG_MEM_EQUAL
);
633 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
634 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE
) | EVENT_INDEX(0));
635 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
636 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP
) | EVENT_INDEX(0));
637 radeon_set_uconfig_reg(
638 cs
, R_036020_CP_PERFMON_CNTL
,
639 S_036020_PERFMON_STATE(V_036020_STOP_COUNTING
) | S_036020_PERFMON_SAMPLE_ENABLE(1));
642 static void si_pc_emit_read(struct si_context
*sctx
, struct si_pc_block
*block
, unsigned count
,
645 struct si_pc_block_base
*regs
= block
->b
->b
;
646 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
648 unsigned reg
= regs
->counter0_lo
;
649 unsigned reg_delta
= 8;
651 if (!(regs
->layout
& SI_PC_FAKE
)) {
652 if (regs
->layout
& SI_PC_REG_REVERSE
)
653 reg_delta
= -reg_delta
;
655 for (idx
= 0; idx
< count
; ++idx
) {
657 reg
= regs
->counters
[idx
];
659 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
660 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_PERF
) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM
) |
661 COPY_DATA_COUNT_SEL
); /* 64 bits */
662 radeon_emit(cs
, reg
>> 2);
663 radeon_emit(cs
, 0); /* unused */
665 radeon_emit(cs
, va
>> 32);
666 va
+= sizeof(uint64_t);
670 for (idx
= 0; idx
< count
; ++idx
) {
671 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
672 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_IMM
) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM
) |
673 COPY_DATA_COUNT_SEL
);
674 radeon_emit(cs
, 0); /* immediate */
677 radeon_emit(cs
, va
>> 32);
678 va
+= sizeof(uint64_t);
/* Destroy a perf-counter batch query: free the group list, the counter
 * array, the results buffer, and the query itself.
 * NOTE(review): the FREE(group) inside the loop and the final
 * FREE(query) are on lines missing from this extract -- without them
 * this listing would leak; confirm in the original file. */
683 static void si_pc_query_destroy(struct si_context
*sctx
, struct si_query
*squery
)
685 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
/* unlink and free each group in turn */
687 while (query
->groups
) {
688 struct si_query_group
*group
= query
->groups
;
689 query
->groups
= group
->next
;
693 FREE(query
->counters
);
695 si_query_buffer_destroy(sctx
->screen
, &query
->buffer
);
699 static void si_pc_query_resume(struct si_context
*sctx
, struct si_query
*squery
)
701 struct si_query_hw *hwquery,
702 struct si_resource *buffer, uint64_t va)*/
704 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
706 int current_instance
= -1;
708 if (!si_query_buffer_alloc(sctx
, &query
->buffer
, NULL
, query
->result_size
))
710 si_need_gfx_cs_space(sctx
);
713 si_pc_emit_shaders(sctx
, query
->shaders
);
715 for (struct si_query_group
*group
= query
->groups
; group
; group
= group
->next
) {
716 struct si_pc_block
*block
= group
->block
;
718 if (group
->se
!= current_se
|| group
->instance
!= current_instance
) {
719 current_se
= group
->se
;
720 current_instance
= group
->instance
;
721 si_pc_emit_instance(sctx
, group
->se
, group
->instance
);
724 si_pc_emit_select(sctx
, block
, group
->num_counters
, group
->selectors
);
727 if (current_se
!= -1 || current_instance
!= -1)
728 si_pc_emit_instance(sctx
, -1, -1);
730 uint64_t va
= query
->buffer
.buf
->gpu_address
+ query
->buffer
.results_end
;
731 si_pc_emit_start(sctx
, query
->buffer
.buf
, va
);
734 static void si_pc_query_suspend(struct si_context
*sctx
, struct si_query
*squery
)
736 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
738 if (!query
->buffer
.buf
)
741 uint64_t va
= query
->buffer
.buf
->gpu_address
+ query
->buffer
.results_end
;
742 query
->buffer
.results_end
+= query
->result_size
;
744 si_pc_emit_stop(sctx
, query
->buffer
.buf
, va
);
746 for (struct si_query_group
*group
= query
->groups
; group
; group
= group
->next
) {
747 struct si_pc_block
*block
= group
->block
;
748 unsigned se
= group
->se
>= 0 ? group
->se
: 0;
749 unsigned se_end
= se
+ 1;
751 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && (group
->se
< 0))
752 se_end
= sctx
->screen
->info
.max_se
;
755 unsigned instance
= group
->instance
>= 0 ? group
->instance
: 0;
758 si_pc_emit_instance(sctx
, se
, instance
);
759 si_pc_emit_read(sctx
, block
, group
->num_counters
, va
);
760 va
+= sizeof(uint64_t) * group
->num_counters
;
761 } while (group
->instance
< 0 && ++instance
< block
->num_instances
);
762 } while (++se
< se_end
);
765 si_pc_emit_instance(sctx
, -1, -1);
/* Begin a perf-counter batch query: reset the results buffer, register
 * the query as active (including its suspend CS-dword budget), and
 * emit the start sequence via si_pc_query_resume.
 * NOTE(review): the "return true;" and body braces are on lines
 * missing from this extract. */
768 static bool si_pc_query_begin(struct si_context
*ctx
, struct si_query
*squery
)
770 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
772 si_query_buffer_reset(ctx
, &query
->buffer
);
774 list_addtail(&query
->b
.active_list
, &ctx
->active_queries
);
775 ctx
->num_cs_dw_queries_suspend
+= query
->b
.num_cs_dw_suspend
;
777 si_pc_query_resume(ctx
, squery
);
/* End a perf-counter batch query: emit the stop/read sequence via
 * si_pc_query_suspend and deregister the query from the active list.
 * Returns false if no results buffer was ever allocated. */
782 static bool si_pc_query_end(struct si_context
*ctx
, struct si_query
*squery
)
784 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
786 si_pc_query_suspend(ctx
, squery
);
788 list_del(&squery
->active_list
);
789 ctx
->num_cs_dw_queries_suspend
-= squery
->num_cs_dw_suspend
;
791 return query
->buffer
.buf
!= NULL
;
/* Accumulate one snapshot (at "buffer") into "result": for each
 * requested counter, sum its qwords slots (one per SE/instance) from
 * the 64-bit result array.
 * NOTE(review): only the low 32 bits of each 64-bit slot are summed
 * (uint32_t truncation) -- presumably the HW counters carry 32 valid
 * bits; confirm against the register spec before "fixing". Closing
 * braces are on lines missing from this extract. */
794 static void si_pc_query_add_result(struct si_query_pc
*query
, void *buffer
,
795 union pipe_query_result
*result
)
797 uint64_t *results
= buffer
;
800 for (i
= 0; i
< query
->num_counters
; ++i
) {
801 struct si_query_counter
*counter
= &query
->counters
[i
];
803 for (j
= 0; j
< counter
->qwords
; ++j
) {
804 uint32_t value
= results
[counter
->base
+ j
* counter
->stride
];
805 result
->batch
[i
].u64
+= value
;
/* Gather results: zero the per-counter accumulators, then walk the
 * chain of result buffers and add every recorded snapshot.
 * NOTE(review): the declaration of "map", the NULL-map early return,
 * the final "return true;" and closing braces are on lines missing
 * from this extract. */
810 static bool si_pc_query_get_result(struct si_context
*sctx
, struct si_query
*squery
, bool wait
,
811 union pipe_query_result
*result
)
813 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
815 memset(result
, 0, sizeof(result
->batch
[0]) * query
->num_counters
);
817 for (struct si_query_buffer
*qbuf
= &query
->buffer
; qbuf
; qbuf
= qbuf
->previous
) {
818 unsigned usage
= PIPE_TRANSFER_READ
| (wait
? 0 : PIPE_TRANSFER_DONTBLOCK
);
819 unsigned results_base
= 0;
/* already flushed: plain map suffices; otherwise sync with rings */
822 if (squery
->b
.flushed
)
823 map
= sctx
->ws
->buffer_map(qbuf
->buf
->buf
, NULL
, usage
);
825 map
= si_buffer_map_sync_with_rings(sctx
, qbuf
->buf
, usage
);
/* one add_result per recorded snapshot in this buffer */
830 while (results_base
!= qbuf
->results_end
) {
831 si_pc_query_add_result(query
, map
+ results_base
, result
);
832 results_base
+= query
->result_size
;
/* vtable wiring the perf-counter batch query into the common si_query
 * machinery (suspend/resume are used around CS flushes). */
839 static const struct si_query_ops batch_query_ops
= {
840 .destroy
= si_pc_query_destroy
,
841 .begin
= si_pc_query_begin
,
842 .end
= si_pc_query_end
,
843 .get_result
= si_pc_query_get_result
,
845 .suspend
= si_pc_query_suspend
,
846 .resume
= si_pc_query_resume
,
849 static struct si_query_group
*get_group_state(struct si_screen
*screen
, struct si_query_pc
*query
,
850 struct si_pc_block
*block
, unsigned sub_gid
)
852 struct si_query_group
*group
= query
->groups
;
855 if (group
->block
== block
&& group
->sub_gid
== sub_gid
)
860 group
= CALLOC_STRUCT(si_query_group
);
864 group
->block
= block
;
865 group
->sub_gid
= sub_gid
;
867 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
868 unsigned sub_gids
= block
->num_instances
;
871 unsigned query_shaders
;
873 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
))
874 sub_gids
= sub_gids
* screen
->info
.max_se
;
875 shader_id
= sub_gid
/ sub_gids
;
876 sub_gid
= sub_gid
% sub_gids
;
878 shaders
= si_pc_shader_type_bits
[shader_id
];
880 query_shaders
= query
->shaders
& ~SI_PC_SHADERS_WINDOWING
;
881 if (query_shaders
&& query_shaders
!= shaders
) {
882 fprintf(stderr
, "si_perfcounter: incompatible shader groups\n");
886 query
->shaders
= shaders
;
889 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER_WINDOWED
&& !query
->shaders
) {
890 // A non-zero value in query->shaders ensures that the shader
891 // masking is reset unless the user explicitly requests one.
892 query
->shaders
= SI_PC_SHADERS_WINDOWING
;
895 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
)) {
896 group
->se
= sub_gid
/ block
->num_instances
;
897 sub_gid
= sub_gid
% block
->num_instances
;
902 if (si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
)) {
903 group
->instance
= sub_gid
;
905 group
->instance
= -1;
908 group
->next
= query
->groups
;
909 query
->groups
= group
;
914 struct pipe_query
*si_create_batch_query(struct pipe_context
*ctx
, unsigned num_queries
,
915 unsigned *query_types
)
917 struct si_screen
*screen
= (struct si_screen
*)ctx
->screen
;
918 struct si_perfcounters
*pc
= screen
->perfcounters
;
919 struct si_pc_block
*block
;
920 struct si_query_group
*group
;
921 struct si_query_pc
*query
;
922 unsigned base_gid
, sub_gid
, sub_index
;
928 query
= CALLOC_STRUCT(si_query_pc
);
932 query
->b
.ops
= &batch_query_ops
;
934 query
->num_counters
= num_queries
;
936 /* Collect selectors per group */
937 for (i
= 0; i
< num_queries
; ++i
) {
940 if (query_types
[i
] < SI_QUERY_FIRST_PERFCOUNTER
)
944 lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
, &base_gid
, &sub_index
);
948 sub_gid
= sub_index
/ block
->b
->selectors
;
949 sub_index
= sub_index
% block
->b
->selectors
;
951 group
= get_group_state(screen
, query
, block
, sub_gid
);
955 if (group
->num_counters
>= block
->b
->b
->num_counters
) {
956 fprintf(stderr
, "perfcounter group %s: too many selected\n", block
->b
->b
->name
);
959 group
->selectors
[group
->num_counters
] = sub_index
;
960 ++group
->num_counters
;
963 /* Compute result bases and CS size per group */
964 query
->b
.num_cs_dw_suspend
= pc
->num_stop_cs_dwords
;
965 query
->b
.num_cs_dw_suspend
+= pc
->num_instance_cs_dwords
;
968 for (group
= query
->groups
; group
; group
= group
->next
) {
969 struct si_pc_block
*block
= group
->block
;
971 unsigned instances
= 1;
973 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
974 instances
= screen
->info
.max_se
;
975 if (group
->instance
< 0)
976 instances
*= block
->num_instances
;
978 group
->result_base
= i
;
979 query
->result_size
+= sizeof(uint64_t) * instances
* group
->num_counters
;
980 i
+= instances
* group
->num_counters
;
982 read_dw
= 6 * group
->num_counters
;
983 query
->b
.num_cs_dw_suspend
+= instances
* read_dw
;
984 query
->b
.num_cs_dw_suspend
+= instances
* pc
->num_instance_cs_dwords
;
987 if (query
->shaders
) {
988 if (query
->shaders
== SI_PC_SHADERS_WINDOWING
)
989 query
->shaders
= 0xffffffff;
992 /* Map user-supplied query array to result indices */
993 query
->counters
= CALLOC(num_queries
, sizeof(*query
->counters
));
994 for (i
= 0; i
< num_queries
; ++i
) {
995 struct si_query_counter
*counter
= &query
->counters
[i
];
996 struct si_pc_block
*block
;
999 lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
, &base_gid
, &sub_index
);
1001 sub_gid
= sub_index
/ block
->b
->selectors
;
1002 sub_index
= sub_index
% block
->b
->selectors
;
1004 group
= get_group_state(screen
, query
, block
, sub_gid
);
1005 assert(group
!= NULL
);
1007 for (j
= 0; j
< group
->num_counters
; ++j
) {
1008 if (group
->selectors
[j
] == sub_index
)
1012 counter
->base
= group
->result_base
+ j
;
1013 counter
->stride
= group
->num_counters
;
1015 counter
->qwords
= 1;
1016 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
1017 counter
->qwords
= screen
->info
.max_se
;
1018 if (group
->instance
< 0)
1019 counter
->qwords
*= block
->num_instances
;
1022 return (struct pipe_query
*)query
;
1025 si_pc_query_destroy((struct si_context
*)ctx
, &query
->b
);
1029 static bool si_init_block_names(struct si_screen
*screen
, struct si_pc_block
*block
)
1031 bool per_instance_groups
= si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
);
1032 bool per_se_groups
= si_pc_block_has_per_se_groups(screen
->perfcounters
, block
);
1034 unsigned groups_shader
= 1, groups_se
= 1, groups_instance
= 1;
1039 if (per_instance_groups
)
1040 groups_instance
= block
->num_instances
;
1042 groups_se
= screen
->info
.max_se
;
1043 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1044 groups_shader
= ARRAY_SIZE(si_pc_shader_type_bits
);
1046 namelen
= strlen(block
->b
->b
->name
);
1047 block
->group_name_stride
= namelen
+ 1;
1048 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1049 block
->group_name_stride
+= 3;
1050 if (per_se_groups
) {
1051 assert(groups_se
<= 10);
1052 block
->group_name_stride
+= 1;
1054 if (per_instance_groups
)
1055 block
->group_name_stride
+= 1;
1057 if (per_instance_groups
) {
1058 assert(groups_instance
<= 100);
1059 block
->group_name_stride
+= 2;
1062 block
->group_names
= MALLOC(block
->num_groups
* block
->group_name_stride
);
1063 if (!block
->group_names
)
1066 groupname
= block
->group_names
;
1067 for (i
= 0; i
< groups_shader
; ++i
) {
1068 const char *shader_suffix
= si_pc_shader_type_suffixes
[i
];
1069 unsigned shaderlen
= strlen(shader_suffix
);
1070 for (j
= 0; j
< groups_se
; ++j
) {
1071 for (k
= 0; k
< groups_instance
; ++k
) {
1072 strcpy(groupname
, block
->b
->b
->name
);
1073 p
= groupname
+ namelen
;
1075 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
1076 strcpy(p
, shader_suffix
);
1080 if (per_se_groups
) {
1081 p
+= sprintf(p
, "%d", j
);
1082 if (per_instance_groups
)
1086 if (per_instance_groups
)
1087 p
+= sprintf(p
, "%d", k
);
1089 groupname
+= block
->group_name_stride
;
1094 assert(block
->b
->selectors
<= 1000);
1095 block
->selector_name_stride
= block
->group_name_stride
+ 4;
1096 block
->selector_names
=
1097 MALLOC(block
->num_groups
* block
->b
->selectors
* block
->selector_name_stride
);
1098 if (!block
->selector_names
)
1101 groupname
= block
->group_names
;
1102 p
= block
->selector_names
;
1103 for (i
= 0; i
< block
->num_groups
; ++i
) {
1104 for (j
= 0; j
< block
->b
->selectors
; ++j
) {
1105 sprintf(p
, "%s_%03d", groupname
, j
);
1106 p
+= block
->selector_name_stride
;
1108 groupname
+= block
->group_name_stride
;
1114 int si_get_perfcounter_info(struct si_screen
*screen
, unsigned index
,
1115 struct pipe_driver_query_info
*info
)
1117 struct si_perfcounters
*pc
= screen
->perfcounters
;
1118 struct si_pc_block
*block
;
1119 unsigned base_gid
, sub
;
1125 unsigned bid
, num_queries
= 0;
1127 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
) {
1128 num_queries
+= pc
->blocks
[bid
].b
->selectors
* pc
->blocks
[bid
].num_groups
;
1134 block
= lookup_counter(pc
, index
, &base_gid
, &sub
);
1138 if (!block
->selector_names
) {
1139 if (!si_init_block_names(screen
, block
))
1142 info
->name
= block
->selector_names
+ sub
* block
->selector_name_stride
;
1143 info
->query_type
= SI_QUERY_FIRST_PERFCOUNTER
+ index
;
1144 info
->max_value
.u64
= 0;
1145 info
->type
= PIPE_DRIVER_QUERY_TYPE_UINT64
;
1146 info
->result_type
= PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE
;
1147 info
->group_id
= base_gid
+ sub
/ block
->b
->selectors
;
1148 info
->flags
= PIPE_DRIVER_QUERY_FLAG_BATCH
;
1149 if (sub
> 0 && sub
+ 1 < block
->b
->selectors
* block
->num_groups
)
1150 info
->flags
|= PIPE_DRIVER_QUERY_FLAG_DONT_LIST
;
/* Gallium driver-query entry point: describe perf-counter group
 * "index"; with info == NULL (per the usual Gallium contract) returns
 * the total number of groups.
 * NOTE(review): the NULL checks for pc/info/block, the early returns
 * and the final return value are on lines missing from this extract. */
1154 int si_get_perfcounter_group_info(struct si_screen
*screen
, unsigned index
,
1155 struct pipe_driver_query_group_info
*info
)
1157 struct si_perfcounters
*pc
= screen
->perfcounters
;
1158 struct si_pc_block
*block
;
1164 return pc
->num_groups
;
1166 block
= lookup_group(pc
, &index
);
/* group names are built lazily on first request */
1170 if (!block
->group_names
) {
1171 if (!si_init_block_names(screen
, block
))
1174 info
->name
= block
->group_names
+ index
* block
->group_name_stride
;
1175 info
->num_queries
= block
->b
->selectors
;
1176 info
->max_active_queries
= block
->b
->b
->num_counters
;
/* Free all perf-counter screen state: per-block name tables, the block
 * array and the si_perfcounters struct, then clear the screen pointer.
 * NOTE(review): the NULL check for pc and the FREE(pc->blocks)/FREE(pc)
 * lines are on lines missing from this extract -- confirm in the
 * original file. */
1180 void si_destroy_perfcounters(struct si_screen
*screen
)
1182 struct si_perfcounters
*pc
= screen
->perfcounters
;
1188 for (i
= 0; i
< pc
->num_blocks
; ++i
) {
1189 FREE(pc
->blocks
[i
].group_names
);
1190 FREE(pc
->blocks
[i
].selector_names
);
1194 screen
->perfcounters
= NULL
;
1197 void si_init_perfcounters(struct si_screen
*screen
)
1199 struct si_perfcounters
*pc
;
1200 const struct si_pc_block_gfxdescr
*blocks
;
1201 unsigned num_blocks
;
1204 switch (screen
->info
.chip_class
) {
1206 blocks
= groups_CIK
;
1207 num_blocks
= ARRAY_SIZE(groups_CIK
);
1211 num_blocks
= ARRAY_SIZE(groups_VI
);
1214 blocks
= groups_gfx9
;
1215 num_blocks
= ARRAY_SIZE(groups_gfx9
);
1219 return; /* not implemented */
1222 if (screen
->info
.max_sh_per_se
!= 1) {
1223 /* This should not happen on non-GFX6 chips. */
1225 "si_init_perfcounters: max_sh_per_se = %d not "
1226 "supported (inaccurate performance counters)\n",
1227 screen
->info
.max_sh_per_se
);
1230 screen
->perfcounters
= pc
= CALLOC_STRUCT(si_perfcounters
);
1234 pc
->num_stop_cs_dwords
= 14 + si_cp_write_fence_dwords(screen
);
1235 pc
->num_instance_cs_dwords
= 3;
1237 pc
->separate_se
= debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
1238 pc
->separate_instance
= debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
1240 pc
->blocks
= CALLOC(num_blocks
, sizeof(struct si_pc_block
));
1243 pc
->num_blocks
= num_blocks
;
1245 for (i
= 0; i
< num_blocks
; ++i
) {
1246 struct si_pc_block
*block
= &pc
->blocks
[i
];
1247 block
->b
= &blocks
[i
];
1248 block
->num_instances
= MAX2(1, block
->b
->instances
);
1250 if (!strcmp(block
->b
->b
->name
, "CB") || !strcmp(block
->b
->b
->name
, "DB"))
1251 block
->num_instances
= screen
->info
.max_se
;
1252 else if (!strcmp(block
->b
->b
->name
, "TCC"))
1253 block
->num_instances
= screen
->info
.num_tcc_blocks
;
1254 else if (!strcmp(block
->b
->b
->name
, "IA"))
1255 block
->num_instances
= MAX2(1, screen
->info
.max_se
/ 2);
1256 else if (!strcmp(block
->b
->b
->name
, "TA") ||
1257 !strcmp(block
->b
->b
->name
, "TCP") ||
1258 !strcmp(block
->b
->b
->name
, "TD")) {
1259 block
->num_instances
= MAX2(1, screen
->info
.max_good_cu_per_sa
);
1262 if (si_pc_block_has_per_instance_groups(pc
, block
)) {
1263 block
->num_groups
= block
->num_instances
;
1265 block
->num_groups
= 1;
1268 if (si_pc_block_has_per_se_groups(pc
, block
))
1269 block
->num_groups
*= screen
->info
.max_se
;
1270 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1271 block
->num_groups
*= ARRAY_SIZE(si_pc_shader_type_bits
);
1273 pc
->num_groups
+= block
->num_groups
;
1279 si_destroy_perfcounters(screen
);