/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
25 #include "si_build_pm4.h"
27 #include "util/u_memory.h"
30 enum si_pc_block_flags
{
31 /* This block is part of the shader engine */
32 SI_PC_BLOCK_SE
= (1 << 0),
34 /* Expose per-instance groups instead of summing all instances (within
36 SI_PC_BLOCK_INSTANCE_GROUPS
= (1 << 1),
38 /* Expose per-SE groups instead of summing instances across SEs. */
39 SI_PC_BLOCK_SE_GROUPS
= (1 << 2),
42 SI_PC_BLOCK_SHADER
= (1 << 3),
44 /* Non-shader block with perfcounters windowed by shaders. */
45 SI_PC_BLOCK_SHADER_WINDOWED
= (1 << 4),
48 enum si_pc_reg_layout
{
49 /* All secondary selector dwords follow as one block after the primary
50 * selector dwords for the counters that have secondary selectors.
52 SI_PC_MULTI_BLOCK
= 0,
54 /* Each secondary selector dword follows immediately afters the
55 * corresponding primary.
57 SI_PC_MULTI_ALTERNATE
= 1,
59 /* All secondary selector dwords follow as one block after all primary
64 /* Free-form arrangement of selector registers. */
65 SI_PC_MULTI_CUSTOM
= 3,
69 /* Registers are laid out in decreasing rather than increasing order. */
70 SI_PC_REG_REVERSE
= 4,
75 struct si_pc_block_base
{
77 unsigned num_counters
;
90 struct si_pc_block_gfxdescr
{
91 struct si_pc_block_base
*b
;
97 const struct si_pc_block_gfxdescr
*b
;
98 unsigned num_instances
;
102 unsigned group_name_stride
;
104 char *selector_names
;
105 unsigned selector_name_stride
;
108 /* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
109 * performance counter group IDs.
111 static const char * const si_pc_shader_type_suffixes
[] = {
112 "", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS"
115 static const unsigned si_pc_shader_type_bits
[] = {
126 /* Max counters per HW block */
127 #define SI_QUERY_MAX_COUNTERS 16
129 #define SI_PC_SHADERS_WINDOWING (1u << 31)
131 struct si_query_group
{
132 struct si_query_group
*next
;
133 struct si_pc_block
*block
;
134 unsigned sub_gid
; /* only used during init */
135 unsigned result_base
; /* only used during init */
138 unsigned num_counters
;
139 unsigned selectors
[SI_QUERY_MAX_COUNTERS
];
142 struct si_query_counter
{
145 unsigned stride
; /* in uint64s */
150 struct si_query_buffer buffer
;
152 /* Size of the results in memory, in bytes. */
153 unsigned result_size
;
156 unsigned num_counters
;
157 struct si_query_counter
*counters
;
158 struct si_query_group
*groups
;
162 static struct si_pc_block_base cik_CB
= {
165 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
167 .select0
= R_037000_CB_PERFCOUNTER_FILTER
,
168 .counter0_lo
= R_035018_CB_PERFCOUNTER0_LO
,
171 .layout
= SI_PC_MULTI_ALTERNATE
,
174 static unsigned cik_CPC_select
[] = {
175 R_036024_CPC_PERFCOUNTER0_SELECT
,
176 R_036010_CPC_PERFCOUNTER0_SELECT1
,
177 R_03600C_CPC_PERFCOUNTER1_SELECT
,
179 static struct si_pc_block_base cik_CPC
= {
183 .select
= cik_CPC_select
,
184 .counter0_lo
= R_034018_CPC_PERFCOUNTER0_LO
,
186 .layout
= SI_PC_MULTI_CUSTOM
| SI_PC_REG_REVERSE
,
189 static struct si_pc_block_base cik_CPF
= {
193 .select0
= R_03601C_CPF_PERFCOUNTER0_SELECT
,
194 .counter0_lo
= R_034028_CPF_PERFCOUNTER0_LO
,
196 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
199 static struct si_pc_block_base cik_CPG
= {
203 .select0
= R_036008_CPG_PERFCOUNTER0_SELECT
,
204 .counter0_lo
= R_034008_CPG_PERFCOUNTER0_LO
,
206 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
209 static struct si_pc_block_base cik_DB
= {
212 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
214 .select0
= R_037100_DB_PERFCOUNTER0_SELECT
,
215 .counter0_lo
= R_035100_DB_PERFCOUNTER0_LO
,
216 .num_multi
= 3, // really only 2, but there's a gap between registers
217 .layout
= SI_PC_MULTI_ALTERNATE
,
220 static struct si_pc_block_base cik_GDS
= {
224 .select0
= R_036A00_GDS_PERFCOUNTER0_SELECT
,
225 .counter0_lo
= R_034A00_GDS_PERFCOUNTER0_LO
,
227 .layout
= SI_PC_MULTI_TAIL
,
230 static unsigned cik_GRBM_counters
[] = {
231 R_034100_GRBM_PERFCOUNTER0_LO
,
232 R_03410C_GRBM_PERFCOUNTER1_LO
,
234 static struct si_pc_block_base cik_GRBM
= {
238 .select0
= R_036100_GRBM_PERFCOUNTER0_SELECT
,
239 .counters
= cik_GRBM_counters
,
242 static struct si_pc_block_base cik_GRBMSE
= {
246 .select0
= R_036108_GRBM_SE0_PERFCOUNTER_SELECT
,
247 .counter0_lo
= R_034114_GRBM_SE0_PERFCOUNTER_LO
,
250 static struct si_pc_block_base cik_IA
= {
254 .select0
= R_036210_IA_PERFCOUNTER0_SELECT
,
255 .counter0_lo
= R_034220_IA_PERFCOUNTER0_LO
,
257 .layout
= SI_PC_MULTI_TAIL
,
260 static struct si_pc_block_base cik_PA_SC
= {
263 .flags
= SI_PC_BLOCK_SE
,
265 .select0
= R_036500_PA_SC_PERFCOUNTER0_SELECT
,
266 .counter0_lo
= R_034500_PA_SC_PERFCOUNTER0_LO
,
268 .layout
= SI_PC_MULTI_ALTERNATE
,
271 /* According to docs, PA_SU counters are only 48 bits wide. */
272 static struct si_pc_block_base cik_PA_SU
= {
275 .flags
= SI_PC_BLOCK_SE
,
277 .select0
= R_036400_PA_SU_PERFCOUNTER0_SELECT
,
278 .counter0_lo
= R_034400_PA_SU_PERFCOUNTER0_LO
,
280 .layout
= SI_PC_MULTI_ALTERNATE
,
283 static struct si_pc_block_base cik_SPI
= {
286 .flags
= SI_PC_BLOCK_SE
,
288 .select0
= R_036600_SPI_PERFCOUNTER0_SELECT
,
289 .counter0_lo
= R_034604_SPI_PERFCOUNTER0_LO
,
291 .layout
= SI_PC_MULTI_BLOCK
,
294 static struct si_pc_block_base cik_SQ
= {
297 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER
,
299 .select0
= R_036700_SQ_PERFCOUNTER0_SELECT
,
300 .select_or
= S_036700_SQC_BANK_MASK(15) |
301 S_036700_SQC_CLIENT_MASK(15) |
302 S_036700_SIMD_MASK(15),
303 .counter0_lo
= R_034700_SQ_PERFCOUNTER0_LO
,
306 static struct si_pc_block_base cik_SX
= {
309 .flags
= SI_PC_BLOCK_SE
,
311 .select0
= R_036900_SX_PERFCOUNTER0_SELECT
,
312 .counter0_lo
= R_034900_SX_PERFCOUNTER0_LO
,
314 .layout
= SI_PC_MULTI_TAIL
,
317 static struct si_pc_block_base cik_TA
= {
320 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
322 .select0
= R_036B00_TA_PERFCOUNTER0_SELECT
,
323 .counter0_lo
= R_034B00_TA_PERFCOUNTER0_LO
,
325 .layout
= SI_PC_MULTI_ALTERNATE
,
328 static struct si_pc_block_base cik_TD
= {
331 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
333 .select0
= R_036C00_TD_PERFCOUNTER0_SELECT
,
334 .counter0_lo
= R_034C00_TD_PERFCOUNTER0_LO
,
336 .layout
= SI_PC_MULTI_ALTERNATE
,
339 static struct si_pc_block_base cik_TCA
= {
342 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
344 .select0
= R_036E40_TCA_PERFCOUNTER0_SELECT
,
345 .counter0_lo
= R_034E40_TCA_PERFCOUNTER0_LO
,
347 .layout
= SI_PC_MULTI_ALTERNATE
,
350 static struct si_pc_block_base cik_TCC
= {
353 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
355 .select0
= R_036E00_TCC_PERFCOUNTER0_SELECT
,
356 .counter0_lo
= R_034E00_TCC_PERFCOUNTER0_LO
,
358 .layout
= SI_PC_MULTI_ALTERNATE
,
361 static struct si_pc_block_base cik_TCP
= {
364 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
366 .select0
= R_036D00_TCP_PERFCOUNTER0_SELECT
,
367 .counter0_lo
= R_034D00_TCP_PERFCOUNTER0_LO
,
369 .layout
= SI_PC_MULTI_ALTERNATE
,
372 static struct si_pc_block_base cik_VGT
= {
375 .flags
= SI_PC_BLOCK_SE
,
377 .select0
= R_036230_VGT_PERFCOUNTER0_SELECT
,
378 .counter0_lo
= R_034240_VGT_PERFCOUNTER0_LO
,
380 .layout
= SI_PC_MULTI_TAIL
,
383 static struct si_pc_block_base cik_WD
= {
387 .select0
= R_036200_WD_PERFCOUNTER0_SELECT
,
388 .counter0_lo
= R_034200_WD_PERFCOUNTER0_LO
,
391 static struct si_pc_block_base cik_MC
= {
395 .layout
= SI_PC_FAKE
,
398 static struct si_pc_block_base cik_SRBM
= {
402 .layout
= SI_PC_FAKE
,
405 /* Both the number of instances and selectors varies between chips of the same
406 * class. We only differentiate by class here and simply expose the maximum
407 * number over all chips in a class.
409 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
410 * blindly once it believes it has identified the hardware, so the order of
411 * blocks here matters.
413 static struct si_pc_block_gfxdescr groups_CIK
[] = {
424 { &cik_TA
, 111, 11 },
428 { &cik_TCP
, 154, 11 },
440 static struct si_pc_block_gfxdescr groups_VI
[] = {
451 { &cik_TA
, 119, 16 },
455 { &cik_TCP
, 180, 16 },
467 static struct si_pc_block_gfxdescr groups_gfx9
[] = {
478 { &cik_TA
, 119, 16 },
482 { &cik_TCP
, 85, 16 },
491 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters
*pc
,
492 const struct si_pc_block
*block
)
494 return block
->b
->b
->flags
& SI_PC_BLOCK_SE_GROUPS
||
495 (block
->b
->b
->flags
& SI_PC_BLOCK_SE
&& pc
->separate_se
);
498 static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters
*pc
,
499 const struct si_pc_block
*block
)
501 return block
->b
->b
->flags
& SI_PC_BLOCK_INSTANCE_GROUPS
||
502 (block
->num_instances
> 1 && pc
->separate_instance
);
505 static struct si_pc_block
*
506 lookup_counter(struct si_perfcounters
*pc
, unsigned index
,
507 unsigned *base_gid
, unsigned *sub_index
)
509 struct si_pc_block
*block
= pc
->blocks
;
513 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
514 unsigned total
= block
->num_groups
* block
->b
->selectors
;
522 *base_gid
+= block
->num_groups
;
528 static struct si_pc_block
*
529 lookup_group(struct si_perfcounters
*pc
, unsigned *index
)
532 struct si_pc_block
*block
= pc
->blocks
;
534 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
535 if (*index
< block
->num_groups
)
537 *index
-= block
->num_groups
;
543 static void si_pc_emit_instance(struct si_context
*sctx
,
544 int se
, int instance
)
546 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
547 unsigned value
= S_030800_SH_BROADCAST_WRITES(1);
550 value
|= S_030800_SE_INDEX(se
);
552 value
|= S_030800_SE_BROADCAST_WRITES(1);
556 value
|= S_030800_INSTANCE_INDEX(instance
);
558 value
|= S_030800_INSTANCE_BROADCAST_WRITES(1);
561 radeon_set_uconfig_reg(cs
, R_030800_GRBM_GFX_INDEX
, value
);
564 static void si_pc_emit_shaders(struct si_context
*sctx
,
567 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
569 radeon_set_uconfig_reg_seq(cs
, R_036780_SQ_PERFCOUNTER_CTRL
, 2);
570 radeon_emit(cs
, shaders
& 0x7f);
571 radeon_emit(cs
, 0xffffffff);
574 static void si_pc_emit_select(struct si_context
*sctx
,
575 struct si_pc_block
*block
,
576 unsigned count
, unsigned *selectors
)
578 struct si_pc_block_base
*regs
= block
->b
->b
;
579 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
581 unsigned layout_multi
= regs
->layout
& SI_PC_MULTI_MASK
;
584 assert(count
<= regs
->num_counters
);
586 if (regs
->layout
& SI_PC_FAKE
)
589 if (layout_multi
== SI_PC_MULTI_BLOCK
) {
590 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
592 dw
= count
+ regs
->num_prelude
;
593 if (count
>= regs
->num_multi
)
594 dw
+= regs
->num_multi
;
595 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, dw
);
596 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
598 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
599 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
601 if (count
< regs
->num_multi
) {
603 regs
->select0
+ 4 * regs
->num_multi
;
604 radeon_set_uconfig_reg_seq(cs
, select1
, count
);
607 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
610 if (count
> regs
->num_multi
) {
611 for (idx
= regs
->num_multi
; idx
< count
; ++idx
)
612 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
614 } else if (layout_multi
== SI_PC_MULTI_TAIL
) {
615 unsigned select1
, select1_count
;
617 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
619 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, count
+ regs
->num_prelude
);
620 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
622 for (idx
= 0; idx
< count
; ++idx
)
623 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
625 select1
= regs
->select0
+ 4 * regs
->num_counters
;
626 select1_count
= MIN2(count
, regs
->num_multi
);
627 radeon_set_uconfig_reg_seq(cs
, select1
, select1_count
);
628 for (idx
= 0; idx
< select1_count
; ++idx
)
630 } else if (layout_multi
== SI_PC_MULTI_CUSTOM
) {
631 unsigned *reg
= regs
->select
;
632 for (idx
= 0; idx
< count
; ++idx
) {
633 radeon_set_uconfig_reg(cs
, *reg
++, selectors
[idx
] | regs
->select_or
);
634 if (idx
< regs
->num_multi
)
635 radeon_set_uconfig_reg(cs
, *reg
++, 0);
638 assert(layout_multi
== SI_PC_MULTI_ALTERNATE
);
640 unsigned reg_base
= regs
->select0
;
641 unsigned reg_count
= count
+ MIN2(count
, regs
->num_multi
);
642 reg_count
+= regs
->num_prelude
;
644 if (!(regs
->layout
& SI_PC_REG_REVERSE
)) {
645 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
647 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
649 for (idx
= 0; idx
< count
; ++idx
) {
650 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
651 if (idx
< regs
->num_multi
)
655 reg_base
-= (reg_count
- 1) * 4;
656 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
658 for (idx
= count
; idx
> 0; --idx
) {
659 if (idx
<= regs
->num_multi
)
661 radeon_emit(cs
, selectors
[idx
- 1] | regs
->select_or
);
663 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
669 static void si_pc_emit_start(struct si_context
*sctx
,
670 struct si_resource
*buffer
, uint64_t va
)
672 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
674 si_cp_copy_data(sctx
, sctx
->gfx_cs
,
675 COPY_DATA_DST_MEM
, buffer
, va
- buffer
->gpu_address
,
676 COPY_DATA_IMM
, NULL
, 1);
678 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
679 S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET
));
680 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
681 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_START
) | EVENT_INDEX(0));
682 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
683 S_036020_PERFMON_STATE(V_036020_START_COUNTING
));
686 /* Note: The buffer was already added in si_pc_emit_start, so we don't have to
687 * do it again in here. */
688 static void si_pc_emit_stop(struct si_context
*sctx
,
689 struct si_resource
*buffer
, uint64_t va
)
691 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
693 si_cp_release_mem(sctx
, cs
, V_028A90_BOTTOM_OF_PIPE_TS
, 0,
694 EOP_DST_SEL_MEM
, EOP_INT_SEL_NONE
,
695 EOP_DATA_SEL_VALUE_32BIT
,
696 buffer
, va
, 0, SI_NOT_QUERY
);
697 si_cp_wait_mem(sctx
, cs
, va
, 0, 0xffffffff, WAIT_REG_MEM_EQUAL
);
699 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
700 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE
) | EVENT_INDEX(0));
701 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
702 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP
) | EVENT_INDEX(0));
703 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
704 S_036020_PERFMON_STATE(V_036020_STOP_COUNTING
) |
705 S_036020_PERFMON_SAMPLE_ENABLE(1));
708 static void si_pc_emit_read(struct si_context
*sctx
,
709 struct si_pc_block
*block
,
710 unsigned count
, uint64_t va
)
712 struct si_pc_block_base
*regs
= block
->b
->b
;
713 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
715 unsigned reg
= regs
->counter0_lo
;
716 unsigned reg_delta
= 8;
718 if (!(regs
->layout
& SI_PC_FAKE
)) {
719 if (regs
->layout
& SI_PC_REG_REVERSE
)
720 reg_delta
= -reg_delta
;
722 for (idx
= 0; idx
< count
; ++idx
) {
724 reg
= regs
->counters
[idx
];
726 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
727 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_PERF
) |
728 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM
) |
729 COPY_DATA_COUNT_SEL
); /* 64 bits */
730 radeon_emit(cs
, reg
>> 2);
731 radeon_emit(cs
, 0); /* unused */
733 radeon_emit(cs
, va
>> 32);
734 va
+= sizeof(uint64_t);
738 for (idx
= 0; idx
< count
; ++idx
) {
739 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
740 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_IMM
) |
741 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM
) |
742 COPY_DATA_COUNT_SEL
);
743 radeon_emit(cs
, 0); /* immediate */
746 radeon_emit(cs
, va
>> 32);
747 va
+= sizeof(uint64_t);
752 static void si_pc_query_destroy(struct si_context
*sctx
,
753 struct si_query
*squery
)
755 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
757 while (query
->groups
) {
758 struct si_query_group
*group
= query
->groups
;
759 query
->groups
= group
->next
;
763 FREE(query
->counters
);
765 si_query_buffer_destroy(sctx
->screen
, &query
->buffer
);
769 static void si_pc_query_resume(struct si_context
*sctx
, struct si_query
*squery
)
771 struct si_query_hw *hwquery,
772 struct si_resource *buffer, uint64_t va)*/
774 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
776 int current_instance
= -1;
778 if (!si_query_buffer_alloc(sctx
, &query
->buffer
, NULL
, query
->result_size
))
780 si_need_gfx_cs_space(sctx
);
783 si_pc_emit_shaders(sctx
, query
->shaders
);
785 for (struct si_query_group
*group
= query
->groups
; group
; group
= group
->next
) {
786 struct si_pc_block
*block
= group
->block
;
788 if (group
->se
!= current_se
|| group
->instance
!= current_instance
) {
789 current_se
= group
->se
;
790 current_instance
= group
->instance
;
791 si_pc_emit_instance(sctx
, group
->se
, group
->instance
);
794 si_pc_emit_select(sctx
, block
, group
->num_counters
, group
->selectors
);
797 if (current_se
!= -1 || current_instance
!= -1)
798 si_pc_emit_instance(sctx
, -1, -1);
800 uint64_t va
= query
->buffer
.buf
->gpu_address
+ query
->buffer
.results_end
;
801 si_pc_emit_start(sctx
, query
->buffer
.buf
, va
);
804 static void si_pc_query_suspend(struct si_context
*sctx
, struct si_query
*squery
)
806 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
808 if (!query
->buffer
.buf
)
811 uint64_t va
= query
->buffer
.buf
->gpu_address
+ query
->buffer
.results_end
;
812 query
->buffer
.results_end
+= query
->result_size
;
814 si_pc_emit_stop(sctx
, query
->buffer
.buf
, va
);
816 for (struct si_query_group
*group
= query
->groups
; group
; group
= group
->next
) {
817 struct si_pc_block
*block
= group
->block
;
818 unsigned se
= group
->se
>= 0 ? group
->se
: 0;
819 unsigned se_end
= se
+ 1;
821 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && (group
->se
< 0))
822 se_end
= sctx
->screen
->info
.max_se
;
825 unsigned instance
= group
->instance
>= 0 ? group
->instance
: 0;
828 si_pc_emit_instance(sctx
, se
, instance
);
829 si_pc_emit_read(sctx
, block
, group
->num_counters
, va
);
830 va
+= sizeof(uint64_t) * group
->num_counters
;
831 } while (group
->instance
< 0 && ++instance
< block
->num_instances
);
832 } while (++se
< se_end
);
835 si_pc_emit_instance(sctx
, -1, -1);
838 static bool si_pc_query_begin(struct si_context
*ctx
, struct si_query
*squery
)
840 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
842 si_query_buffer_reset(ctx
, &query
->buffer
);
844 list_addtail(&query
->b
.active_list
, &ctx
->active_queries
);
845 ctx
->num_cs_dw_queries_suspend
+= query
->b
.num_cs_dw_suspend
;
847 si_pc_query_resume(ctx
, squery
);
852 static bool si_pc_query_end(struct si_context
*ctx
, struct si_query
*squery
)
854 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
856 si_pc_query_suspend(ctx
, squery
);
858 list_del(&squery
->active_list
);
859 ctx
->num_cs_dw_queries_suspend
-= squery
->num_cs_dw_suspend
;
861 return query
->buffer
.buf
!= NULL
;
864 static void si_pc_query_add_result(struct si_query_pc
*query
,
866 union pipe_query_result
*result
)
868 uint64_t *results
= buffer
;
871 for (i
= 0; i
< query
->num_counters
; ++i
) {
872 struct si_query_counter
*counter
= &query
->counters
[i
];
874 for (j
= 0; j
< counter
->qwords
; ++j
) {
875 uint32_t value
= results
[counter
->base
+ j
* counter
->stride
];
876 result
->batch
[i
].u64
+= value
;
881 static bool si_pc_query_get_result(struct si_context
*sctx
, struct si_query
*squery
,
882 bool wait
, union pipe_query_result
*result
)
884 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
886 memset(result
, 0, sizeof(result
->batch
[0]) * query
->num_counters
);
888 for (struct si_query_buffer
*qbuf
= &query
->buffer
; qbuf
; qbuf
= qbuf
->previous
) {
889 unsigned usage
= PIPE_TRANSFER_READ
|
890 (wait
? 0 : PIPE_TRANSFER_DONTBLOCK
);
891 unsigned results_base
= 0;
894 if (squery
->b
.flushed
)
895 map
= sctx
->ws
->buffer_map(qbuf
->buf
->buf
, NULL
, usage
);
897 map
= si_buffer_map_sync_with_rings(sctx
, qbuf
->buf
, usage
);
902 while (results_base
!= qbuf
->results_end
) {
903 si_pc_query_add_result(query
, map
+ results_base
, result
);
904 results_base
+= query
->result_size
;
911 static const struct si_query_ops batch_query_ops
= {
912 .destroy
= si_pc_query_destroy
,
913 .begin
= si_pc_query_begin
,
914 .end
= si_pc_query_end
,
915 .get_result
= si_pc_query_get_result
,
917 .suspend
= si_pc_query_suspend
,
918 .resume
= si_pc_query_resume
,
921 static struct si_query_group
*get_group_state(struct si_screen
*screen
,
922 struct si_query_pc
*query
,
923 struct si_pc_block
*block
,
926 struct si_query_group
*group
= query
->groups
;
929 if (group
->block
== block
&& group
->sub_gid
== sub_gid
)
934 group
= CALLOC_STRUCT(si_query_group
);
938 group
->block
= block
;
939 group
->sub_gid
= sub_gid
;
941 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
942 unsigned sub_gids
= block
->num_instances
;
945 unsigned query_shaders
;
947 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
))
948 sub_gids
= sub_gids
* screen
->info
.max_se
;
949 shader_id
= sub_gid
/ sub_gids
;
950 sub_gid
= sub_gid
% sub_gids
;
952 shaders
= si_pc_shader_type_bits
[shader_id
];
954 query_shaders
= query
->shaders
& ~SI_PC_SHADERS_WINDOWING
;
955 if (query_shaders
&& query_shaders
!= shaders
) {
956 fprintf(stderr
, "si_perfcounter: incompatible shader groups\n");
960 query
->shaders
= shaders
;
963 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER_WINDOWED
&& !query
->shaders
) {
964 // A non-zero value in query->shaders ensures that the shader
965 // masking is reset unless the user explicitly requests one.
966 query
->shaders
= SI_PC_SHADERS_WINDOWING
;
969 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
)) {
970 group
->se
= sub_gid
/ block
->num_instances
;
971 sub_gid
= sub_gid
% block
->num_instances
;
976 if (si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
)) {
977 group
->instance
= sub_gid
;
979 group
->instance
= -1;
982 group
->next
= query
->groups
;
983 query
->groups
= group
;
988 struct pipe_query
*si_create_batch_query(struct pipe_context
*ctx
,
989 unsigned num_queries
,
990 unsigned *query_types
)
992 struct si_screen
*screen
=
993 (struct si_screen
*)ctx
->screen
;
994 struct si_perfcounters
*pc
= screen
->perfcounters
;
995 struct si_pc_block
*block
;
996 struct si_query_group
*group
;
997 struct si_query_pc
*query
;
998 unsigned base_gid
, sub_gid
, sub_index
;
1004 query
= CALLOC_STRUCT(si_query_pc
);
1008 query
->b
.ops
= &batch_query_ops
;
1010 query
->num_counters
= num_queries
;
1012 /* Collect selectors per group */
1013 for (i
= 0; i
< num_queries
; ++i
) {
1016 if (query_types
[i
] < SI_QUERY_FIRST_PERFCOUNTER
)
1019 block
= lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
,
1020 &base_gid
, &sub_index
);
1024 sub_gid
= sub_index
/ block
->b
->selectors
;
1025 sub_index
= sub_index
% block
->b
->selectors
;
1027 group
= get_group_state(screen
, query
, block
, sub_gid
);
1031 if (group
->num_counters
>= block
->b
->b
->num_counters
) {
1033 "perfcounter group %s: too many selected\n",
1037 group
->selectors
[group
->num_counters
] = sub_index
;
1038 ++group
->num_counters
;
1041 /* Compute result bases and CS size per group */
1042 query
->b
.num_cs_dw_suspend
= pc
->num_stop_cs_dwords
;
1043 query
->b
.num_cs_dw_suspend
+= pc
->num_instance_cs_dwords
;
1046 for (group
= query
->groups
; group
; group
= group
->next
) {
1047 struct si_pc_block
*block
= group
->block
;
1049 unsigned instances
= 1;
1051 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
1052 instances
= screen
->info
.max_se
;
1053 if (group
->instance
< 0)
1054 instances
*= block
->num_instances
;
1056 group
->result_base
= i
;
1057 query
->result_size
+= sizeof(uint64_t) * instances
* group
->num_counters
;
1058 i
+= instances
* group
->num_counters
;
1060 read_dw
= 6 * group
->num_counters
;
1061 query
->b
.num_cs_dw_suspend
+= instances
* read_dw
;
1062 query
->b
.num_cs_dw_suspend
+= instances
* pc
->num_instance_cs_dwords
;
1065 if (query
->shaders
) {
1066 if (query
->shaders
== SI_PC_SHADERS_WINDOWING
)
1067 query
->shaders
= 0xffffffff;
1070 /* Map user-supplied query array to result indices */
1071 query
->counters
= CALLOC(num_queries
, sizeof(*query
->counters
));
1072 for (i
= 0; i
< num_queries
; ++i
) {
1073 struct si_query_counter
*counter
= &query
->counters
[i
];
1074 struct si_pc_block
*block
;
1076 block
= lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
,
1077 &base_gid
, &sub_index
);
1079 sub_gid
= sub_index
/ block
->b
->selectors
;
1080 sub_index
= sub_index
% block
->b
->selectors
;
1082 group
= get_group_state(screen
, query
, block
, sub_gid
);
1083 assert(group
!= NULL
);
1085 for (j
= 0; j
< group
->num_counters
; ++j
) {
1086 if (group
->selectors
[j
] == sub_index
)
1090 counter
->base
= group
->result_base
+ j
;
1091 counter
->stride
= group
->num_counters
;
1093 counter
->qwords
= 1;
1094 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
1095 counter
->qwords
= screen
->info
.max_se
;
1096 if (group
->instance
< 0)
1097 counter
->qwords
*= block
->num_instances
;
1100 return (struct pipe_query
*)query
;
1103 si_pc_query_destroy((struct si_context
*)ctx
, &query
->b
);
1107 static bool si_init_block_names(struct si_screen
*screen
,
1108 struct si_pc_block
*block
)
1110 bool per_instance_groups
= si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
);
1111 bool per_se_groups
= si_pc_block_has_per_se_groups(screen
->perfcounters
, block
);
1113 unsigned groups_shader
= 1, groups_se
= 1, groups_instance
= 1;
1118 if (per_instance_groups
)
1119 groups_instance
= block
->num_instances
;
1121 groups_se
= screen
->info
.max_se
;
1122 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1123 groups_shader
= ARRAY_SIZE(si_pc_shader_type_bits
);
1125 namelen
= strlen(block
->b
->b
->name
);
1126 block
->group_name_stride
= namelen
+ 1;
1127 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1128 block
->group_name_stride
+= 3;
1129 if (per_se_groups
) {
1130 assert(groups_se
<= 10);
1131 block
->group_name_stride
+= 1;
1133 if (per_instance_groups
)
1134 block
->group_name_stride
+= 1;
1136 if (per_instance_groups
) {
1137 assert(groups_instance
<= 100);
1138 block
->group_name_stride
+= 2;
1141 block
->group_names
= MALLOC(block
->num_groups
* block
->group_name_stride
);
1142 if (!block
->group_names
)
1145 groupname
= block
->group_names
;
1146 for (i
= 0; i
< groups_shader
; ++i
) {
1147 const char *shader_suffix
= si_pc_shader_type_suffixes
[i
];
1148 unsigned shaderlen
= strlen(shader_suffix
);
1149 for (j
= 0; j
< groups_se
; ++j
) {
1150 for (k
= 0; k
< groups_instance
; ++k
) {
1151 strcpy(groupname
, block
->b
->b
->name
);
1152 p
= groupname
+ namelen
;
1154 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
1155 strcpy(p
, shader_suffix
);
1159 if (per_se_groups
) {
1160 p
+= sprintf(p
, "%d", j
);
1161 if (per_instance_groups
)
1165 if (per_instance_groups
)
1166 p
+= sprintf(p
, "%d", k
);
1168 groupname
+= block
->group_name_stride
;
1173 assert(block
->b
->selectors
<= 1000);
1174 block
->selector_name_stride
= block
->group_name_stride
+ 4;
1175 block
->selector_names
= MALLOC(block
->num_groups
* block
->b
->selectors
*
1176 block
->selector_name_stride
);
1177 if (!block
->selector_names
)
1180 groupname
= block
->group_names
;
1181 p
= block
->selector_names
;
1182 for (i
= 0; i
< block
->num_groups
; ++i
) {
1183 for (j
= 0; j
< block
->b
->selectors
; ++j
) {
1184 sprintf(p
, "%s_%03d", groupname
, j
);
1185 p
+= block
->selector_name_stride
;
1187 groupname
+= block
->group_name_stride
;
1193 int si_get_perfcounter_info(struct si_screen
*screen
,
1195 struct pipe_driver_query_info
*info
)
1197 struct si_perfcounters
*pc
= screen
->perfcounters
;
1198 struct si_pc_block
*block
;
1199 unsigned base_gid
, sub
;
1205 unsigned bid
, num_queries
= 0;
1207 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
) {
1208 num_queries
+= pc
->blocks
[bid
].b
->selectors
*
1209 pc
->blocks
[bid
].num_groups
;
1215 block
= lookup_counter(pc
, index
, &base_gid
, &sub
);
1219 if (!block
->selector_names
) {
1220 if (!si_init_block_names(screen
, block
))
1223 info
->name
= block
->selector_names
+ sub
* block
->selector_name_stride
;
1224 info
->query_type
= SI_QUERY_FIRST_PERFCOUNTER
+ index
;
1225 info
->max_value
.u64
= 0;
1226 info
->type
= PIPE_DRIVER_QUERY_TYPE_UINT64
;
1227 info
->result_type
= PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE
;
1228 info
->group_id
= base_gid
+ sub
/ block
->b
->selectors
;
1229 info
->flags
= PIPE_DRIVER_QUERY_FLAG_BATCH
;
1230 if (sub
> 0 && sub
+ 1 < block
->b
->selectors
* block
->num_groups
)
1231 info
->flags
|= PIPE_DRIVER_QUERY_FLAG_DONT_LIST
;
1235 int si_get_perfcounter_group_info(struct si_screen
*screen
,
1237 struct pipe_driver_query_group_info
*info
)
1239 struct si_perfcounters
*pc
= screen
->perfcounters
;
1240 struct si_pc_block
*block
;
1246 return pc
->num_groups
;
1248 block
= lookup_group(pc
, &index
);
1252 if (!block
->group_names
) {
1253 if (!si_init_block_names(screen
, block
))
1256 info
->name
= block
->group_names
+ index
* block
->group_name_stride
;
1257 info
->num_queries
= block
->b
->selectors
;
1258 info
->max_active_queries
= block
->b
->b
->num_counters
;
1262 void si_destroy_perfcounters(struct si_screen
*screen
)
1264 struct si_perfcounters
*pc
= screen
->perfcounters
;
1270 for (i
= 0; i
< pc
->num_blocks
; ++i
) {
1271 FREE(pc
->blocks
[i
].group_names
);
1272 FREE(pc
->blocks
[i
].selector_names
);
1276 screen
->perfcounters
= NULL
;
1279 void si_init_perfcounters(struct si_screen
*screen
)
1281 struct si_perfcounters
*pc
;
1282 const struct si_pc_block_gfxdescr
*blocks
;
1283 unsigned num_blocks
;
1286 switch (screen
->info
.chip_class
) {
1288 blocks
= groups_CIK
;
1289 num_blocks
= ARRAY_SIZE(groups_CIK
);
1293 num_blocks
= ARRAY_SIZE(groups_VI
);
1296 blocks
= groups_gfx9
;
1297 num_blocks
= ARRAY_SIZE(groups_gfx9
);
1301 return; /* not implemented */
1304 if (screen
->info
.max_sh_per_se
!= 1) {
1305 /* This should not happen on non-GFX6 chips. */
1306 fprintf(stderr
, "si_init_perfcounters: max_sh_per_se = %d not "
1307 "supported (inaccurate performance counters)\n",
1308 screen
->info
.max_sh_per_se
);
1311 screen
->perfcounters
= pc
= CALLOC_STRUCT(si_perfcounters
);
1315 pc
->num_stop_cs_dwords
= 14 + si_cp_write_fence_dwords(screen
);
1316 pc
->num_instance_cs_dwords
= 3;
1318 pc
->separate_se
= debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
1319 pc
->separate_instance
= debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
1321 pc
->blocks
= CALLOC(num_blocks
, sizeof(struct si_pc_block
));
1324 pc
->num_blocks
= num_blocks
;
1326 for (i
= 0; i
< num_blocks
; ++i
) {
1327 struct si_pc_block
*block
= &pc
->blocks
[i
];
1328 block
->b
= &blocks
[i
];
1329 block
->num_instances
= MAX2(1, block
->b
->instances
);
1331 if (!strcmp(block
->b
->b
->name
, "CB") ||
1332 !strcmp(block
->b
->b
->name
, "DB"))
1333 block
->num_instances
= screen
->info
.max_se
;
1334 else if (!strcmp(block
->b
->b
->name
, "TCC"))
1335 block
->num_instances
= screen
->info
.num_tcc_blocks
;
1336 else if (!strcmp(block
->b
->b
->name
, "IA"))
1337 block
->num_instances
= MAX2(1, screen
->info
.max_se
/ 2);
1339 if (si_pc_block_has_per_instance_groups(pc
, block
)) {
1340 block
->num_groups
= block
->num_instances
;
1342 block
->num_groups
= 1;
1345 if (si_pc_block_has_per_se_groups(pc
, block
))
1346 block
->num_groups
*= screen
->info
.max_se
;
1347 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1348 block
->num_groups
*= ARRAY_SIZE(si_pc_shader_type_bits
);
1350 pc
->num_groups
+= block
->num_groups
;
1356 si_destroy_perfcounters(screen
);