/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
25 #include "si_build_pm4.h"
27 #include "util/u_memory.h"
/* Properties of a hardware performance-counter block. */
enum si_pc_block_flags
{
	/* This block is part of the shader engine */
	SI_PC_BLOCK_SE = (1 << 0),

	/* Expose per-instance groups instead of summing all instances (within
	 * an SE). */
	SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),

	/* Expose per-SE groups instead of summing instances across SEs. */
	SI_PC_BLOCK_SE_GROUPS = (1 << 2),

	/* Shader block */
	SI_PC_BLOCK_SHADER = (1 << 3),

	/* Non-shader block with perfcounters windowed by shaders. */
	SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
};
/* Layout of a block's selector/counter registers.
 * The low two bits (SI_PC_MULTI_MASK) encode how secondary ("multi")
 * selector dwords are interleaved with the primary ones; the remaining bits
 * are independent flags. */
enum si_pc_reg_layout
{
	/* All secondary selector dwords follow as one block after the primary
	 * selector dwords for the counters that have secondary selectors.
	 */
	SI_PC_MULTI_BLOCK = 0,

	/* Each secondary selector dword follows immediately afters the
	 * corresponding primary.
	 */
	SI_PC_MULTI_ALTERNATE = 1,

	/* All secondary selector dwords follow as one block after all primary
	 * selector dwords.
	 */
	SI_PC_MULTI_TAIL = 2,

	/* Free-form arrangement of selector registers. */
	SI_PC_MULTI_CUSTOM = 3,

	SI_PC_MULTI_MASK = 3,

	/* Registers are laid out in decreasing rather than increasing order. */
	SI_PC_REG_REVERSE = 4,

	/* Block has no real registers; counters are read as immediates. */
	SI_PC_FAKE = 8,
};
75 struct si_pc_block_base
{
77 unsigned num_counters
;
90 struct si_pc_block_gfxdescr
{
91 struct si_pc_block_base
*b
;
97 const struct si_pc_block_gfxdescr
*b
;
98 unsigned num_instances
;
102 unsigned group_name_stride
;
104 char *selector_names
;
105 unsigned selector_name_stride
;
/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
 * performance counter group IDs.
 */
static const char * const si_pc_shader_type_suffixes[] = {
	"", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS"
};
115 static const unsigned si_pc_shader_type_bits
[] = {
126 /* Max counters per HW block */
127 #define SI_QUERY_MAX_COUNTERS 16
129 #define SI_PC_SHADERS_WINDOWING (1 << 31)
131 struct si_query_group
{
132 struct si_query_group
*next
;
133 struct si_pc_block
*block
;
134 unsigned sub_gid
; /* only used during init */
135 unsigned result_base
; /* only used during init */
138 unsigned num_counters
;
139 unsigned selectors
[SI_QUERY_MAX_COUNTERS
];
142 struct si_query_counter
{
145 unsigned stride
; /* in uint64s */
149 struct si_query_hw b
;
152 unsigned num_counters
;
153 struct si_query_counter
*counters
;
154 struct si_query_group
*groups
;
158 static struct si_pc_block_base cik_CB
= {
161 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
163 .select0
= R_037000_CB_PERFCOUNTER_FILTER
,
164 .counter0_lo
= R_035018_CB_PERFCOUNTER0_LO
,
167 .layout
= SI_PC_MULTI_ALTERNATE
,
170 static unsigned cik_CPC_select
[] = {
171 R_036024_CPC_PERFCOUNTER0_SELECT
,
172 R_036010_CPC_PERFCOUNTER0_SELECT1
,
173 R_03600C_CPC_PERFCOUNTER1_SELECT
,
175 static struct si_pc_block_base cik_CPC
= {
179 .select
= cik_CPC_select
,
180 .counter0_lo
= R_034018_CPC_PERFCOUNTER0_LO
,
182 .layout
= SI_PC_MULTI_CUSTOM
| SI_PC_REG_REVERSE
,
185 static struct si_pc_block_base cik_CPF
= {
189 .select0
= R_03601C_CPF_PERFCOUNTER0_SELECT
,
190 .counter0_lo
= R_034028_CPF_PERFCOUNTER0_LO
,
192 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
195 static struct si_pc_block_base cik_CPG
= {
199 .select0
= R_036008_CPG_PERFCOUNTER0_SELECT
,
200 .counter0_lo
= R_034008_CPG_PERFCOUNTER0_LO
,
202 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
205 static struct si_pc_block_base cik_DB
= {
208 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
210 .select0
= R_037100_DB_PERFCOUNTER0_SELECT
,
211 .counter0_lo
= R_035100_DB_PERFCOUNTER0_LO
,
212 .num_multi
= 3, // really only 2, but there's a gap between registers
213 .layout
= SI_PC_MULTI_ALTERNATE
,
216 static struct si_pc_block_base cik_GDS
= {
220 .select0
= R_036A00_GDS_PERFCOUNTER0_SELECT
,
221 .counter0_lo
= R_034A00_GDS_PERFCOUNTER0_LO
,
223 .layout
= SI_PC_MULTI_TAIL
,
226 static unsigned cik_GRBM_counters
[] = {
227 R_034100_GRBM_PERFCOUNTER0_LO
,
228 R_03410C_GRBM_PERFCOUNTER1_LO
,
230 static struct si_pc_block_base cik_GRBM
= {
234 .select0
= R_036100_GRBM_PERFCOUNTER0_SELECT
,
235 .counters
= cik_GRBM_counters
,
238 static struct si_pc_block_base cik_GRBMSE
= {
242 .select0
= R_036108_GRBM_SE0_PERFCOUNTER_SELECT
,
243 .counter0_lo
= R_034114_GRBM_SE0_PERFCOUNTER_LO
,
246 static struct si_pc_block_base cik_IA
= {
250 .select0
= R_036210_IA_PERFCOUNTER0_SELECT
,
251 .counter0_lo
= R_034220_IA_PERFCOUNTER0_LO
,
253 .layout
= SI_PC_MULTI_TAIL
,
256 static struct si_pc_block_base cik_PA_SC
= {
259 .flags
= SI_PC_BLOCK_SE
,
261 .select0
= R_036500_PA_SC_PERFCOUNTER0_SELECT
,
262 .counter0_lo
= R_034500_PA_SC_PERFCOUNTER0_LO
,
264 .layout
= SI_PC_MULTI_ALTERNATE
,
267 /* According to docs, PA_SU counters are only 48 bits wide. */
268 static struct si_pc_block_base cik_PA_SU
= {
271 .flags
= SI_PC_BLOCK_SE
,
273 .select0
= R_036400_PA_SU_PERFCOUNTER0_SELECT
,
274 .counter0_lo
= R_034400_PA_SU_PERFCOUNTER0_LO
,
276 .layout
= SI_PC_MULTI_ALTERNATE
,
279 static struct si_pc_block_base cik_SPI
= {
282 .flags
= SI_PC_BLOCK_SE
,
284 .select0
= R_036600_SPI_PERFCOUNTER0_SELECT
,
285 .counter0_lo
= R_034604_SPI_PERFCOUNTER0_LO
,
287 .layout
= SI_PC_MULTI_BLOCK
,
290 static struct si_pc_block_base cik_SQ
= {
293 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER
,
295 .select0
= R_036700_SQ_PERFCOUNTER0_SELECT
,
296 .select_or
= S_036700_SQC_BANK_MASK(15) |
297 S_036700_SQC_CLIENT_MASK(15) |
298 S_036700_SIMD_MASK(15),
299 .counter0_lo
= R_034700_SQ_PERFCOUNTER0_LO
,
302 static struct si_pc_block_base cik_SX
= {
305 .flags
= SI_PC_BLOCK_SE
,
307 .select0
= R_036900_SX_PERFCOUNTER0_SELECT
,
308 .counter0_lo
= R_034900_SX_PERFCOUNTER0_LO
,
310 .layout
= SI_PC_MULTI_TAIL
,
313 static struct si_pc_block_base cik_TA
= {
316 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
318 .select0
= R_036B00_TA_PERFCOUNTER0_SELECT
,
319 .counter0_lo
= R_034B00_TA_PERFCOUNTER0_LO
,
321 .layout
= SI_PC_MULTI_ALTERNATE
,
324 static struct si_pc_block_base cik_TD
= {
327 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
329 .select0
= R_036C00_TD_PERFCOUNTER0_SELECT
,
330 .counter0_lo
= R_034C00_TD_PERFCOUNTER0_LO
,
332 .layout
= SI_PC_MULTI_ALTERNATE
,
335 static struct si_pc_block_base cik_TCA
= {
338 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
340 .select0
= R_036E40_TCA_PERFCOUNTER0_SELECT
,
341 .counter0_lo
= R_034E40_TCA_PERFCOUNTER0_LO
,
343 .layout
= SI_PC_MULTI_ALTERNATE
,
346 static struct si_pc_block_base cik_TCC
= {
349 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
351 .select0
= R_036E00_TCC_PERFCOUNTER0_SELECT
,
352 .counter0_lo
= R_034E00_TCC_PERFCOUNTER0_LO
,
354 .layout
= SI_PC_MULTI_ALTERNATE
,
357 static struct si_pc_block_base cik_TCP
= {
360 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
362 .select0
= R_036D00_TCP_PERFCOUNTER0_SELECT
,
363 .counter0_lo
= R_034D00_TCP_PERFCOUNTER0_LO
,
365 .layout
= SI_PC_MULTI_ALTERNATE
,
368 static struct si_pc_block_base cik_VGT
= {
371 .flags
= SI_PC_BLOCK_SE
,
373 .select0
= R_036230_VGT_PERFCOUNTER0_SELECT
,
374 .counter0_lo
= R_034240_VGT_PERFCOUNTER0_LO
,
376 .layout
= SI_PC_MULTI_TAIL
,
379 static struct si_pc_block_base cik_WD
= {
383 .select0
= R_036200_WD_PERFCOUNTER0_SELECT
,
384 .counter0_lo
= R_034200_WD_PERFCOUNTER0_LO
,
387 static struct si_pc_block_base cik_MC
= {
391 .layout
= SI_PC_FAKE
,
394 static struct si_pc_block_base cik_SRBM
= {
398 .layout
= SI_PC_FAKE
,
401 /* Both the number of instances and selectors varies between chips of the same
402 * class. We only differentiate by class here and simply expose the maximum
403 * number over all chips in a class.
405 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
406 * blindly once it believes it has identified the hardware, so the order of
407 * blocks here matters.
409 static struct si_pc_block_gfxdescr groups_CIK
[] = {
420 { &cik_TA
, 111, 11 },
424 { &cik_TCP
, 154, 11 },
436 static struct si_pc_block_gfxdescr groups_VI
[] = {
447 { &cik_TA
, 119, 16 },
451 { &cik_TCP
, 180, 16 },
463 static struct si_pc_block_gfxdescr groups_gfx9
[] = {
474 { &cik_TA
, 119, 16 },
478 { &cik_TCP
, 85, 16 },
487 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters
*pc
,
488 const struct si_pc_block
*block
)
490 return block
->b
->b
->flags
& SI_PC_BLOCK_SE_GROUPS
||
491 (block
->b
->b
->flags
& SI_PC_BLOCK_SE
&& pc
->separate_se
);
494 static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters
*pc
,
495 const struct si_pc_block
*block
)
497 return block
->b
->b
->flags
& SI_PC_BLOCK_INSTANCE_GROUPS
||
498 (block
->num_instances
> 1 && pc
->separate_instance
);
501 static struct si_pc_block
*
502 lookup_counter(struct si_perfcounters
*pc
, unsigned index
,
503 unsigned *base_gid
, unsigned *sub_index
)
505 struct si_pc_block
*block
= pc
->blocks
;
509 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
510 unsigned total
= block
->num_groups
* block
->b
->selectors
;
518 *base_gid
+= block
->num_groups
;
524 static struct si_pc_block
*
525 lookup_group(struct si_perfcounters
*pc
, unsigned *index
)
528 struct si_pc_block
*block
= pc
->blocks
;
530 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
531 if (*index
< block
->num_groups
)
533 *index
-= block
->num_groups
;
539 static void si_pc_emit_instance(struct si_context
*sctx
,
540 int se
, int instance
)
542 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
543 unsigned value
= S_030800_SH_BROADCAST_WRITES(1);
546 value
|= S_030800_SE_INDEX(se
);
548 value
|= S_030800_SE_BROADCAST_WRITES(1);
552 value
|= S_030800_INSTANCE_INDEX(instance
);
554 value
|= S_030800_INSTANCE_BROADCAST_WRITES(1);
557 radeon_set_uconfig_reg(cs
, R_030800_GRBM_GFX_INDEX
, value
);
560 static void si_pc_emit_shaders(struct si_context
*sctx
,
563 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
565 radeon_set_uconfig_reg_seq(cs
, R_036780_SQ_PERFCOUNTER_CTRL
, 2);
566 radeon_emit(cs
, shaders
& 0x7f);
567 radeon_emit(cs
, 0xffffffff);
570 static void si_pc_emit_select(struct si_context
*sctx
,
571 struct si_pc_block
*block
,
572 unsigned count
, unsigned *selectors
)
574 struct si_pc_block_base
*regs
= block
->b
->b
;
575 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
577 unsigned layout_multi
= regs
->layout
& SI_PC_MULTI_MASK
;
580 assert(count
<= regs
->num_counters
);
582 if (regs
->layout
& SI_PC_FAKE
)
585 if (layout_multi
== SI_PC_MULTI_BLOCK
) {
586 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
588 dw
= count
+ regs
->num_prelude
;
589 if (count
>= regs
->num_multi
)
590 dw
+= regs
->num_multi
;
591 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, dw
);
592 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
594 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
595 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
597 if (count
< regs
->num_multi
) {
599 regs
->select0
+ 4 * regs
->num_multi
;
600 radeon_set_uconfig_reg_seq(cs
, select1
, count
);
603 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
606 if (count
> regs
->num_multi
) {
607 for (idx
= regs
->num_multi
; idx
< count
; ++idx
)
608 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
610 } else if (layout_multi
== SI_PC_MULTI_TAIL
) {
611 unsigned select1
, select1_count
;
613 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
615 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, count
+ regs
->num_prelude
);
616 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
618 for (idx
= 0; idx
< count
; ++idx
)
619 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
621 select1
= regs
->select0
+ 4 * regs
->num_counters
;
622 select1_count
= MIN2(count
, regs
->num_multi
);
623 radeon_set_uconfig_reg_seq(cs
, select1
, select1_count
);
624 for (idx
= 0; idx
< select1_count
; ++idx
)
626 } else if (layout_multi
== SI_PC_MULTI_CUSTOM
) {
627 unsigned *reg
= regs
->select
;
628 for (idx
= 0; idx
< count
; ++idx
) {
629 radeon_set_uconfig_reg(cs
, *reg
++, selectors
[idx
] | regs
->select_or
);
630 if (idx
< regs
->num_multi
)
631 radeon_set_uconfig_reg(cs
, *reg
++, 0);
634 assert(layout_multi
== SI_PC_MULTI_ALTERNATE
);
636 unsigned reg_base
= regs
->select0
;
637 unsigned reg_count
= count
+ MIN2(count
, regs
->num_multi
);
638 reg_count
+= regs
->num_prelude
;
640 if (!(regs
->layout
& SI_PC_REG_REVERSE
)) {
641 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
643 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
645 for (idx
= 0; idx
< count
; ++idx
) {
646 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
647 if (idx
< regs
->num_multi
)
651 reg_base
-= (reg_count
- 1) * 4;
652 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
654 for (idx
= count
; idx
> 0; --idx
) {
655 if (idx
<= regs
->num_multi
)
657 radeon_emit(cs
, selectors
[idx
- 1] | regs
->select_or
);
659 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
665 static void si_pc_emit_start(struct si_context
*sctx
,
666 struct r600_resource
*buffer
, uint64_t va
)
668 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
670 radeon_add_to_buffer_list(sctx
, sctx
->gfx_cs
, buffer
,
671 RADEON_USAGE_WRITE
, RADEON_PRIO_QUERY
);
673 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
674 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_IMM
) |
675 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM
));
676 radeon_emit(cs
, 1); /* immediate */
677 radeon_emit(cs
, 0); /* unused */
679 radeon_emit(cs
, va
>> 32);
681 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
682 S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET
));
683 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
684 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_START
) | EVENT_INDEX(0));
685 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
686 S_036020_PERFMON_STATE(V_036020_START_COUNTING
));
689 /* Note: The buffer was already added in si_pc_emit_start, so we don't have to
690 * do it again in here. */
691 static void si_pc_emit_stop(struct si_context
*sctx
,
692 struct r600_resource
*buffer
, uint64_t va
)
694 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
696 si_cp_release_mem(sctx
, V_028A90_BOTTOM_OF_PIPE_TS
, 0,
697 EOP_DST_SEL_MEM
, EOP_INT_SEL_NONE
,
698 EOP_DATA_SEL_VALUE_32BIT
,
699 buffer
, va
, 0, SI_NOT_QUERY
);
700 si_cp_wait_mem(sctx
, va
, 0, 0xffffffff, 0);
702 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
703 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE
) | EVENT_INDEX(0));
704 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
705 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP
) | EVENT_INDEX(0));
706 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
707 S_036020_PERFMON_STATE(V_036020_STOP_COUNTING
) |
708 S_036020_PERFMON_SAMPLE_ENABLE(1));
711 static void si_pc_emit_read(struct si_context
*sctx
,
712 struct si_pc_block
*block
,
713 unsigned count
, uint64_t va
)
715 struct si_pc_block_base
*regs
= block
->b
->b
;
716 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
718 unsigned reg
= regs
->counter0_lo
;
719 unsigned reg_delta
= 8;
721 if (!(regs
->layout
& SI_PC_FAKE
)) {
722 if (regs
->layout
& SI_PC_REG_REVERSE
)
723 reg_delta
= -reg_delta
;
725 for (idx
= 0; idx
< count
; ++idx
) {
727 reg
= regs
->counters
[idx
];
729 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
730 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_PERF
) |
731 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM
) |
732 COPY_DATA_COUNT_SEL
); /* 64 bits */
733 radeon_emit(cs
, reg
>> 2);
734 radeon_emit(cs
, 0); /* unused */
736 radeon_emit(cs
, va
>> 32);
737 va
+= sizeof(uint64_t);
741 for (idx
= 0; idx
< count
; ++idx
) {
742 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
743 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_IMM
) |
744 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM
) |
745 COPY_DATA_COUNT_SEL
);
746 radeon_emit(cs
, 0); /* immediate */
749 radeon_emit(cs
, va
>> 32);
750 va
+= sizeof(uint64_t);
755 static void si_pc_query_destroy(struct si_screen
*sscreen
,
756 struct si_query
*rquery
)
758 struct si_query_pc
*query
= (struct si_query_pc
*)rquery
;
760 while (query
->groups
) {
761 struct si_query_group
*group
= query
->groups
;
762 query
->groups
= group
->next
;
766 FREE(query
->counters
);
768 si_query_hw_destroy(sscreen
, rquery
);
771 static bool si_pc_query_prepare_buffer(struct si_screen
*screen
,
772 struct si_query_hw
*hwquery
,
773 struct r600_resource
*buffer
)
779 static void si_pc_query_emit_start(struct si_context
*sctx
,
780 struct si_query_hw
*hwquery
,
781 struct r600_resource
*buffer
, uint64_t va
)
783 struct si_query_pc
*query
= (struct si_query_pc
*)hwquery
;
784 struct si_query_group
*group
;
786 int current_instance
= -1;
789 si_pc_emit_shaders(sctx
, query
->shaders
);
791 for (group
= query
->groups
; group
; group
= group
->next
) {
792 struct si_pc_block
*block
= group
->block
;
794 if (group
->se
!= current_se
|| group
->instance
!= current_instance
) {
795 current_se
= group
->se
;
796 current_instance
= group
->instance
;
797 si_pc_emit_instance(sctx
, group
->se
, group
->instance
);
800 si_pc_emit_select(sctx
, block
, group
->num_counters
, group
->selectors
);
803 if (current_se
!= -1 || current_instance
!= -1)
804 si_pc_emit_instance(sctx
, -1, -1);
806 si_pc_emit_start(sctx
, buffer
, va
);
809 static void si_pc_query_emit_stop(struct si_context
*sctx
,
810 struct si_query_hw
*hwquery
,
811 struct r600_resource
*buffer
, uint64_t va
)
813 struct si_query_pc
*query
= (struct si_query_pc
*)hwquery
;
814 struct si_query_group
*group
;
816 si_pc_emit_stop(sctx
, buffer
, va
);
818 for (group
= query
->groups
; group
; group
= group
->next
) {
819 struct si_pc_block
*block
= group
->block
;
820 unsigned se
= group
->se
>= 0 ? group
->se
: 0;
821 unsigned se_end
= se
+ 1;
823 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && (group
->se
< 0))
824 se_end
= sctx
->screen
->info
.max_se
;
827 unsigned instance
= group
->instance
>= 0 ? group
->instance
: 0;
830 si_pc_emit_instance(sctx
, se
, instance
);
831 si_pc_emit_read(sctx
, block
, group
->num_counters
, va
);
832 va
+= sizeof(uint64_t) * group
->num_counters
;
833 } while (group
->instance
< 0 && ++instance
< block
->num_instances
);
834 } while (++se
< se_end
);
837 si_pc_emit_instance(sctx
, -1, -1);
840 static void si_pc_query_clear_result(struct si_query_hw
*hwquery
,
841 union pipe_query_result
*result
)
843 struct si_query_pc
*query
= (struct si_query_pc
*)hwquery
;
845 memset(result
, 0, sizeof(result
->batch
[0]) * query
->num_counters
);
848 static void si_pc_query_add_result(struct si_screen
*screen
,
849 struct si_query_hw
*hwquery
,
851 union pipe_query_result
*result
)
853 struct si_query_pc
*query
= (struct si_query_pc
*)hwquery
;
854 uint64_t *results
= buffer
;
857 for (i
= 0; i
< query
->num_counters
; ++i
) {
858 struct si_query_counter
*counter
= &query
->counters
[i
];
860 for (j
= 0; j
< counter
->qwords
; ++j
) {
861 uint32_t value
= results
[counter
->base
+ j
* counter
->stride
];
862 result
->batch
[i
].u64
+= value
;
867 static struct si_query_ops batch_query_ops
= {
868 .destroy
= si_pc_query_destroy
,
869 .begin
= si_query_hw_begin
,
870 .end
= si_query_hw_end
,
871 .get_result
= si_query_hw_get_result
874 static struct si_query_hw_ops batch_query_hw_ops
= {
875 .prepare_buffer
= si_pc_query_prepare_buffer
,
876 .emit_start
= si_pc_query_emit_start
,
877 .emit_stop
= si_pc_query_emit_stop
,
878 .clear_result
= si_pc_query_clear_result
,
879 .add_result
= si_pc_query_add_result
,
882 static struct si_query_group
*get_group_state(struct si_screen
*screen
,
883 struct si_query_pc
*query
,
884 struct si_pc_block
*block
,
887 struct si_query_group
*group
= query
->groups
;
890 if (group
->block
== block
&& group
->sub_gid
== sub_gid
)
895 group
= CALLOC_STRUCT(si_query_group
);
899 group
->block
= block
;
900 group
->sub_gid
= sub_gid
;
902 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
903 unsigned sub_gids
= block
->num_instances
;
906 unsigned query_shaders
;
908 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
))
909 sub_gids
= sub_gids
* screen
->info
.max_se
;
910 shader_id
= sub_gid
/ sub_gids
;
911 sub_gid
= sub_gid
% sub_gids
;
913 shaders
= si_pc_shader_type_bits
[shader_id
];
915 query_shaders
= query
->shaders
& ~SI_PC_SHADERS_WINDOWING
;
916 if (query_shaders
&& query_shaders
!= shaders
) {
917 fprintf(stderr
, "si_perfcounter: incompatible shader groups\n");
921 query
->shaders
= shaders
;
924 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER_WINDOWED
&& !query
->shaders
) {
925 // A non-zero value in query->shaders ensures that the shader
926 // masking is reset unless the user explicitly requests one.
927 query
->shaders
= SI_PC_SHADERS_WINDOWING
;
930 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
)) {
931 group
->se
= sub_gid
/ block
->num_instances
;
932 sub_gid
= sub_gid
% block
->num_instances
;
937 if (si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
)) {
938 group
->instance
= sub_gid
;
940 group
->instance
= -1;
943 group
->next
= query
->groups
;
944 query
->groups
= group
;
949 struct pipe_query
*si_create_batch_query(struct pipe_context
*ctx
,
950 unsigned num_queries
,
951 unsigned *query_types
)
953 struct si_screen
*screen
=
954 (struct si_screen
*)ctx
->screen
;
955 struct si_perfcounters
*pc
= screen
->perfcounters
;
956 struct si_pc_block
*block
;
957 struct si_query_group
*group
;
958 struct si_query_pc
*query
;
959 unsigned base_gid
, sub_gid
, sub_index
;
965 query
= CALLOC_STRUCT(si_query_pc
);
969 query
->b
.b
.ops
= &batch_query_ops
;
970 query
->b
.ops
= &batch_query_hw_ops
;
972 query
->num_counters
= num_queries
;
974 /* Collect selectors per group */
975 for (i
= 0; i
< num_queries
; ++i
) {
978 if (query_types
[i
] < SI_QUERY_FIRST_PERFCOUNTER
)
981 block
= lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
,
982 &base_gid
, &sub_index
);
986 sub_gid
= sub_index
/ block
->b
->selectors
;
987 sub_index
= sub_index
% block
->b
->selectors
;
989 group
= get_group_state(screen
, query
, block
, sub_gid
);
993 if (group
->num_counters
>= block
->b
->b
->num_counters
) {
995 "perfcounter group %s: too many selected\n",
999 group
->selectors
[group
->num_counters
] = sub_index
;
1000 ++group
->num_counters
;
1003 /* Compute result bases and CS size per group */
1004 query
->b
.num_cs_dw_end
= pc
->num_stop_cs_dwords
;
1005 query
->b
.num_cs_dw_end
+= pc
->num_instance_cs_dwords
;
1008 for (group
= query
->groups
; group
; group
= group
->next
) {
1009 struct si_pc_block
*block
= group
->block
;
1011 unsigned instances
= 1;
1013 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
1014 instances
= screen
->info
.max_se
;
1015 if (group
->instance
< 0)
1016 instances
*= block
->num_instances
;
1018 group
->result_base
= i
;
1019 query
->b
.result_size
+= sizeof(uint64_t) * instances
* group
->num_counters
;
1020 i
+= instances
* group
->num_counters
;
1022 read_dw
= 6 * group
->num_counters
;
1023 query
->b
.num_cs_dw_end
+= instances
* read_dw
;
1024 query
->b
.num_cs_dw_end
+= instances
* pc
->num_instance_cs_dwords
;
1027 if (query
->shaders
) {
1028 if (query
->shaders
== SI_PC_SHADERS_WINDOWING
)
1029 query
->shaders
= 0xffffffff;
1032 /* Map user-supplied query array to result indices */
1033 query
->counters
= CALLOC(num_queries
, sizeof(*query
->counters
));
1034 for (i
= 0; i
< num_queries
; ++i
) {
1035 struct si_query_counter
*counter
= &query
->counters
[i
];
1036 struct si_pc_block
*block
;
1038 block
= lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
,
1039 &base_gid
, &sub_index
);
1041 sub_gid
= sub_index
/ block
->b
->selectors
;
1042 sub_index
= sub_index
% block
->b
->selectors
;
1044 group
= get_group_state(screen
, query
, block
, sub_gid
);
1045 assert(group
!= NULL
);
1047 for (j
= 0; j
< group
->num_counters
; ++j
) {
1048 if (group
->selectors
[j
] == sub_index
)
1052 counter
->base
= group
->result_base
+ j
;
1053 counter
->stride
= group
->num_counters
;
1055 counter
->qwords
= 1;
1056 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
1057 counter
->qwords
= screen
->info
.max_se
;
1058 if (group
->instance
< 0)
1059 counter
->qwords
*= block
->num_instances
;
1062 if (!si_query_hw_init(screen
, &query
->b
))
1065 return (struct pipe_query
*)query
;
1068 si_pc_query_destroy(screen
, &query
->b
.b
);
1072 static bool si_init_block_names(struct si_screen
*screen
,
1073 struct si_pc_block
*block
)
1075 bool per_instance_groups
= si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
);
1076 bool per_se_groups
= si_pc_block_has_per_se_groups(screen
->perfcounters
, block
);
1078 unsigned groups_shader
= 1, groups_se
= 1, groups_instance
= 1;
1083 if (per_instance_groups
)
1084 groups_instance
= block
->num_instances
;
1086 groups_se
= screen
->info
.max_se
;
1087 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1088 groups_shader
= ARRAY_SIZE(si_pc_shader_type_bits
);
1090 namelen
= strlen(block
->b
->b
->name
);
1091 block
->group_name_stride
= namelen
+ 1;
1092 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1093 block
->group_name_stride
+= 3;
1094 if (per_se_groups
) {
1095 assert(groups_se
<= 10);
1096 block
->group_name_stride
+= 1;
1098 if (per_instance_groups
)
1099 block
->group_name_stride
+= 1;
1101 if (per_instance_groups
) {
1102 assert(groups_instance
<= 100);
1103 block
->group_name_stride
+= 2;
1106 block
->group_names
= MALLOC(block
->num_groups
* block
->group_name_stride
);
1107 if (!block
->group_names
)
1110 groupname
= block
->group_names
;
1111 for (i
= 0; i
< groups_shader
; ++i
) {
1112 const char *shader_suffix
= si_pc_shader_type_suffixes
[i
];
1113 unsigned shaderlen
= strlen(shader_suffix
);
1114 for (j
= 0; j
< groups_se
; ++j
) {
1115 for (k
= 0; k
< groups_instance
; ++k
) {
1116 strcpy(groupname
, block
->b
->b
->name
);
1117 p
= groupname
+ namelen
;
1119 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
1120 strcpy(p
, shader_suffix
);
1124 if (per_se_groups
) {
1125 p
+= sprintf(p
, "%d", j
);
1126 if (per_instance_groups
)
1130 if (per_instance_groups
)
1131 p
+= sprintf(p
, "%d", k
);
1133 groupname
+= block
->group_name_stride
;
1138 assert(block
->b
->selectors
<= 1000);
1139 block
->selector_name_stride
= block
->group_name_stride
+ 4;
1140 block
->selector_names
= MALLOC(block
->num_groups
* block
->b
->selectors
*
1141 block
->selector_name_stride
);
1142 if (!block
->selector_names
)
1145 groupname
= block
->group_names
;
1146 p
= block
->selector_names
;
1147 for (i
= 0; i
< block
->num_groups
; ++i
) {
1148 for (j
= 0; j
< block
->b
->selectors
; ++j
) {
1149 sprintf(p
, "%s_%03d", groupname
, j
);
1150 p
+= block
->selector_name_stride
;
1152 groupname
+= block
->group_name_stride
;
1158 int si_get_perfcounter_info(struct si_screen
*screen
,
1160 struct pipe_driver_query_info
*info
)
1162 struct si_perfcounters
*pc
= screen
->perfcounters
;
1163 struct si_pc_block
*block
;
1164 unsigned base_gid
, sub
;
1170 unsigned bid
, num_queries
= 0;
1172 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
) {
1173 num_queries
+= pc
->blocks
[bid
].b
->selectors
*
1174 pc
->blocks
[bid
].num_groups
;
1180 block
= lookup_counter(pc
, index
, &base_gid
, &sub
);
1184 if (!block
->selector_names
) {
1185 if (!si_init_block_names(screen
, block
))
1188 info
->name
= block
->selector_names
+ sub
* block
->selector_name_stride
;
1189 info
->query_type
= SI_QUERY_FIRST_PERFCOUNTER
+ index
;
1190 info
->max_value
.u64
= 0;
1191 info
->type
= PIPE_DRIVER_QUERY_TYPE_UINT64
;
1192 info
->result_type
= PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE
;
1193 info
->group_id
= base_gid
+ sub
/ block
->b
->selectors
;
1194 info
->flags
= PIPE_DRIVER_QUERY_FLAG_BATCH
;
1195 if (sub
> 0 && sub
+ 1 < block
->b
->selectors
* block
->num_groups
)
1196 info
->flags
|= PIPE_DRIVER_QUERY_FLAG_DONT_LIST
;
1200 int si_get_perfcounter_group_info(struct si_screen
*screen
,
1202 struct pipe_driver_query_group_info
*info
)
1204 struct si_perfcounters
*pc
= screen
->perfcounters
;
1205 struct si_pc_block
*block
;
1211 return pc
->num_groups
;
1213 block
= lookup_group(pc
, &index
);
1217 if (!block
->group_names
) {
1218 if (!si_init_block_names(screen
, block
))
1221 info
->name
= block
->group_names
+ index
* block
->group_name_stride
;
1222 info
->num_queries
= block
->b
->selectors
;
1223 info
->max_active_queries
= block
->b
->b
->num_counters
;
1227 void si_destroy_perfcounters(struct si_screen
*screen
)
1229 struct si_perfcounters
*pc
= screen
->perfcounters
;
1235 for (i
= 0; i
< pc
->num_blocks
; ++i
) {
1236 FREE(pc
->blocks
[i
].group_names
);
1237 FREE(pc
->blocks
[i
].selector_names
);
1241 screen
->perfcounters
= NULL
;
1244 void si_init_perfcounters(struct si_screen
*screen
)
1246 struct si_perfcounters
*pc
;
1247 const struct si_pc_block_gfxdescr
*blocks
;
1248 unsigned num_blocks
;
1251 switch (screen
->info
.chip_class
) {
1253 blocks
= groups_CIK
;
1254 num_blocks
= ARRAY_SIZE(groups_CIK
);
1258 num_blocks
= ARRAY_SIZE(groups_VI
);
1261 blocks
= groups_gfx9
;
1262 num_blocks
= ARRAY_SIZE(groups_gfx9
);
1266 return; /* not implemented */
1269 if (screen
->info
.max_sh_per_se
!= 1) {
1270 /* This should not happen on non-SI chips. */
1271 fprintf(stderr
, "si_init_perfcounters: max_sh_per_se = %d not "
1272 "supported (inaccurate performance counters)\n",
1273 screen
->info
.max_sh_per_se
);
1276 screen
->perfcounters
= pc
= CALLOC_STRUCT(si_perfcounters
);
1280 pc
->num_stop_cs_dwords
= 14 + si_cp_write_fence_dwords(screen
);
1281 pc
->num_instance_cs_dwords
= 3;
1283 pc
->separate_se
= debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
1284 pc
->separate_instance
= debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
1286 pc
->blocks
= CALLOC(num_blocks
, sizeof(struct si_pc_block
));
1289 pc
->num_blocks
= num_blocks
;
1291 for (i
= 0; i
< num_blocks
; ++i
) {
1292 struct si_pc_block
*block
= &pc
->blocks
[i
];
1293 block
->b
= &blocks
[i
];
1294 block
->num_instances
= block
->b
->instances
;
1296 if (!strcmp(block
->b
->b
->name
, "CB") ||
1297 !strcmp(block
->b
->b
->name
, "DB"))
1298 block
->num_instances
= screen
->info
.max_se
;
1299 else if (!strcmp(block
->b
->b
->name
, "TCC"))
1300 block
->num_instances
= screen
->info
.num_tcc_blocks
;
1301 else if (!strcmp(block
->b
->b
->name
, "IA"))
1302 block
->num_instances
= MAX2(1, screen
->info
.max_se
/ 2);
1304 if (si_pc_block_has_per_instance_groups(pc
, block
)) {
1305 block
->num_groups
= block
->num_instances
;
1307 block
->num_groups
= 1;
1310 if (si_pc_block_has_per_se_groups(pc
, block
))
1311 block
->num_groups
*= screen
->info
.max_se
;
1312 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1313 block
->num_groups
*= ARRAY_SIZE(si_pc_shader_type_bits
);
1315 pc
->num_groups
+= block
->num_groups
;
1321 si_destroy_perfcounters(screen
);