2 * Copyright 2015 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 #include "si_build_pm4.h"
27 #include "util/u_memory.h"
30 enum si_pc_block_flags
{
31 /* This block is part of the shader engine */
32 SI_PC_BLOCK_SE
= (1 << 0),
34 /* Expose per-instance groups instead of summing all instances (within
36 SI_PC_BLOCK_INSTANCE_GROUPS
= (1 << 1),
38 /* Expose per-SE groups instead of summing instances across SEs. */
39 SI_PC_BLOCK_SE_GROUPS
= (1 << 2),
42 SI_PC_BLOCK_SHADER
= (1 << 3),
44 /* Non-shader block with perfcounters windowed by shaders. */
45 SI_PC_BLOCK_SHADER_WINDOWED
= (1 << 4),
48 enum si_pc_reg_layout
{
49 /* All secondary selector dwords follow as one block after the primary
50 * selector dwords for the counters that have secondary selectors.
52 SI_PC_MULTI_BLOCK
= 0,
54 /* Each secondary selector dword follows immediately after the
55 * corresponding primary.
57 SI_PC_MULTI_ALTERNATE
= 1,
59 /* All secondary selector dwords follow as one block after all primary
64 /* Free-form arrangement of selector registers. */
65 SI_PC_MULTI_CUSTOM
= 3,
69 /* Registers are laid out in decreasing rather than increasing order. */
70 SI_PC_REG_REVERSE
= 4,
75 struct si_pc_block_base
{
77 unsigned num_counters
;
90 struct si_pc_block_gfxdescr
{
91 struct si_pc_block_base
*b
;
97 const struct si_pc_block_gfxdescr
*b
;
98 unsigned num_instances
;
102 unsigned group_name_stride
;
104 char *selector_names
;
105 unsigned selector_name_stride
;
108 /* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
109 * performance counter group IDs.
111 static const char * const si_pc_shader_type_suffixes
[] = {
112 "", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS"
115 static const unsigned si_pc_shader_type_bits
[] = {
/* Max counters per HW block */
#define SI_QUERY_MAX_COUNTERS 16

/* Flag bit kept in a query's "shaders" word next to the shader-type bits;
 * it is masked off before the shader-type bits are compared.
 *
 * The constant is unsigned on purpose: shifting a signed 1 into bit 31 is
 * undefined behavior in C.
 */
#define SI_PC_SHADERS_WINDOWING (1u << 31)
131 struct si_query_group
{
132 struct si_query_group
*next
;
133 struct si_pc_block
*block
;
134 unsigned sub_gid
; /* only used during init */
135 unsigned result_base
; /* only used during init */
138 unsigned num_counters
;
139 unsigned selectors
[SI_QUERY_MAX_COUNTERS
];
142 struct si_query_counter
{
145 unsigned stride
; /* in uint64s */
150 struct si_query_buffer buffer
;
152 /* Size of the results in memory, in bytes. */
153 unsigned result_size
;
156 unsigned num_counters
;
157 struct si_query_counter
*counters
;
158 struct si_query_group
*groups
;
162 static struct si_pc_block_base cik_CB
= {
165 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
167 .select0
= R_037000_CB_PERFCOUNTER_FILTER
,
168 .counter0_lo
= R_035018_CB_PERFCOUNTER0_LO
,
171 .layout
= SI_PC_MULTI_ALTERNATE
,
174 static unsigned cik_CPC_select
[] = {
175 R_036024_CPC_PERFCOUNTER0_SELECT
,
176 R_036010_CPC_PERFCOUNTER0_SELECT1
,
177 R_03600C_CPC_PERFCOUNTER1_SELECT
,
179 static struct si_pc_block_base cik_CPC
= {
183 .select
= cik_CPC_select
,
184 .counter0_lo
= R_034018_CPC_PERFCOUNTER0_LO
,
186 .layout
= SI_PC_MULTI_CUSTOM
| SI_PC_REG_REVERSE
,
189 static struct si_pc_block_base cik_CPF
= {
193 .select0
= R_03601C_CPF_PERFCOUNTER0_SELECT
,
194 .counter0_lo
= R_034028_CPF_PERFCOUNTER0_LO
,
196 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
199 static struct si_pc_block_base cik_CPG
= {
203 .select0
= R_036008_CPG_PERFCOUNTER0_SELECT
,
204 .counter0_lo
= R_034008_CPG_PERFCOUNTER0_LO
,
206 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
209 static struct si_pc_block_base cik_DB
= {
212 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
214 .select0
= R_037100_DB_PERFCOUNTER0_SELECT
,
215 .counter0_lo
= R_035100_DB_PERFCOUNTER0_LO
,
216 .num_multi
= 3, // really only 2, but there's a gap between registers
217 .layout
= SI_PC_MULTI_ALTERNATE
,
220 static struct si_pc_block_base cik_GDS
= {
224 .select0
= R_036A00_GDS_PERFCOUNTER0_SELECT
,
225 .counter0_lo
= R_034A00_GDS_PERFCOUNTER0_LO
,
227 .layout
= SI_PC_MULTI_TAIL
,
230 static unsigned cik_GRBM_counters
[] = {
231 R_034100_GRBM_PERFCOUNTER0_LO
,
232 R_03410C_GRBM_PERFCOUNTER1_LO
,
234 static struct si_pc_block_base cik_GRBM
= {
238 .select0
= R_036100_GRBM_PERFCOUNTER0_SELECT
,
239 .counters
= cik_GRBM_counters
,
242 static struct si_pc_block_base cik_GRBMSE
= {
246 .select0
= R_036108_GRBM_SE0_PERFCOUNTER_SELECT
,
247 .counter0_lo
= R_034114_GRBM_SE0_PERFCOUNTER_LO
,
250 static struct si_pc_block_base cik_IA
= {
254 .select0
= R_036210_IA_PERFCOUNTER0_SELECT
,
255 .counter0_lo
= R_034220_IA_PERFCOUNTER0_LO
,
257 .layout
= SI_PC_MULTI_TAIL
,
260 static struct si_pc_block_base cik_PA_SC
= {
263 .flags
= SI_PC_BLOCK_SE
,
265 .select0
= R_036500_PA_SC_PERFCOUNTER0_SELECT
,
266 .counter0_lo
= R_034500_PA_SC_PERFCOUNTER0_LO
,
268 .layout
= SI_PC_MULTI_ALTERNATE
,
271 /* According to docs, PA_SU counters are only 48 bits wide. */
272 static struct si_pc_block_base cik_PA_SU
= {
275 .flags
= SI_PC_BLOCK_SE
,
277 .select0
= R_036400_PA_SU_PERFCOUNTER0_SELECT
,
278 .counter0_lo
= R_034400_PA_SU_PERFCOUNTER0_LO
,
280 .layout
= SI_PC_MULTI_ALTERNATE
,
283 static struct si_pc_block_base cik_SPI
= {
286 .flags
= SI_PC_BLOCK_SE
,
288 .select0
= R_036600_SPI_PERFCOUNTER0_SELECT
,
289 .counter0_lo
= R_034604_SPI_PERFCOUNTER0_LO
,
291 .layout
= SI_PC_MULTI_BLOCK
,
294 static struct si_pc_block_base cik_SQ
= {
297 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER
,
299 .select0
= R_036700_SQ_PERFCOUNTER0_SELECT
,
300 .select_or
= S_036700_SQC_BANK_MASK(15) |
301 S_036700_SQC_CLIENT_MASK(15) |
302 S_036700_SIMD_MASK(15),
303 .counter0_lo
= R_034700_SQ_PERFCOUNTER0_LO
,
306 static struct si_pc_block_base cik_SX
= {
309 .flags
= SI_PC_BLOCK_SE
,
311 .select0
= R_036900_SX_PERFCOUNTER0_SELECT
,
312 .counter0_lo
= R_034900_SX_PERFCOUNTER0_LO
,
314 .layout
= SI_PC_MULTI_TAIL
,
317 static struct si_pc_block_base cik_TA
= {
320 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
322 .select0
= R_036B00_TA_PERFCOUNTER0_SELECT
,
323 .counter0_lo
= R_034B00_TA_PERFCOUNTER0_LO
,
325 .layout
= SI_PC_MULTI_ALTERNATE
,
328 static struct si_pc_block_base cik_TD
= {
331 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
333 .select0
= R_036C00_TD_PERFCOUNTER0_SELECT
,
334 .counter0_lo
= R_034C00_TD_PERFCOUNTER0_LO
,
336 .layout
= SI_PC_MULTI_ALTERNATE
,
339 static struct si_pc_block_base cik_TCA
= {
342 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
344 .select0
= R_036E40_TCA_PERFCOUNTER0_SELECT
,
345 .counter0_lo
= R_034E40_TCA_PERFCOUNTER0_LO
,
347 .layout
= SI_PC_MULTI_ALTERNATE
,
350 static struct si_pc_block_base cik_TCC
= {
353 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
355 .select0
= R_036E00_TCC_PERFCOUNTER0_SELECT
,
356 .counter0_lo
= R_034E00_TCC_PERFCOUNTER0_LO
,
358 .layout
= SI_PC_MULTI_ALTERNATE
,
361 static struct si_pc_block_base cik_TCP
= {
364 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
366 .select0
= R_036D00_TCP_PERFCOUNTER0_SELECT
,
367 .counter0_lo
= R_034D00_TCP_PERFCOUNTER0_LO
,
369 .layout
= SI_PC_MULTI_ALTERNATE
,
372 static struct si_pc_block_base cik_VGT
= {
375 .flags
= SI_PC_BLOCK_SE
,
377 .select0
= R_036230_VGT_PERFCOUNTER0_SELECT
,
378 .counter0_lo
= R_034240_VGT_PERFCOUNTER0_LO
,
380 .layout
= SI_PC_MULTI_TAIL
,
383 static struct si_pc_block_base cik_WD
= {
387 .select0
= R_036200_WD_PERFCOUNTER0_SELECT
,
388 .counter0_lo
= R_034200_WD_PERFCOUNTER0_LO
,
391 static struct si_pc_block_base cik_MC
= {
395 .layout
= SI_PC_FAKE
,
398 static struct si_pc_block_base cik_SRBM
= {
402 .layout
= SI_PC_FAKE
,
405 /* Both the number of instances and selectors vary between chips of the same
406 * class. We only differentiate by class here and simply expose the maximum
407 * number over all chips in a class.
409 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
410 * blindly once it believes it has identified the hardware, so the order of
411 * blocks here matters.
413 static struct si_pc_block_gfxdescr groups_CIK
[] = {
424 { &cik_TA
, 111, 11 },
428 { &cik_TCP
, 154, 11 },
440 static struct si_pc_block_gfxdescr groups_VI
[] = {
451 { &cik_TA
, 119, 16 },
455 { &cik_TCP
, 180, 16 },
467 static struct si_pc_block_gfxdescr groups_gfx9
[] = {
478 { &cik_TA
, 119, 16 },
482 { &cik_TCP
, 85, 16 },
491 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters
*pc
,
492 const struct si_pc_block
*block
)
494 return block
->b
->b
->flags
& SI_PC_BLOCK_SE_GROUPS
||
495 (block
->b
->b
->flags
& SI_PC_BLOCK_SE
&& pc
->separate_se
);
498 static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters
*pc
,
499 const struct si_pc_block
*block
)
501 return block
->b
->b
->flags
& SI_PC_BLOCK_INSTANCE_GROUPS
||
502 (block
->num_instances
> 1 && pc
->separate_instance
);
505 static struct si_pc_block
*
506 lookup_counter(struct si_perfcounters
*pc
, unsigned index
,
507 unsigned *base_gid
, unsigned *sub_index
)
509 struct si_pc_block
*block
= pc
->blocks
;
513 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
514 unsigned total
= block
->num_groups
* block
->b
->selectors
;
522 *base_gid
+= block
->num_groups
;
528 static struct si_pc_block
*
529 lookup_group(struct si_perfcounters
*pc
, unsigned *index
)
532 struct si_pc_block
*block
= pc
->blocks
;
534 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
535 if (*index
< block
->num_groups
)
537 *index
-= block
->num_groups
;
543 static void si_pc_emit_instance(struct si_context
*sctx
,
544 int se
, int instance
)
546 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
547 unsigned value
= S_030800_SH_BROADCAST_WRITES(1);
550 value
|= S_030800_SE_INDEX(se
);
552 value
|= S_030800_SE_BROADCAST_WRITES(1);
556 value
|= S_030800_INSTANCE_INDEX(instance
);
558 value
|= S_030800_INSTANCE_BROADCAST_WRITES(1);
561 radeon_set_uconfig_reg(cs
, R_030800_GRBM_GFX_INDEX
, value
);
564 static void si_pc_emit_shaders(struct si_context
*sctx
,
567 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
569 radeon_set_uconfig_reg_seq(cs
, R_036780_SQ_PERFCOUNTER_CTRL
, 2);
570 radeon_emit(cs
, shaders
& 0x7f);
571 radeon_emit(cs
, 0xffffffff);
574 static void si_pc_emit_select(struct si_context
*sctx
,
575 struct si_pc_block
*block
,
576 unsigned count
, unsigned *selectors
)
578 struct si_pc_block_base
*regs
= block
->b
->b
;
579 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
581 unsigned layout_multi
= regs
->layout
& SI_PC_MULTI_MASK
;
584 assert(count
<= regs
->num_counters
);
586 if (regs
->layout
& SI_PC_FAKE
)
589 if (layout_multi
== SI_PC_MULTI_BLOCK
) {
590 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
592 dw
= count
+ regs
->num_prelude
;
593 if (count
>= regs
->num_multi
)
594 dw
+= regs
->num_multi
;
595 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, dw
);
596 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
598 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
599 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
601 if (count
< regs
->num_multi
) {
603 regs
->select0
+ 4 * regs
->num_multi
;
604 radeon_set_uconfig_reg_seq(cs
, select1
, count
);
607 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
610 if (count
> regs
->num_multi
) {
611 for (idx
= regs
->num_multi
; idx
< count
; ++idx
)
612 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
614 } else if (layout_multi
== SI_PC_MULTI_TAIL
) {
615 unsigned select1
, select1_count
;
617 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
619 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, count
+ regs
->num_prelude
);
620 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
622 for (idx
= 0; idx
< count
; ++idx
)
623 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
625 select1
= regs
->select0
+ 4 * regs
->num_counters
;
626 select1_count
= MIN2(count
, regs
->num_multi
);
627 radeon_set_uconfig_reg_seq(cs
, select1
, select1_count
);
628 for (idx
= 0; idx
< select1_count
; ++idx
)
630 } else if (layout_multi
== SI_PC_MULTI_CUSTOM
) {
631 unsigned *reg
= regs
->select
;
632 for (idx
= 0; idx
< count
; ++idx
) {
633 radeon_set_uconfig_reg(cs
, *reg
++, selectors
[idx
] | regs
->select_or
);
634 if (idx
< regs
->num_multi
)
635 radeon_set_uconfig_reg(cs
, *reg
++, 0);
638 assert(layout_multi
== SI_PC_MULTI_ALTERNATE
);
640 unsigned reg_base
= regs
->select0
;
641 unsigned reg_count
= count
+ MIN2(count
, regs
->num_multi
);
642 reg_count
+= regs
->num_prelude
;
644 if (!(regs
->layout
& SI_PC_REG_REVERSE
)) {
645 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
647 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
649 for (idx
= 0; idx
< count
; ++idx
) {
650 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
651 if (idx
< regs
->num_multi
)
655 reg_base
-= (reg_count
- 1) * 4;
656 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
658 for (idx
= count
; idx
> 0; --idx
) {
659 if (idx
<= regs
->num_multi
)
661 radeon_emit(cs
, selectors
[idx
- 1] | regs
->select_or
);
663 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
669 static void si_pc_emit_start(struct si_context
*sctx
,
670 struct r600_resource
*buffer
, uint64_t va
)
672 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
674 radeon_add_to_buffer_list(sctx
, sctx
->gfx_cs
, buffer
,
675 RADEON_USAGE_WRITE
, RADEON_PRIO_QUERY
);
677 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
678 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_IMM
) |
679 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM
));
680 radeon_emit(cs
, 1); /* immediate */
681 radeon_emit(cs
, 0); /* unused */
683 radeon_emit(cs
, va
>> 32);
685 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
686 S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET
));
687 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
688 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_START
) | EVENT_INDEX(0));
689 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
690 S_036020_PERFMON_STATE(V_036020_START_COUNTING
));
693 /* Note: The buffer was already added in si_pc_emit_start, so we don't have to
694 * do it again in here. */
695 static void si_pc_emit_stop(struct si_context
*sctx
,
696 struct r600_resource
*buffer
, uint64_t va
)
698 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
700 si_cp_release_mem(sctx
, V_028A90_BOTTOM_OF_PIPE_TS
, 0,
701 EOP_DST_SEL_MEM
, EOP_INT_SEL_NONE
,
702 EOP_DATA_SEL_VALUE_32BIT
,
703 buffer
, va
, 0, SI_NOT_QUERY
);
704 si_cp_wait_mem(sctx
, va
, 0, 0xffffffff, 0);
706 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
707 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE
) | EVENT_INDEX(0));
708 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
709 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP
) | EVENT_INDEX(0));
710 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
711 S_036020_PERFMON_STATE(V_036020_STOP_COUNTING
) |
712 S_036020_PERFMON_SAMPLE_ENABLE(1));
715 static void si_pc_emit_read(struct si_context
*sctx
,
716 struct si_pc_block
*block
,
717 unsigned count
, uint64_t va
)
719 struct si_pc_block_base
*regs
= block
->b
->b
;
720 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
722 unsigned reg
= regs
->counter0_lo
;
723 unsigned reg_delta
= 8;
725 if (!(regs
->layout
& SI_PC_FAKE
)) {
726 if (regs
->layout
& SI_PC_REG_REVERSE
)
727 reg_delta
= -reg_delta
;
729 for (idx
= 0; idx
< count
; ++idx
) {
731 reg
= regs
->counters
[idx
];
733 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
734 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_PERF
) |
735 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM
) |
736 COPY_DATA_COUNT_SEL
); /* 64 bits */
737 radeon_emit(cs
, reg
>> 2);
738 radeon_emit(cs
, 0); /* unused */
740 radeon_emit(cs
, va
>> 32);
741 va
+= sizeof(uint64_t);
745 for (idx
= 0; idx
< count
; ++idx
) {
746 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
747 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_IMM
) |
748 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM
) |
749 COPY_DATA_COUNT_SEL
);
750 radeon_emit(cs
, 0); /* immediate */
753 radeon_emit(cs
, va
>> 32);
754 va
+= sizeof(uint64_t);
759 static void si_pc_query_destroy(struct si_screen
*sscreen
,
760 struct si_query
*rquery
)
762 struct si_query_pc
*query
= (struct si_query_pc
*)rquery
;
764 while (query
->groups
) {
765 struct si_query_group
*group
= query
->groups
;
766 query
->groups
= group
->next
;
770 FREE(query
->counters
);
772 si_query_buffer_destroy(sscreen
, &query
->buffer
);
776 static void si_pc_query_resume(struct si_context
*sctx
, struct si_query
*rquery
)
778 struct si_query_hw *hwquery,
779 struct r600_resource *buffer, uint64_t va)*/
781 struct si_query_pc
*query
= (struct si_query_pc
*)rquery
;
783 int current_instance
= -1;
785 if (!si_query_buffer_alloc(sctx
, &query
->buffer
, NULL
, query
->result_size
))
787 si_need_gfx_cs_space(sctx
);
790 si_pc_emit_shaders(sctx
, query
->shaders
);
792 for (struct si_query_group
*group
= query
->groups
; group
; group
= group
->next
) {
793 struct si_pc_block
*block
= group
->block
;
795 if (group
->se
!= current_se
|| group
->instance
!= current_instance
) {
796 current_se
= group
->se
;
797 current_instance
= group
->instance
;
798 si_pc_emit_instance(sctx
, group
->se
, group
->instance
);
801 si_pc_emit_select(sctx
, block
, group
->num_counters
, group
->selectors
);
804 if (current_se
!= -1 || current_instance
!= -1)
805 si_pc_emit_instance(sctx
, -1, -1);
807 uint64_t va
= query
->buffer
.buf
->gpu_address
+ query
->buffer
.results_end
;
808 si_pc_emit_start(sctx
, query
->buffer
.buf
, va
);
811 static void si_pc_query_suspend(struct si_context
*sctx
, struct si_query
*rquery
)
813 struct si_query_pc
*query
= (struct si_query_pc
*)rquery
;
815 if (!query
->buffer
.buf
)
818 uint64_t va
= query
->buffer
.buf
->gpu_address
+ query
->buffer
.results_end
;
819 query
->buffer
.results_end
+= query
->result_size
;
821 si_pc_emit_stop(sctx
, query
->buffer
.buf
, va
);
823 for (struct si_query_group
*group
= query
->groups
; group
; group
= group
->next
) {
824 struct si_pc_block
*block
= group
->block
;
825 unsigned se
= group
->se
>= 0 ? group
->se
: 0;
826 unsigned se_end
= se
+ 1;
828 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && (group
->se
< 0))
829 se_end
= sctx
->screen
->info
.max_se
;
832 unsigned instance
= group
->instance
>= 0 ? group
->instance
: 0;
835 si_pc_emit_instance(sctx
, se
, instance
);
836 si_pc_emit_read(sctx
, block
, group
->num_counters
, va
);
837 va
+= sizeof(uint64_t) * group
->num_counters
;
838 } while (group
->instance
< 0 && ++instance
< block
->num_instances
);
839 } while (++se
< se_end
);
842 si_pc_emit_instance(sctx
, -1, -1);
845 static bool si_pc_query_begin(struct si_context
*ctx
, struct si_query
*rquery
)
847 struct si_query_pc
*query
= (struct si_query_pc
*)rquery
;
849 si_query_buffer_reset(ctx
, &query
->buffer
);
851 LIST_ADDTAIL(&query
->b
.active_list
, &ctx
->active_queries
);
852 ctx
->num_cs_dw_queries_suspend
+= query
->b
.num_cs_dw_suspend
;
854 si_pc_query_resume(ctx
, rquery
);
859 static bool si_pc_query_end(struct si_context
*ctx
, struct si_query
*rquery
)
861 struct si_query_pc
*query
= (struct si_query_pc
*)rquery
;
863 si_pc_query_suspend(ctx
, rquery
);
865 LIST_DEL(&rquery
->active_list
);
866 ctx
->num_cs_dw_queries_suspend
-= rquery
->num_cs_dw_suspend
;
868 return query
->buffer
.buf
!= NULL
;
871 static void si_pc_query_add_result(struct si_query_pc
*query
,
873 union pipe_query_result
*result
)
875 uint64_t *results
= buffer
;
878 for (i
= 0; i
< query
->num_counters
; ++i
) {
879 struct si_query_counter
*counter
= &query
->counters
[i
];
881 for (j
= 0; j
< counter
->qwords
; ++j
) {
882 uint32_t value
= results
[counter
->base
+ j
* counter
->stride
];
883 result
->batch
[i
].u64
+= value
;
888 static bool si_pc_query_get_result(struct si_context
*sctx
, struct si_query
*rquery
,
889 bool wait
, union pipe_query_result
*result
)
891 struct si_query_pc
*query
= (struct si_query_pc
*)rquery
;
893 memset(result
, 0, sizeof(result
->batch
[0]) * query
->num_counters
);
895 for (struct si_query_buffer
*qbuf
= &query
->buffer
; qbuf
; qbuf
= qbuf
->previous
) {
896 unsigned usage
= PIPE_TRANSFER_READ
|
897 (wait
? 0 : PIPE_TRANSFER_DONTBLOCK
);
898 unsigned results_base
= 0;
901 if (rquery
->b
.flushed
)
902 map
= sctx
->ws
->buffer_map(qbuf
->buf
->buf
, NULL
, usage
);
904 map
= si_buffer_map_sync_with_rings(sctx
, qbuf
->buf
, usage
);
909 while (results_base
!= qbuf
->results_end
) {
910 si_pc_query_add_result(query
, map
+ results_base
, result
);
911 results_base
+= query
->result_size
;
918 static const struct si_query_ops batch_query_ops
= {
919 .destroy
= si_pc_query_destroy
,
920 .begin
= si_pc_query_begin
,
921 .end
= si_pc_query_end
,
922 .get_result
= si_pc_query_get_result
,
924 .suspend
= si_pc_query_suspend
,
925 .resume
= si_pc_query_resume
,
928 static struct si_query_group
*get_group_state(struct si_screen
*screen
,
929 struct si_query_pc
*query
,
930 struct si_pc_block
*block
,
933 struct si_query_group
*group
= query
->groups
;
936 if (group
->block
== block
&& group
->sub_gid
== sub_gid
)
941 group
= CALLOC_STRUCT(si_query_group
);
945 group
->block
= block
;
946 group
->sub_gid
= sub_gid
;
948 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
949 unsigned sub_gids
= block
->num_instances
;
952 unsigned query_shaders
;
954 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
))
955 sub_gids
= sub_gids
* screen
->info
.max_se
;
956 shader_id
= sub_gid
/ sub_gids
;
957 sub_gid
= sub_gid
% sub_gids
;
959 shaders
= si_pc_shader_type_bits
[shader_id
];
961 query_shaders
= query
->shaders
& ~SI_PC_SHADERS_WINDOWING
;
962 if (query_shaders
&& query_shaders
!= shaders
) {
963 fprintf(stderr
, "si_perfcounter: incompatible shader groups\n");
967 query
->shaders
= shaders
;
970 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER_WINDOWED
&& !query
->shaders
) {
971 // A non-zero value in query->shaders ensures that the shader
972 // masking is reset unless the user explicitly requests one.
973 query
->shaders
= SI_PC_SHADERS_WINDOWING
;
976 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
)) {
977 group
->se
= sub_gid
/ block
->num_instances
;
978 sub_gid
= sub_gid
% block
->num_instances
;
983 if (si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
)) {
984 group
->instance
= sub_gid
;
986 group
->instance
= -1;
989 group
->next
= query
->groups
;
990 query
->groups
= group
;
995 struct pipe_query
*si_create_batch_query(struct pipe_context
*ctx
,
996 unsigned num_queries
,
997 unsigned *query_types
)
999 struct si_screen
*screen
=
1000 (struct si_screen
*)ctx
->screen
;
1001 struct si_perfcounters
*pc
= screen
->perfcounters
;
1002 struct si_pc_block
*block
;
1003 struct si_query_group
*group
;
1004 struct si_query_pc
*query
;
1005 unsigned base_gid
, sub_gid
, sub_index
;
1011 query
= CALLOC_STRUCT(si_query_pc
);
1015 query
->b
.ops
= &batch_query_ops
;
1017 query
->num_counters
= num_queries
;
1019 /* Collect selectors per group */
1020 for (i
= 0; i
< num_queries
; ++i
) {
1023 if (query_types
[i
] < SI_QUERY_FIRST_PERFCOUNTER
)
1026 block
= lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
,
1027 &base_gid
, &sub_index
);
1031 sub_gid
= sub_index
/ block
->b
->selectors
;
1032 sub_index
= sub_index
% block
->b
->selectors
;
1034 group
= get_group_state(screen
, query
, block
, sub_gid
);
1038 if (group
->num_counters
>= block
->b
->b
->num_counters
) {
1040 "perfcounter group %s: too many selected\n",
1044 group
->selectors
[group
->num_counters
] = sub_index
;
1045 ++group
->num_counters
;
1048 /* Compute result bases and CS size per group */
1049 query
->b
.num_cs_dw_suspend
= pc
->num_stop_cs_dwords
;
1050 query
->b
.num_cs_dw_suspend
+= pc
->num_instance_cs_dwords
;
1053 for (group
= query
->groups
; group
; group
= group
->next
) {
1054 struct si_pc_block
*block
= group
->block
;
1056 unsigned instances
= 1;
1058 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
1059 instances
= screen
->info
.max_se
;
1060 if (group
->instance
< 0)
1061 instances
*= block
->num_instances
;
1063 group
->result_base
= i
;
1064 query
->result_size
+= sizeof(uint64_t) * instances
* group
->num_counters
;
1065 i
+= instances
* group
->num_counters
;
1067 read_dw
= 6 * group
->num_counters
;
1068 query
->b
.num_cs_dw_suspend
+= instances
* read_dw
;
1069 query
->b
.num_cs_dw_suspend
+= instances
* pc
->num_instance_cs_dwords
;
1072 if (query
->shaders
) {
1073 if (query
->shaders
== SI_PC_SHADERS_WINDOWING
)
1074 query
->shaders
= 0xffffffff;
1077 /* Map user-supplied query array to result indices */
1078 query
->counters
= CALLOC(num_queries
, sizeof(*query
->counters
));
1079 for (i
= 0; i
< num_queries
; ++i
) {
1080 struct si_query_counter
*counter
= &query
->counters
[i
];
1081 struct si_pc_block
*block
;
1083 block
= lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
,
1084 &base_gid
, &sub_index
);
1086 sub_gid
= sub_index
/ block
->b
->selectors
;
1087 sub_index
= sub_index
% block
->b
->selectors
;
1089 group
= get_group_state(screen
, query
, block
, sub_gid
);
1090 assert(group
!= NULL
);
1092 for (j
= 0; j
< group
->num_counters
; ++j
) {
1093 if (group
->selectors
[j
] == sub_index
)
1097 counter
->base
= group
->result_base
+ j
;
1098 counter
->stride
= group
->num_counters
;
1100 counter
->qwords
= 1;
1101 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
1102 counter
->qwords
= screen
->info
.max_se
;
1103 if (group
->instance
< 0)
1104 counter
->qwords
*= block
->num_instances
;
1107 return (struct pipe_query
*)query
;
1110 si_pc_query_destroy(screen
, &query
->b
);
1114 static bool si_init_block_names(struct si_screen
*screen
,
1115 struct si_pc_block
*block
)
1117 bool per_instance_groups
= si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
);
1118 bool per_se_groups
= si_pc_block_has_per_se_groups(screen
->perfcounters
, block
);
1120 unsigned groups_shader
= 1, groups_se
= 1, groups_instance
= 1;
1125 if (per_instance_groups
)
1126 groups_instance
= block
->num_instances
;
1128 groups_se
= screen
->info
.max_se
;
1129 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1130 groups_shader
= ARRAY_SIZE(si_pc_shader_type_bits
);
1132 namelen
= strlen(block
->b
->b
->name
);
1133 block
->group_name_stride
= namelen
+ 1;
1134 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1135 block
->group_name_stride
+= 3;
1136 if (per_se_groups
) {
1137 assert(groups_se
<= 10);
1138 block
->group_name_stride
+= 1;
1140 if (per_instance_groups
)
1141 block
->group_name_stride
+= 1;
1143 if (per_instance_groups
) {
1144 assert(groups_instance
<= 100);
1145 block
->group_name_stride
+= 2;
1148 block
->group_names
= MALLOC(block
->num_groups
* block
->group_name_stride
);
1149 if (!block
->group_names
)
1152 groupname
= block
->group_names
;
1153 for (i
= 0; i
< groups_shader
; ++i
) {
1154 const char *shader_suffix
= si_pc_shader_type_suffixes
[i
];
1155 unsigned shaderlen
= strlen(shader_suffix
);
1156 for (j
= 0; j
< groups_se
; ++j
) {
1157 for (k
= 0; k
< groups_instance
; ++k
) {
1158 strcpy(groupname
, block
->b
->b
->name
);
1159 p
= groupname
+ namelen
;
1161 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
1162 strcpy(p
, shader_suffix
);
1166 if (per_se_groups
) {
1167 p
+= sprintf(p
, "%d", j
);
1168 if (per_instance_groups
)
1172 if (per_instance_groups
)
1173 p
+= sprintf(p
, "%d", k
);
1175 groupname
+= block
->group_name_stride
;
1180 assert(block
->b
->selectors
<= 1000);
1181 block
->selector_name_stride
= block
->group_name_stride
+ 4;
1182 block
->selector_names
= MALLOC(block
->num_groups
* block
->b
->selectors
*
1183 block
->selector_name_stride
);
1184 if (!block
->selector_names
)
1187 groupname
= block
->group_names
;
1188 p
= block
->selector_names
;
1189 for (i
= 0; i
< block
->num_groups
; ++i
) {
1190 for (j
= 0; j
< block
->b
->selectors
; ++j
) {
1191 sprintf(p
, "%s_%03d", groupname
, j
);
1192 p
+= block
->selector_name_stride
;
1194 groupname
+= block
->group_name_stride
;
1200 int si_get_perfcounter_info(struct si_screen
*screen
,
1202 struct pipe_driver_query_info
*info
)
1204 struct si_perfcounters
*pc
= screen
->perfcounters
;
1205 struct si_pc_block
*block
;
1206 unsigned base_gid
, sub
;
1212 unsigned bid
, num_queries
= 0;
1214 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
) {
1215 num_queries
+= pc
->blocks
[bid
].b
->selectors
*
1216 pc
->blocks
[bid
].num_groups
;
1222 block
= lookup_counter(pc
, index
, &base_gid
, &sub
);
1226 if (!block
->selector_names
) {
1227 if (!si_init_block_names(screen
, block
))
1230 info
->name
= block
->selector_names
+ sub
* block
->selector_name_stride
;
1231 info
->query_type
= SI_QUERY_FIRST_PERFCOUNTER
+ index
;
1232 info
->max_value
.u64
= 0;
1233 info
->type
= PIPE_DRIVER_QUERY_TYPE_UINT64
;
1234 info
->result_type
= PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE
;
1235 info
->group_id
= base_gid
+ sub
/ block
->b
->selectors
;
1236 info
->flags
= PIPE_DRIVER_QUERY_FLAG_BATCH
;
1237 if (sub
> 0 && sub
+ 1 < block
->b
->selectors
* block
->num_groups
)
1238 info
->flags
|= PIPE_DRIVER_QUERY_FLAG_DONT_LIST
;
1242 int si_get_perfcounter_group_info(struct si_screen
*screen
,
1244 struct pipe_driver_query_group_info
*info
)
1246 struct si_perfcounters
*pc
= screen
->perfcounters
;
1247 struct si_pc_block
*block
;
1253 return pc
->num_groups
;
1255 block
= lookup_group(pc
, &index
);
1259 if (!block
->group_names
) {
1260 if (!si_init_block_names(screen
, block
))
1263 info
->name
= block
->group_names
+ index
* block
->group_name_stride
;
1264 info
->num_queries
= block
->b
->selectors
;
1265 info
->max_active_queries
= block
->b
->b
->num_counters
;
1269 void si_destroy_perfcounters(struct si_screen
*screen
)
1271 struct si_perfcounters
*pc
= screen
->perfcounters
;
1277 for (i
= 0; i
< pc
->num_blocks
; ++i
) {
1278 FREE(pc
->blocks
[i
].group_names
);
1279 FREE(pc
->blocks
[i
].selector_names
);
1283 screen
->perfcounters
= NULL
;
1286 void si_init_perfcounters(struct si_screen
*screen
)
1288 struct si_perfcounters
*pc
;
1289 const struct si_pc_block_gfxdescr
*blocks
;
1290 unsigned num_blocks
;
1293 switch (screen
->info
.chip_class
) {
1295 blocks
= groups_CIK
;
1296 num_blocks
= ARRAY_SIZE(groups_CIK
);
1300 num_blocks
= ARRAY_SIZE(groups_VI
);
1303 blocks
= groups_gfx9
;
1304 num_blocks
= ARRAY_SIZE(groups_gfx9
);
1308 return; /* not implemented */
1311 if (screen
->info
.max_sh_per_se
!= 1) {
1312 /* This should not happen on non-SI chips. */
1313 fprintf(stderr
, "si_init_perfcounters: max_sh_per_se = %d not "
1314 "supported (inaccurate performance counters)\n",
1315 screen
->info
.max_sh_per_se
);
1318 screen
->perfcounters
= pc
= CALLOC_STRUCT(si_perfcounters
);
1322 pc
->num_stop_cs_dwords
= 14 + si_cp_write_fence_dwords(screen
);
1323 pc
->num_instance_cs_dwords
= 3;
1325 pc
->separate_se
= debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
1326 pc
->separate_instance
= debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
1328 pc
->blocks
= CALLOC(num_blocks
, sizeof(struct si_pc_block
));
1331 pc
->num_blocks
= num_blocks
;
1333 for (i
= 0; i
< num_blocks
; ++i
) {
1334 struct si_pc_block
*block
= &pc
->blocks
[i
];
1335 block
->b
= &blocks
[i
];
1336 block
->num_instances
= block
->b
->instances
;
1338 if (!strcmp(block
->b
->b
->name
, "CB") ||
1339 !strcmp(block
->b
->b
->name
, "DB"))
1340 block
->num_instances
= screen
->info
.max_se
;
1341 else if (!strcmp(block
->b
->b
->name
, "TCC"))
1342 block
->num_instances
= screen
->info
.num_tcc_blocks
;
1343 else if (!strcmp(block
->b
->b
->name
, "IA"))
1344 block
->num_instances
= MAX2(1, screen
->info
.max_se
/ 2);
1346 if (si_pc_block_has_per_instance_groups(pc
, block
)) {
1347 block
->num_groups
= block
->num_instances
;
1349 block
->num_groups
= 1;
1352 if (si_pc_block_has_per_se_groups(pc
, block
))
1353 block
->num_groups
*= screen
->info
.max_se
;
1354 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1355 block
->num_groups
*= ARRAY_SIZE(si_pc_shader_type_bits
);
1357 pc
->num_groups
+= block
->num_groups
;
1363 si_destroy_perfcounters(screen
);