2 * Copyright 2015 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 #include "si_build_pm4.h"
27 #include "util/u_memory.h"
29 enum si_pc_block_flags
31 /* This block is part of the shader engine */
32 SI_PC_BLOCK_SE
= (1 << 0),
34 /* Expose per-instance groups instead of summing all instances (within
36 SI_PC_BLOCK_INSTANCE_GROUPS
= (1 << 1),
38 /* Expose per-SE groups instead of summing instances across SEs. */
39 SI_PC_BLOCK_SE_GROUPS
= (1 << 2),
42 SI_PC_BLOCK_SHADER
= (1 << 3),
44 /* Non-shader block with perfcounters windowed by shaders. */
45 SI_PC_BLOCK_SHADER_WINDOWED
= (1 << 4),
50 /* All secondary selector dwords follow as one block after the primary
51 * selector dwords for the counters that have secondary selectors.
56 * PERFCOUNTER0_SELECT1
57 * PERFCOUNTER1_SELECT1
61 SI_PC_MULTI_BLOCK
= 0,
63 /* Each secondary selector dword follows immediately after the
64 * corresponding primary.
68 * PERFCOUNTER0_SELECT1
70 * PERFCOUNTER1_SELECT1
74 SI_PC_MULTI_ALTERNATE
= 1,
76 /* All secondary selector dwords follow as one block after all primary
84 * PERFCOUNTER0_SELECT1
85 * PERFCOUNTER1_SELECT1
89 /* Free-form arrangement of selector registers. */
90 SI_PC_MULTI_CUSTOM
= 3,
94 /* Registers are laid out in decreasing rather than increasing order. */
95 SI_PC_REG_REVERSE
= 4,
100 struct si_pc_block_base
{
102 unsigned num_counters
;
107 unsigned counter0_lo
;
111 unsigned num_prelude
;
115 struct si_pc_block_gfxdescr
{
116 struct si_pc_block_base
*b
;
122 const struct si_pc_block_gfxdescr
*b
;
123 unsigned num_instances
;
127 unsigned group_name_stride
;
129 char *selector_names
;
130 unsigned selector_name_stride
;
133 /* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
134 * performance counter group IDs.
136 static const char *const si_pc_shader_type_suffixes
[] = {"", "_ES", "_GS", "_VS",
137 "_PS", "_LS", "_HS", "_CS"};
139 static const unsigned si_pc_shader_type_bits
[] = {
150 /* Max counters per HW block */
151 #define SI_QUERY_MAX_COUNTERS 16
153 #define SI_PC_SHADERS_WINDOWING (1u << 31)
155 struct si_query_group
{
156 struct si_query_group
*next
;
157 struct si_pc_block
*block
;
158 unsigned sub_gid
; /* only used during init */
159 unsigned result_base
; /* only used during init */
162 unsigned num_counters
;
163 unsigned selectors
[SI_QUERY_MAX_COUNTERS
];
166 struct si_query_counter
{
169 unsigned stride
; /* in uint64s */
174 struct si_query_buffer buffer
;
176 /* Size of the results in memory, in bytes. */
177 unsigned result_size
;
180 unsigned num_counters
;
181 struct si_query_counter
*counters
;
182 struct si_query_group
*groups
;
185 static struct si_pc_block_base cik_CB
= {
188 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
190 .select0
= R_037000_CB_PERFCOUNTER_FILTER
,
191 .counter0_lo
= R_035018_CB_PERFCOUNTER0_LO
,
194 .layout
= SI_PC_MULTI_ALTERNATE
,
197 static unsigned cik_CPC_select
[] = {
198 R_036024_CPC_PERFCOUNTER0_SELECT
,
199 R_036010_CPC_PERFCOUNTER0_SELECT1
,
200 R_03600C_CPC_PERFCOUNTER1_SELECT
,
202 static struct si_pc_block_base cik_CPC
= {
206 .select
= cik_CPC_select
,
207 .counter0_lo
= R_034018_CPC_PERFCOUNTER0_LO
,
209 .layout
= SI_PC_MULTI_CUSTOM
| SI_PC_REG_REVERSE
,
212 static struct si_pc_block_base cik_CPF
= {
216 .select0
= R_03601C_CPF_PERFCOUNTER0_SELECT
,
217 .counter0_lo
= R_034028_CPF_PERFCOUNTER0_LO
,
219 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
222 static struct si_pc_block_base cik_CPG
= {
226 .select0
= R_036008_CPG_PERFCOUNTER0_SELECT
,
227 .counter0_lo
= R_034008_CPG_PERFCOUNTER0_LO
,
229 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
232 static struct si_pc_block_base cik_DB
= {
235 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
237 .select0
= R_037100_DB_PERFCOUNTER0_SELECT
,
238 .counter0_lo
= R_035100_DB_PERFCOUNTER0_LO
,
239 .num_multi
= 3, // really only 2, but there's a gap between registers
240 .layout
= SI_PC_MULTI_ALTERNATE
,
243 static struct si_pc_block_base cik_GDS
= {
247 .select0
= R_036A00_GDS_PERFCOUNTER0_SELECT
,
248 .counter0_lo
= R_034A00_GDS_PERFCOUNTER0_LO
,
250 .layout
= SI_PC_MULTI_TAIL
,
253 static unsigned cik_GRBM_counters
[] = {
254 R_034100_GRBM_PERFCOUNTER0_LO
,
255 R_03410C_GRBM_PERFCOUNTER1_LO
,
257 static struct si_pc_block_base cik_GRBM
= {
261 .select0
= R_036100_GRBM_PERFCOUNTER0_SELECT
,
262 .counters
= cik_GRBM_counters
,
265 static struct si_pc_block_base cik_GRBMSE
= {
269 .select0
= R_036108_GRBM_SE0_PERFCOUNTER_SELECT
,
270 .counter0_lo
= R_034114_GRBM_SE0_PERFCOUNTER_LO
,
273 static struct si_pc_block_base cik_IA
= {
277 .select0
= R_036210_IA_PERFCOUNTER0_SELECT
,
278 .counter0_lo
= R_034220_IA_PERFCOUNTER0_LO
,
280 .layout
= SI_PC_MULTI_TAIL
,
283 static struct si_pc_block_base cik_PA_SC
= {
286 .flags
= SI_PC_BLOCK_SE
,
288 .select0
= R_036500_PA_SC_PERFCOUNTER0_SELECT
,
289 .counter0_lo
= R_034500_PA_SC_PERFCOUNTER0_LO
,
291 .layout
= SI_PC_MULTI_ALTERNATE
,
294 /* According to docs, PA_SU counters are only 48 bits wide. */
295 static struct si_pc_block_base cik_PA_SU
= {
298 .flags
= SI_PC_BLOCK_SE
,
300 .select0
= R_036400_PA_SU_PERFCOUNTER0_SELECT
,
301 .counter0_lo
= R_034400_PA_SU_PERFCOUNTER0_LO
,
303 .layout
= SI_PC_MULTI_ALTERNATE
,
306 static struct si_pc_block_base cik_SPI
= {
309 .flags
= SI_PC_BLOCK_SE
,
311 .select0
= R_036600_SPI_PERFCOUNTER0_SELECT
,
312 .counter0_lo
= R_034604_SPI_PERFCOUNTER0_LO
,
314 .layout
= SI_PC_MULTI_BLOCK
,
317 static struct si_pc_block_base cik_SQ
= {
320 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER
,
322 .select0
= R_036700_SQ_PERFCOUNTER0_SELECT
,
323 .select_or
= S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15),
324 .counter0_lo
= R_034700_SQ_PERFCOUNTER0_LO
,
327 static struct si_pc_block_base cik_SX
= {
330 .flags
= SI_PC_BLOCK_SE
,
332 .select0
= R_036900_SX_PERFCOUNTER0_SELECT
,
333 .counter0_lo
= R_034900_SX_PERFCOUNTER0_LO
,
335 .layout
= SI_PC_MULTI_TAIL
,
338 static struct si_pc_block_base cik_TA
= {
341 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
343 .select0
= R_036B00_TA_PERFCOUNTER0_SELECT
,
344 .counter0_lo
= R_034B00_TA_PERFCOUNTER0_LO
,
346 .layout
= SI_PC_MULTI_ALTERNATE
,
349 static struct si_pc_block_base cik_TD
= {
352 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
354 .select0
= R_036C00_TD_PERFCOUNTER0_SELECT
,
355 .counter0_lo
= R_034C00_TD_PERFCOUNTER0_LO
,
357 .layout
= SI_PC_MULTI_ALTERNATE
,
360 static struct si_pc_block_base cik_TCA
= {
363 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
365 .select0
= R_036E40_TCA_PERFCOUNTER0_SELECT
,
366 .counter0_lo
= R_034E40_TCA_PERFCOUNTER0_LO
,
368 .layout
= SI_PC_MULTI_ALTERNATE
,
371 static struct si_pc_block_base cik_TCC
= {
374 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
376 .select0
= R_036E00_TCC_PERFCOUNTER0_SELECT
,
377 .counter0_lo
= R_034E00_TCC_PERFCOUNTER0_LO
,
379 .layout
= SI_PC_MULTI_ALTERNATE
,
382 static struct si_pc_block_base cik_TCP
= {
385 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
387 .select0
= R_036D00_TCP_PERFCOUNTER0_SELECT
,
388 .counter0_lo
= R_034D00_TCP_PERFCOUNTER0_LO
,
390 .layout
= SI_PC_MULTI_ALTERNATE
,
393 static struct si_pc_block_base cik_VGT
= {
396 .flags
= SI_PC_BLOCK_SE
,
398 .select0
= R_036230_VGT_PERFCOUNTER0_SELECT
,
399 .counter0_lo
= R_034240_VGT_PERFCOUNTER0_LO
,
401 .layout
= SI_PC_MULTI_TAIL
,
404 static struct si_pc_block_base cik_WD
= {
408 .select0
= R_036200_WD_PERFCOUNTER0_SELECT
,
409 .counter0_lo
= R_034200_WD_PERFCOUNTER0_LO
,
412 static struct si_pc_block_base cik_MC
= {
416 .layout
= SI_PC_FAKE
,
419 static struct si_pc_block_base cik_SRBM
= {
423 .layout
= SI_PC_FAKE
,
426 static struct si_pc_block_base gfx10_CHA
= {
430 .select0
= R_037780_CHA_PERFCOUNTER0_SELECT
,
431 .counter0_lo
= R_035800_CHA_PERFCOUNTER0_LO
,
433 .layout
= SI_PC_MULTI_ALTERNATE
,
436 static struct si_pc_block_base gfx10_CHCG
= {
440 .select0
= R_036F18_CHCG_PERFCOUNTER0_SELECT
,
441 .counter0_lo
= R_034F20_CHCG_PERFCOUNTER0_LO
,
443 .layout
= SI_PC_MULTI_ALTERNATE
,
446 static struct si_pc_block_base gfx10_CHC
= {
450 .select0
= R_036F00_CHC_PERFCOUNTER0_SELECT
,
451 .counter0_lo
= R_034F00_CHC_PERFCOUNTER0_LO
,
453 .layout
= SI_PC_MULTI_ALTERNATE
,
456 static struct si_pc_block_base gfx10_GCR
= {
460 .select0
= R_037580_GCR_PERFCOUNTER0_SELECT
,
461 .counter0_lo
= R_035480_GCR_PERFCOUNTER0_LO
,
463 .layout
= SI_PC_MULTI_ALTERNATE
,
466 static struct si_pc_block_base gfx10_GE
= {
470 .select0
= R_036200_GE_PERFCOUNTER0_SELECT
,
471 .counter0_lo
= R_034200_GE_PERFCOUNTER0_LO
,
473 .layout
= SI_PC_MULTI_ALTERNATE
,
476 static struct si_pc_block_base gfx10_GL1A
= {
479 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER_WINDOWED
,
481 .select0
= R_037700_GL1A_PERFCOUNTER0_SELECT
,
482 .counter0_lo
= R_035700_GL1A_PERFCOUNTER0_LO
,
484 .layout
= SI_PC_MULTI_ALTERNATE
,
487 static struct si_pc_block_base gfx10_GL1C
= {
490 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER_WINDOWED
,
492 .select0
= R_036E80_GL1C_PERFCOUNTER0_SELECT
,
493 .counter0_lo
= R_034E80_GL1C_PERFCOUNTER0_LO
,
495 .layout
= SI_PC_MULTI_ALTERNATE
,
498 static struct si_pc_block_base gfx10_GL2A
= {
502 .select0
= R_036E40_GL2A_PERFCOUNTER0_SELECT
,
503 .counter0_lo
= R_034E40_GL2A_PERFCOUNTER0_LO
,
505 .layout
= SI_PC_MULTI_ALTERNATE
,
508 static struct si_pc_block_base gfx10_GL2C
= {
512 .select0
= R_036E00_GL2C_PERFCOUNTER0_SELECT
,
513 .counter0_lo
= R_034E00_GL2C_PERFCOUNTER0_LO
,
515 .layout
= SI_PC_MULTI_ALTERNATE
,
518 static unsigned gfx10_PA_PH_select
[] = {
519 R_037600_PA_PH_PERFCOUNTER0_SELECT
,
520 R_037604_PA_PH_PERFCOUNTER0_SELECT1
,
521 R_037608_PA_PH_PERFCOUNTER1_SELECT
,
522 R_037640_PA_PH_PERFCOUNTER1_SELECT1
,
523 R_03760C_PA_PH_PERFCOUNTER2_SELECT
,
524 R_037644_PA_PH_PERFCOUNTER2_SELECT1
,
525 R_037610_PA_PH_PERFCOUNTER3_SELECT
,
526 R_037648_PA_PH_PERFCOUNTER3_SELECT1
,
527 R_037614_PA_PH_PERFCOUNTER4_SELECT
,
528 R_037618_PA_PH_PERFCOUNTER5_SELECT
,
529 R_03761C_PA_PH_PERFCOUNTER6_SELECT
,
530 R_037620_PA_PH_PERFCOUNTER7_SELECT
,
532 static struct si_pc_block_base gfx10_PA_PH
= {
535 .flags
= SI_PC_BLOCK_SE
,
537 .select
= gfx10_PA_PH_select
,
538 .counter0_lo
= R_035600_PA_PH_PERFCOUNTER0_LO
,
540 .layout
= SI_PC_MULTI_CUSTOM
,
543 static struct si_pc_block_base gfx10_PA_SU
= {
546 .flags
= SI_PC_BLOCK_SE
,
548 .select0
= R_036400_PA_SU_PERFCOUNTER0_SELECT
,
549 .counter0_lo
= R_034400_PA_SU_PERFCOUNTER0_LO
,
551 .layout
= SI_PC_MULTI_ALTERNATE
,
554 static struct si_pc_block_base gfx10_RLC
= {
558 .select0
= R_037304_RLC_PERFCOUNTER0_SELECT
,
559 .counter0_lo
= R_035200_RLC_PERFCOUNTER0_LO
,
561 .layout
= SI_PC_MULTI_ALTERNATE
,
564 static struct si_pc_block_base gfx10_RMI
= {
566 /* Actually 4, but the 2nd counter is missing the secondary selector while
567 * the 3rd counter has it, which complicates the register layout. */
569 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
571 .select0
= R_037400_RMI_PERFCOUNTER0_SELECT
,
572 .counter0_lo
= R_035300_RMI_PERFCOUNTER0_LO
,
574 .layout
= SI_PC_MULTI_ALTERNATE
,
577 static struct si_pc_block_base gfx10_UTCL1
= {
580 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER_WINDOWED
,
582 .select0
= R_03758C_UTCL1_PERFCOUNTER0_SELECT
,
583 .counter0_lo
= R_035470_UTCL1_PERFCOUNTER0_LO
,
585 .layout
= SI_PC_MULTI_ALTERNATE
,
588 /* Both the number of instances and selectors varies between chips of the same
589 * class. We only differentiate by class here and simply expose the maximum
590 * number over all chips in a class.
592 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
593 * blindly once it believes it has identified the hardware, so the order of
594 * blocks here matters.
596 static struct si_pc_block_gfxdescr groups_CIK
[] = {
597 {&cik_CB
, 226}, {&cik_CPF
, 17}, {&cik_DB
, 257}, {&cik_GRBM
, 34}, {&cik_GRBMSE
, 15},
598 {&cik_PA_SU
, 153}, {&cik_PA_SC
, 395}, {&cik_SPI
, 186}, {&cik_SQ
, 252}, {&cik_SX
, 32},
599 {&cik_TA
, 111}, {&cik_TCA
, 39, 2}, {&cik_TCC
, 160}, {&cik_TD
, 55}, {&cik_TCP
, 154},
600 {&cik_GDS
, 121}, {&cik_VGT
, 140}, {&cik_IA
, 22}, {&cik_MC
, 22}, {&cik_SRBM
, 19},
601 {&cik_WD
, 22}, {&cik_CPG
, 46}, {&cik_CPC
, 22},
605 static struct si_pc_block_gfxdescr groups_VI
[] = {
606 {&cik_CB
, 405}, {&cik_CPF
, 19}, {&cik_DB
, 257}, {&cik_GRBM
, 34}, {&cik_GRBMSE
, 15},
607 {&cik_PA_SU
, 154}, {&cik_PA_SC
, 397}, {&cik_SPI
, 197}, {&cik_SQ
, 273}, {&cik_SX
, 34},
608 {&cik_TA
, 119}, {&cik_TCA
, 35, 2}, {&cik_TCC
, 192}, {&cik_TD
, 55}, {&cik_TCP
, 180},
609 {&cik_GDS
, 121}, {&cik_VGT
, 147}, {&cik_IA
, 24}, {&cik_MC
, 22}, {&cik_SRBM
, 27},
610 {&cik_WD
, 37}, {&cik_CPG
, 48}, {&cik_CPC
, 24},
614 static struct si_pc_block_gfxdescr groups_gfx9
[] = {
615 {&cik_CB
, 438}, {&cik_CPF
, 32}, {&cik_DB
, 328}, {&cik_GRBM
, 38}, {&cik_GRBMSE
, 16},
616 {&cik_PA_SU
, 292}, {&cik_PA_SC
, 491}, {&cik_SPI
, 196}, {&cik_SQ
, 374}, {&cik_SX
, 208},
617 {&cik_TA
, 119}, {&cik_TCA
, 35, 2}, {&cik_TCC
, 256}, {&cik_TD
, 57}, {&cik_TCP
, 85},
618 {&cik_GDS
, 121}, {&cik_VGT
, 148}, {&cik_IA
, 32}, {&cik_WD
, 58}, {&cik_CPG
, 59},
622 static struct si_pc_block_gfxdescr groups_gfx10
[] = {
/* Returns true when this hardware block should expose one perf-counter group
 * per shader engine: either the block explicitly requests per-SE groups
 * (SI_PC_BLOCK_SE_GROUPS), or it is a per-SE block (SI_PC_BLOCK_SE) and the
 * user asked for separate SEs via pc->separate_se.
 * NOTE(review): the original brace lines were dropped by extraction; the
 * statement text below is preserved verbatim. */
654 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters
*pc
,
655 const struct si_pc_block
*block
)
657 return block
->b
->b
->flags
& SI_PC_BLOCK_SE_GROUPS
||
658 (block
->b
->b
->flags
& SI_PC_BLOCK_SE
&& pc
->separate_se
);
/* Returns true when this block should expose one perf-counter group per
 * hardware instance: either the block requests it (SI_PC_BLOCK_INSTANCE_GROUPS),
 * or it has multiple instances and the user asked for separate instances via
 * pc->separate_instance.
 * NOTE(review): brace lines dropped by extraction; statements preserved verbatim. */
661 static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters
*pc
,
662 const struct si_pc_block
*block
)
664 return block
->b
->b
->flags
& SI_PC_BLOCK_INSTANCE_GROUPS
||
665 (block
->num_instances
> 1 && pc
->separate_instance
);
/* Maps a flat counter index onto the block that owns it.
 * Walks pc->blocks; each block spans (num_groups * selectors) consecutive
 * indices. For blocks that are skipped, *base_gid is advanced by that block's
 * group count so the caller ends up with the first group id of the matching
 * block.
 * NOTE(review): the in-range check / *sub_index computation and the return
 * statements fall in lines dropped by extraction (original numbering jumps
 * 676 -> 684); only the loop skeleton is visible here. */
668 static struct si_pc_block
*lookup_counter(struct si_perfcounters
*pc
, unsigned index
,
669 unsigned *base_gid
, unsigned *sub_index
)
671 struct si_pc_block
*block
= pc
->blocks
;
675 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
676 unsigned total
= block
->num_groups
* block
->b
->selectors
;
684 *base_gid
+= block
->num_groups
;
/* Maps a flat group index onto the block that owns it. Walks pc->blocks,
 * subtracting each block's num_groups from *index until *index falls inside a
 * block; *index is left as the group offset within that block.
 * NOTE(review): the return statements (hit and not-found) were dropped by
 * extraction (original numbering jumps 696 -> 698 and past 698). */
690 static struct si_pc_block
*lookup_group(struct si_perfcounters
*pc
, unsigned *index
)
693 struct si_pc_block
*block
= pc
->blocks
;
695 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
696 if (*index
< block
->num_groups
)
698 *index
-= block
->num_groups
;
/* Programs GRBM_GFX_INDEX to route subsequent register writes/reads to one
 * shader engine / block instance, or to broadcast to all of them.
 * A negative `se` or `instance` selects the corresponding *_BROADCAST_WRITES
 * mode; a non-negative value selects that specific index. On gfx10+ the
 * shader-array select is always broadcast (see inline TODO).
 * NOTE(review): the if/else lines pairing the index vs. broadcast branches
 * were dropped by extraction (numbering jumps 707 -> 710 -> 712 etc.);
 * the surviving statements are preserved verbatim. */
704 static void si_pc_emit_instance(struct si_context
*sctx
, int se
, int instance
)
706 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
707 unsigned value
= S_030800_SH_BROADCAST_WRITES(1);
710 value
|= S_030800_SE_INDEX(se
);
712 value
|= S_030800_SE_BROADCAST_WRITES(1);
715 if (sctx
->chip_class
>= GFX10
) {
716 /* TODO: Expose counters from each shader array separately if needed. */
717 value
|= S_030800_SA_BROADCAST_WRITES(1);
721 value
|= S_030800_INSTANCE_INDEX(instance
);
723 value
|= S_030800_INSTANCE_BROADCAST_WRITES(1);
726 radeon_set_uconfig_reg(cs
, R_030800_GRBM_GFX_INDEX
, value
);
/* Programs the SQ shader-stage windowing for perf counters: writes two
 * consecutive uconfig registers starting at SQ_PERFCOUNTER_CTRL — the
 * shader-stage mask (only the low 7 bits are meaningful, hence `& 0x7f`)
 * followed by an all-ones second control dword. */
729 static void si_pc_emit_shaders(struct si_context
*sctx
, unsigned shaders
)
731 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
733 radeon_set_uconfig_reg_seq(cs
, R_036780_SQ_PERFCOUNTER_CTRL
, 2);
734 radeon_emit(cs
, shaders
& 0x7f);
735 radeon_emit(cs
, 0xffffffff);
738 static void si_pc_emit_select(struct si_context
*sctx
, struct si_pc_block
*block
, unsigned count
,
741 struct si_pc_block_base
*regs
= block
->b
->b
;
742 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
744 unsigned layout_multi
= regs
->layout
& SI_PC_MULTI_MASK
;
747 assert(count
<= regs
->num_counters
);
749 if (regs
->layout
& SI_PC_FAKE
)
752 if (layout_multi
== SI_PC_MULTI_BLOCK
) {
753 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
755 dw
= count
+ regs
->num_prelude
;
756 if (count
>= regs
->num_multi
)
757 dw
+= regs
->num_multi
;
758 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, dw
);
759 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
761 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
762 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
764 if (count
< regs
->num_multi
) {
765 unsigned select1
= regs
->select0
+ 4 * regs
->num_multi
;
766 radeon_set_uconfig_reg_seq(cs
, select1
, count
);
769 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
772 if (count
> regs
->num_multi
) {
773 for (idx
= regs
->num_multi
; idx
< count
; ++idx
)
774 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
776 } else if (layout_multi
== SI_PC_MULTI_TAIL
) {
777 unsigned select1
, select1_count
;
779 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
781 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, count
+ regs
->num_prelude
);
782 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
784 for (idx
= 0; idx
< count
; ++idx
)
785 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
787 select1
= regs
->select0
+ 4 * regs
->num_counters
;
788 select1_count
= MIN2(count
, regs
->num_multi
);
789 radeon_set_uconfig_reg_seq(cs
, select1
, select1_count
);
790 for (idx
= 0; idx
< select1_count
; ++idx
)
792 } else if (layout_multi
== SI_PC_MULTI_CUSTOM
) {
793 unsigned *reg
= regs
->select
;
794 for (idx
= 0; idx
< count
; ++idx
) {
795 radeon_set_uconfig_reg(cs
, *reg
++, selectors
[idx
] | regs
->select_or
);
796 if (idx
< regs
->num_multi
)
797 radeon_set_uconfig_reg(cs
, *reg
++, 0);
800 assert(layout_multi
== SI_PC_MULTI_ALTERNATE
);
802 unsigned reg_base
= regs
->select0
;
803 unsigned reg_count
= count
+ MIN2(count
, regs
->num_multi
);
804 reg_count
+= regs
->num_prelude
;
806 if (!(regs
->layout
& SI_PC_REG_REVERSE
)) {
807 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
809 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
811 for (idx
= 0; idx
< count
; ++idx
) {
812 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
813 if (idx
< regs
->num_multi
)
817 reg_base
-= (reg_count
- 1) * 4;
818 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
820 for (idx
= count
; idx
> 0; --idx
) {
821 if (idx
<= regs
->num_multi
)
823 radeon_emit(cs
, selectors
[idx
- 1] | regs
->select_or
);
825 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
/* Emits the command-stream sequence that starts perf counting:
 *  1. CP copy-data writes the immediate value 1 into the result buffer at
 *     `va` (used later by si_pc_emit_stop's wait as a fence slot),
 *  2. CP_PERFMON_CNTL = DISABLE_AND_RESET to clear all counters,
 *  3. a PERFCOUNTER_START event,
 *  4. CP_PERFMON_CNTL = START_COUNTING.
 * The buffer is also registered with the CS here (relied upon by
 * si_pc_emit_stop — see its header comment). */
831 static void si_pc_emit_start(struct si_context
*sctx
, struct si_resource
*buffer
, uint64_t va
)
833 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
835 si_cp_copy_data(sctx
, sctx
->gfx_cs
, COPY_DATA_DST_MEM
, buffer
, va
- buffer
->gpu_address
,
836 COPY_DATA_IMM
, NULL
, 1);
838 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
839 S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET
));
840 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
841 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_START
) | EVENT_INDEX(0));
842 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
843 S_036020_PERFMON_STATE(V_036020_START_COUNTING
));
846 /* Note: The buffer was already added in si_pc_emit_start, so we don't have to
847 * do it again in here. */
/* Emits the command-stream sequence that stops perf counting:
 *  1. bottom-of-pipe release_mem writes 0 into the fence slot at `va`
 *     (which si_pc_emit_start initialized to 1),
 *  2. wait_mem stalls the CP until that slot reads 0, i.e. until all prior
 *     work has drained,
 *  3. PERFCOUNTER_SAMPLE then PERFCOUNTER_STOP events,
 *  4. CP_PERFMON_CNTL = STOP_COUNTING with sampling enabled so the counter
 *     values can be read back. */
848 static void si_pc_emit_stop(struct si_context
*sctx
, struct si_resource
*buffer
, uint64_t va
)
850 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
852 si_cp_release_mem(sctx
, cs
, V_028A90_BOTTOM_OF_PIPE_TS
, 0, EOP_DST_SEL_MEM
, EOP_INT_SEL_NONE
,
853 EOP_DATA_SEL_VALUE_32BIT
, buffer
, va
, 0, SI_NOT_QUERY
);
854 si_cp_wait_mem(sctx
, cs
, va
, 0, 0xffffffff, WAIT_REG_MEM_EQUAL
);
856 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
857 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE
) | EVENT_INDEX(0));
858 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
859 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP
) | EVENT_INDEX(0));
860 radeon_set_uconfig_reg(
861 cs
, R_036020_CP_PERFMON_CNTL
,
862 S_036020_PERFMON_STATE(V_036020_STOP_COUNTING
) | S_036020_PERFMON_SAMPLE_ENABLE(1));
/* Emits COPY_DATA packets that read back `count` counter results into GPU
 * memory. Real blocks (no SI_PC_FAKE) copy 64 bits from each perf-counter
 * register pair starting at regs->counter0_lo, stepping by 8 bytes per
 * counter (negated when the block uses SI_PC_REG_REVERSE); blocks with an
 * explicit regs->counters[] table use those addresses instead. Fake blocks
 * write an immediate 0 per counter so the result layout stays uniform.
 * `va` advances by one uint64 per counter.
 * NOTE(review): several lines were dropped by extraction (e.g. the low-32-bit
 * va emit between original lines 886 and 888, and the reg advance / loop
 * closers); surviving text is verbatim. */
865 static void si_pc_emit_read(struct si_context
*sctx
, struct si_pc_block
*block
, unsigned count
,
868 struct si_pc_block_base
*regs
= block
->b
->b
;
869 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
871 unsigned reg
= regs
->counter0_lo
;
872 unsigned reg_delta
= 8;
874 if (!(regs
->layout
& SI_PC_FAKE
)) {
875 if (regs
->layout
& SI_PC_REG_REVERSE
)
876 reg_delta
= -reg_delta
;
878 for (idx
= 0; idx
< count
; ++idx
) {
880 reg
= regs
->counters
[idx
];
882 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
883 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_PERF
) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM
) |
884 COPY_DATA_COUNT_SEL
); /* 64 bits */
885 radeon_emit(cs
, reg
>> 2);
886 radeon_emit(cs
, 0); /* unused */
888 radeon_emit(cs
, va
>> 32);
889 va
+= sizeof(uint64_t);
893 for (idx
= 0; idx
< count
; ++idx
) {
894 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
895 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_IMM
) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM
) |
896 COPY_DATA_COUNT_SEL
);
897 radeon_emit(cs
, 0); /* immediate */
900 radeon_emit(cs
, va
>> 32);
901 va
+= sizeof(uint64_t);
/* Destroys a perf-counter batch query: walks and frees the singly linked
 * group list, frees the per-counter mapping array, and releases the result
 * buffer chain.
 * NOTE(review): the FREE(group) inside the while loop and the final
 * FREE(query) fall in lines dropped by extraction (numbering jumps
 * 912 -> 916 -> 918). */
906 static void si_pc_query_destroy(struct si_context
*sctx
, struct si_query
*squery
)
908 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
910 while (query
->groups
) {
911 struct si_query_group
*group
= query
->groups
;
912 query
->groups
= group
->next
;
916 FREE(query
->counters
);
918 si_query_buffer_destroy(sctx
->screen
, &query
->buffer
);
/* (Re)starts a perf-counter query: allocates result-buffer space, programs
 * the SQ shader windowing mask, then for each counter group programs its
 * selector registers — switching GRBM_GFX_INDEX only when the (se, instance)
 * target changes, and restoring broadcast (-1, -1) afterwards — and finally
 * emits the start sequence targeting the current end of the result buffer.
 * NOTE(review): the early-return on allocation failure, the `current_se`
 * declaration, and several closers fall in lines dropped by extraction
 * (numbering jumps 929 -> 931 -> 933); surviving text is verbatim. */
922 static void si_pc_query_resume(struct si_context
*sctx
, struct si_query
*squery
)
924 struct si_query_hw *hwquery,
925 struct si_resource *buffer, uint64_t va)*/
927 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
929 int current_instance
= -1;
931 if (!si_query_buffer_alloc(sctx
, &query
->buffer
, NULL
, query
->result_size
))
933 si_need_gfx_cs_space(sctx
);
936 si_pc_emit_shaders(sctx
, query
->shaders
);
938 for (struct si_query_group
*group
= query
->groups
; group
; group
= group
->next
) {
939 struct si_pc_block
*block
= group
->block
;
941 if (group
->se
!= current_se
|| group
->instance
!= current_instance
) {
942 current_se
= group
->se
;
943 current_instance
= group
->instance
;
944 si_pc_emit_instance(sctx
, group
->se
, group
->instance
);
947 si_pc_emit_select(sctx
, block
, group
->num_counters
, group
->selectors
);
950 if (current_se
!= -1 || current_instance
!= -1)
951 si_pc_emit_instance(sctx
, -1, -1);
953 uint64_t va
= query
->buffer
.buf
->gpu_address
+ query
->buffer
.results_end
;
954 si_pc_emit_start(sctx
, query
->buffer
.buf
, va
);
/* Stops a running perf-counter query and reads back its results.
 * Emits the stop sequence at the current result offset and advances
 * buffer.results_end, then for each group iterates its shader-engine range
 * (all SEs when the block is per-SE and group->se < 0, else just the one)
 * and its instance range (all instances when group->instance < 0), selecting
 * each (se, instance) via GRBM_GFX_INDEX and emitting counter reads; `va`
 * advances by one uint64 per counter read. Broadcast is restored at the end.
 * NOTE(review): the `do {` openers matching the two `} while (...)` closers
 * were dropped by extraction (numbering jumps 978 -> 981); surviving text is
 * verbatim. */
957 static void si_pc_query_suspend(struct si_context
*sctx
, struct si_query
*squery
)
959 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
961 if (!query
->buffer
.buf
)
964 uint64_t va
= query
->buffer
.buf
->gpu_address
+ query
->buffer
.results_end
;
965 query
->buffer
.results_end
+= query
->result_size
;
967 si_pc_emit_stop(sctx
, query
->buffer
.buf
, va
);
969 for (struct si_query_group
*group
= query
->groups
; group
; group
= group
->next
) {
970 struct si_pc_block
*block
= group
->block
;
971 unsigned se
= group
->se
>= 0 ? group
->se
: 0;
972 unsigned se_end
= se
+ 1;
974 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && (group
->se
< 0))
975 se_end
= sctx
->screen
->info
.max_se
;
978 unsigned instance
= group
->instance
>= 0 ? group
->instance
: 0;
981 si_pc_emit_instance(sctx
, se
, instance
);
982 si_pc_emit_read(sctx
, block
, group
->num_counters
, va
);
983 va
+= sizeof(uint64_t) * group
->num_counters
;
984 } while (group
->instance
< 0 && ++instance
< block
->num_instances
);
985 } while (++se
< se_end
);
988 si_pc_emit_instance(sctx
, -1, -1);
/* Begins a perf-counter batch query: resets the result buffer chain,
 * registers the query on the context's active list, accounts for the command
 * dwords the eventual suspend will need, and starts the counters via
 * si_pc_query_resume.
 * NOTE(review): the function's return statement falls in a line dropped by
 * extraction (numbering ends at 1000). */
991 static bool si_pc_query_begin(struct si_context
*ctx
, struct si_query
*squery
)
993 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
995 si_query_buffer_reset(ctx
, &query
->buffer
);
997 list_addtail(&query
->b
.active_list
, &ctx
->active_queries
);
998 ctx
->num_cs_dw_queries_suspend
+= query
->b
.num_cs_dw_suspend
;
1000 si_pc_query_resume(ctx
, squery
);
/* Ends a perf-counter batch query: stops the counters and reads results via
 * si_pc_query_suspend, removes the query from the active list, undoes the
 * suspend dword accounting from si_pc_query_begin, and reports success as
 * "a result buffer exists" (allocation may have failed earlier). */
1005 static bool si_pc_query_end(struct si_context
*ctx
, struct si_query
*squery
)
1007 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
1009 si_pc_query_suspend(ctx
, squery
);
1011 list_del(&squery
->active_list
);
1012 ctx
->num_cs_dw_queries_suspend
-= squery
->num_cs_dw_suspend
;
1014 return query
->buffer
.buf
!= NULL
;
/* Accumulates one mapped result-buffer snapshot into the caller-visible
 * batch result. For each user counter, sums its `qwords` slices (one per
 * SE/instance when the group spans several) located at
 * results[base + j * stride] into result->batch[i].u64.
 * NOTE(review): each 64-bit result slot is narrowed through a uint32_t
 * before accumulation — presumably because the HW counters are only 32 bits
 * valid; confirm against the register spec before changing. Loop closers
 * were dropped by extraction; surviving text is verbatim. */
1017 static void si_pc_query_add_result(struct si_query_pc
*query
, void *buffer
,
1018 union pipe_query_result
*result
)
1020 uint64_t *results
= buffer
;
1023 for (i
= 0; i
< query
->num_counters
; ++i
) {
1024 struct si_query_counter
*counter
= &query
->counters
[i
];
1026 for (j
= 0; j
< counter
->qwords
; ++j
) {
1027 uint32_t value
= results
[counter
->base
+ j
* counter
->stride
];
1028 result
->batch
[i
].u64
+= value
;
/* Collects the final results of a perf-counter batch query. Zeroes the
 * output, then walks the chain of result buffers (newest first via
 * qbuf->previous); each buffer is mapped — directly when the query was
 * already flushed, otherwise synchronized against in-flight rings — and every
 * result_size-sized snapshot in it is accumulated via
 * si_pc_query_add_result. `wait` controls whether mapping may block
 * (PIPE_TRANSFER_DONTBLOCK when false).
 * NOTE(review): the NULL-map early return and the final `return true` fall
 * in lines dropped by extraction (numbering jumps 1048 -> 1053 and ends at
 * 1055); surviving text is verbatim. */
1033 static bool si_pc_query_get_result(struct si_context
*sctx
, struct si_query
*squery
, bool wait
,
1034 union pipe_query_result
*result
)
1036 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
1038 memset(result
, 0, sizeof(result
->batch
[0]) * query
->num_counters
);
1040 for (struct si_query_buffer
*qbuf
= &query
->buffer
; qbuf
; qbuf
= qbuf
->previous
) {
1041 unsigned usage
= PIPE_TRANSFER_READ
| (wait
? 0 : PIPE_TRANSFER_DONTBLOCK
);
1042 unsigned results_base
= 0;
1045 if (squery
->b
.flushed
)
1046 map
= sctx
->ws
->buffer_map(qbuf
->buf
->buf
, NULL
, usage
);
1048 map
= si_buffer_map_sync_with_rings(sctx
, qbuf
->buf
, usage
);
1053 while (results_base
!= qbuf
->results_end
) {
1054 si_pc_query_add_result(query
, map
+ results_base
, result
);
1055 results_base
+= query
->result_size
;
/* Virtual-function table wiring the perf-counter batch query into the
 * generic si_query machinery; each member points at the si_pc_query_*
 * implementation defined above. */
1062 static const struct si_query_ops batch_query_ops
= {
1063 .destroy
= si_pc_query_destroy
,
1064 .begin
= si_pc_query_begin
,
1065 .end
= si_pc_query_end
,
1066 .get_result
= si_pc_query_get_result
,
1068 .suspend
= si_pc_query_suspend
,
1069 .resume
= si_pc_query_resume
,
1072 static struct si_query_group
*get_group_state(struct si_screen
*screen
, struct si_query_pc
*query
,
1073 struct si_pc_block
*block
, unsigned sub_gid
)
1075 struct si_query_group
*group
= query
->groups
;
1078 if (group
->block
== block
&& group
->sub_gid
== sub_gid
)
1080 group
= group
->next
;
1083 group
= CALLOC_STRUCT(si_query_group
);
1087 group
->block
= block
;
1088 group
->sub_gid
= sub_gid
;
1090 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
1091 unsigned sub_gids
= block
->num_instances
;
1094 unsigned query_shaders
;
1096 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
))
1097 sub_gids
= sub_gids
* screen
->info
.max_se
;
1098 shader_id
= sub_gid
/ sub_gids
;
1099 sub_gid
= sub_gid
% sub_gids
;
1101 shaders
= si_pc_shader_type_bits
[shader_id
];
1103 query_shaders
= query
->shaders
& ~SI_PC_SHADERS_WINDOWING
;
1104 if (query_shaders
&& query_shaders
!= shaders
) {
1105 fprintf(stderr
, "si_perfcounter: incompatible shader groups\n");
1109 query
->shaders
= shaders
;
1112 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER_WINDOWED
&& !query
->shaders
) {
1113 // A non-zero value in query->shaders ensures that the shader
1114 // masking is reset unless the user explicitly requests one.
1115 query
->shaders
= SI_PC_SHADERS_WINDOWING
;
1118 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
)) {
1119 group
->se
= sub_gid
/ block
->num_instances
;
1120 sub_gid
= sub_gid
% block
->num_instances
;
1125 if (si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
)) {
1126 group
->instance
= sub_gid
;
1128 group
->instance
= -1;
1131 group
->next
= query
->groups
;
1132 query
->groups
= group
;
1137 struct pipe_query
*si_create_batch_query(struct pipe_context
*ctx
, unsigned num_queries
,
1138 unsigned *query_types
)
1140 struct si_screen
*screen
= (struct si_screen
*)ctx
->screen
;
1141 struct si_perfcounters
*pc
= screen
->perfcounters
;
1142 struct si_pc_block
*block
;
1143 struct si_query_group
*group
;
1144 struct si_query_pc
*query
;
1145 unsigned base_gid
, sub_gid
, sub_index
;
1151 query
= CALLOC_STRUCT(si_query_pc
);
1155 query
->b
.ops
= &batch_query_ops
;
1157 query
->num_counters
= num_queries
;
1159 /* Collect selectors per group */
1160 for (i
= 0; i
< num_queries
; ++i
) {
1163 if (query_types
[i
] < SI_QUERY_FIRST_PERFCOUNTER
)
1167 lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
, &base_gid
, &sub_index
);
1171 sub_gid
= sub_index
/ block
->b
->selectors
;
1172 sub_index
= sub_index
% block
->b
->selectors
;
1174 group
= get_group_state(screen
, query
, block
, sub_gid
);
1178 if (group
->num_counters
>= block
->b
->b
->num_counters
) {
1179 fprintf(stderr
, "perfcounter group %s: too many selected\n", block
->b
->b
->name
);
1182 group
->selectors
[group
->num_counters
] = sub_index
;
1183 ++group
->num_counters
;
1186 /* Compute result bases and CS size per group */
1187 query
->b
.num_cs_dw_suspend
= pc
->num_stop_cs_dwords
;
1188 query
->b
.num_cs_dw_suspend
+= pc
->num_instance_cs_dwords
;
1191 for (group
= query
->groups
; group
; group
= group
->next
) {
1192 struct si_pc_block
*block
= group
->block
;
1194 unsigned instances
= 1;
1196 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
1197 instances
= screen
->info
.max_se
;
1198 if (group
->instance
< 0)
1199 instances
*= block
->num_instances
;
1201 group
->result_base
= i
;
1202 query
->result_size
+= sizeof(uint64_t) * instances
* group
->num_counters
;
1203 i
+= instances
* group
->num_counters
;
1205 read_dw
= 6 * group
->num_counters
;
1206 query
->b
.num_cs_dw_suspend
+= instances
* read_dw
;
1207 query
->b
.num_cs_dw_suspend
+= instances
* pc
->num_instance_cs_dwords
;
1210 if (query
->shaders
) {
1211 if (query
->shaders
== SI_PC_SHADERS_WINDOWING
)
1212 query
->shaders
= 0xffffffff;
1215 /* Map user-supplied query array to result indices */
1216 query
->counters
= CALLOC(num_queries
, sizeof(*query
->counters
));
1217 for (i
= 0; i
< num_queries
; ++i
) {
1218 struct si_query_counter
*counter
= &query
->counters
[i
];
1219 struct si_pc_block
*block
;
1222 lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
, &base_gid
, &sub_index
);
1224 sub_gid
= sub_index
/ block
->b
->selectors
;
1225 sub_index
= sub_index
% block
->b
->selectors
;
1227 group
= get_group_state(screen
, query
, block
, sub_gid
);
1228 assert(group
!= NULL
);
1230 for (j
= 0; j
< group
->num_counters
; ++j
) {
1231 if (group
->selectors
[j
] == sub_index
)
1235 counter
->base
= group
->result_base
+ j
;
1236 counter
->stride
= group
->num_counters
;
1238 counter
->qwords
= 1;
1239 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
1240 counter
->qwords
= screen
->info
.max_se
;
1241 if (group
->instance
< 0)
1242 counter
->qwords
*= block
->num_instances
;
1245 return (struct pipe_query
*)query
;
1248 si_pc_query_destroy((struct si_context
*)ctx
, &query
->b
);
1252 static bool si_init_block_names(struct si_screen
*screen
, struct si_pc_block
*block
)
1254 bool per_instance_groups
= si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
);
1255 bool per_se_groups
= si_pc_block_has_per_se_groups(screen
->perfcounters
, block
);
1257 unsigned groups_shader
= 1, groups_se
= 1, groups_instance
= 1;
1262 if (per_instance_groups
)
1263 groups_instance
= block
->num_instances
;
1265 groups_se
= screen
->info
.max_se
;
1266 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1267 groups_shader
= ARRAY_SIZE(si_pc_shader_type_bits
);
1269 namelen
= strlen(block
->b
->b
->name
);
1270 block
->group_name_stride
= namelen
+ 1;
1271 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1272 block
->group_name_stride
+= 3;
1273 if (per_se_groups
) {
1274 assert(groups_se
<= 10);
1275 block
->group_name_stride
+= 1;
1277 if (per_instance_groups
)
1278 block
->group_name_stride
+= 1;
1280 if (per_instance_groups
) {
1281 assert(groups_instance
<= 100);
1282 block
->group_name_stride
+= 2;
1285 block
->group_names
= MALLOC(block
->num_groups
* block
->group_name_stride
);
1286 if (!block
->group_names
)
1289 groupname
= block
->group_names
;
1290 for (i
= 0; i
< groups_shader
; ++i
) {
1291 const char *shader_suffix
= si_pc_shader_type_suffixes
[i
];
1292 unsigned shaderlen
= strlen(shader_suffix
);
1293 for (j
= 0; j
< groups_se
; ++j
) {
1294 for (k
= 0; k
< groups_instance
; ++k
) {
1295 strcpy(groupname
, block
->b
->b
->name
);
1296 p
= groupname
+ namelen
;
1298 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
1299 strcpy(p
, shader_suffix
);
1303 if (per_se_groups
) {
1304 p
+= sprintf(p
, "%d", j
);
1305 if (per_instance_groups
)
1309 if (per_instance_groups
)
1310 p
+= sprintf(p
, "%d", k
);
1312 groupname
+= block
->group_name_stride
;
1317 assert(block
->b
->selectors
<= 1000);
1318 block
->selector_name_stride
= block
->group_name_stride
+ 4;
1319 block
->selector_names
=
1320 MALLOC(block
->num_groups
* block
->b
->selectors
* block
->selector_name_stride
);
1321 if (!block
->selector_names
)
1324 groupname
= block
->group_names
;
1325 p
= block
->selector_names
;
1326 for (i
= 0; i
< block
->num_groups
; ++i
) {
1327 for (j
= 0; j
< block
->b
->selectors
; ++j
) {
1328 sprintf(p
, "%s_%03d", groupname
, j
);
1329 p
+= block
->selector_name_stride
;
1331 groupname
+= block
->group_name_stride
;
1337 int si_get_perfcounter_info(struct si_screen
*screen
, unsigned index
,
1338 struct pipe_driver_query_info
*info
)
1340 struct si_perfcounters
*pc
= screen
->perfcounters
;
1341 struct si_pc_block
*block
;
1342 unsigned base_gid
, sub
;
1348 unsigned bid
, num_queries
= 0;
1350 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
) {
1351 num_queries
+= pc
->blocks
[bid
].b
->selectors
* pc
->blocks
[bid
].num_groups
;
1357 block
= lookup_counter(pc
, index
, &base_gid
, &sub
);
1361 if (!block
->selector_names
) {
1362 if (!si_init_block_names(screen
, block
))
1365 info
->name
= block
->selector_names
+ sub
* block
->selector_name_stride
;
1366 info
->query_type
= SI_QUERY_FIRST_PERFCOUNTER
+ index
;
1367 info
->max_value
.u64
= 0;
1368 info
->type
= PIPE_DRIVER_QUERY_TYPE_UINT64
;
1369 info
->result_type
= PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE
;
1370 info
->group_id
= base_gid
+ sub
/ block
->b
->selectors
;
1371 info
->flags
= PIPE_DRIVER_QUERY_FLAG_BATCH
;
1372 if (sub
> 0 && sub
+ 1 < block
->b
->selectors
* block
->num_groups
)
1373 info
->flags
|= PIPE_DRIVER_QUERY_FLAG_DONT_LIST
;
1377 int si_get_perfcounter_group_info(struct si_screen
*screen
, unsigned index
,
1378 struct pipe_driver_query_group_info
*info
)
1380 struct si_perfcounters
*pc
= screen
->perfcounters
;
1381 struct si_pc_block
*block
;
1387 return pc
->num_groups
;
1389 block
= lookup_group(pc
, &index
);
1393 if (!block
->group_names
) {
1394 if (!si_init_block_names(screen
, block
))
1397 info
->name
= block
->group_names
+ index
* block
->group_name_stride
;
1398 info
->num_queries
= block
->b
->selectors
;
1399 info
->max_active_queries
= block
->b
->b
->num_counters
;
1403 void si_destroy_perfcounters(struct si_screen
*screen
)
1405 struct si_perfcounters
*pc
= screen
->perfcounters
;
1411 for (i
= 0; i
< pc
->num_blocks
; ++i
) {
1412 FREE(pc
->blocks
[i
].group_names
);
1413 FREE(pc
->blocks
[i
].selector_names
);
1417 screen
->perfcounters
= NULL
;
1420 void si_init_perfcounters(struct si_screen
*screen
)
1422 struct si_perfcounters
*pc
;
1423 const struct si_pc_block_gfxdescr
*blocks
;
1424 unsigned num_blocks
;
1427 switch (screen
->info
.chip_class
) {
1429 blocks
= groups_CIK
;
1430 num_blocks
= ARRAY_SIZE(groups_CIK
);
1434 num_blocks
= ARRAY_SIZE(groups_VI
);
1437 blocks
= groups_gfx9
;
1438 num_blocks
= ARRAY_SIZE(groups_gfx9
);
1442 blocks
= groups_gfx10
;
1443 num_blocks
= ARRAY_SIZE(groups_gfx10
);
1447 return; /* not implemented */
1450 screen
->perfcounters
= pc
= CALLOC_STRUCT(si_perfcounters
);
1454 pc
->num_stop_cs_dwords
= 14 + si_cp_write_fence_dwords(screen
);
1455 pc
->num_instance_cs_dwords
= 3;
1457 pc
->separate_se
= debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
1458 pc
->separate_instance
= debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
1460 pc
->blocks
= CALLOC(num_blocks
, sizeof(struct si_pc_block
));
1463 pc
->num_blocks
= num_blocks
;
1465 for (i
= 0; i
< num_blocks
; ++i
) {
1466 struct si_pc_block
*block
= &pc
->blocks
[i
];
1467 block
->b
= &blocks
[i
];
1468 block
->num_instances
= MAX2(1, block
->b
->instances
);
1470 if (!strcmp(block
->b
->b
->name
, "CB") ||
1471 !strcmp(block
->b
->b
->name
, "DB") ||
1472 !strcmp(block
->b
->b
->name
, "RMI"))
1473 block
->num_instances
= screen
->info
.max_se
;
1474 else if (!strcmp(block
->b
->b
->name
, "TCC"))
1475 block
->num_instances
= screen
->info
.num_tcc_blocks
;
1476 else if (!strcmp(block
->b
->b
->name
, "IA"))
1477 block
->num_instances
= MAX2(1, screen
->info
.max_se
/ 2);
1478 else if (!strcmp(block
->b
->b
->name
, "TA") ||
1479 !strcmp(block
->b
->b
->name
, "TCP") ||
1480 !strcmp(block
->b
->b
->name
, "TD")) {
1481 block
->num_instances
= MAX2(1, screen
->info
.max_good_cu_per_sa
);
1484 if (si_pc_block_has_per_instance_groups(pc
, block
)) {
1485 block
->num_groups
= block
->num_instances
;
1487 block
->num_groups
= 1;
1490 if (si_pc_block_has_per_se_groups(pc
, block
))
1491 block
->num_groups
*= screen
->info
.max_se
;
1492 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1493 block
->num_groups
*= ARRAY_SIZE(si_pc_shader_type_bits
);
1495 pc
->num_groups
+= block
->num_groups
;
1501 si_destroy_perfcounters(screen
);