2 * Copyright 2015 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 #include "si_build_pm4.h"
27 #include "util/u_memory.h"
29 enum si_pc_block_flags
31 /* This block is part of the shader engine */
32 SI_PC_BLOCK_SE
= (1 << 0),
34 /* Expose per-instance groups instead of summing all instances (within
36 SI_PC_BLOCK_INSTANCE_GROUPS
= (1 << 1),
38 /* Expose per-SE groups instead of summing instances across SEs. */
39 SI_PC_BLOCK_SE_GROUPS
= (1 << 2),
42 SI_PC_BLOCK_SHADER
= (1 << 3),
44 /* Non-shader block with perfcounters windowed by shaders. */
45 SI_PC_BLOCK_SHADER_WINDOWED
= (1 << 4),
50 /* All secondary selector dwords follow as one block after the primary
51 * selector dwords for the counters that have secondary selectors.
56 * PERFCOUNTER0_SELECT1
57 * PERFCOUNTER1_SELECT1
61 SI_PC_MULTI_BLOCK
= 0,
63 /* Each secondary selector dword follows immediately after the
64 * corresponding primary.
68 * PERFCOUNTER0_SELECT1
70 * PERFCOUNTER1_SELECT1
74 SI_PC_MULTI_ALTERNATE
= 1,
76 /* All secondary selector dwords follow as one block after all primary
84 * PERFCOUNTER0_SELECT1
85 * PERFCOUNTER1_SELECT1
89 /* Free-form arrangement of selector registers. */
90 SI_PC_MULTI_CUSTOM
= 3,
94 /* Registers are laid out in decreasing rather than increasing order. */
95 SI_PC_REG_REVERSE
= 4,
100 struct si_pc_block_base
{
102 unsigned num_counters
;
107 unsigned counter0_lo
;
111 unsigned num_prelude
;
115 struct si_pc_block_gfxdescr
{
116 struct si_pc_block_base
*b
;
122 const struct si_pc_block_gfxdescr
*b
;
123 unsigned num_instances
;
127 unsigned group_name_stride
;
129 char *selector_names
;
130 unsigned selector_name_stride
;
133 /* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
134 * performance counter group IDs.
136 static const char *const si_pc_shader_type_suffixes
[] = {"", "_ES", "_GS", "_VS",
137 "_PS", "_LS", "_HS", "_CS"};
139 static const unsigned si_pc_shader_type_bits
[] = {
150 /* Max counters per HW block */
151 #define SI_QUERY_MAX_COUNTERS 16
153 #define SI_PC_SHADERS_WINDOWING (1u << 31)
155 struct si_query_group
{
156 struct si_query_group
*next
;
157 struct si_pc_block
*block
;
158 unsigned sub_gid
; /* only used during init */
159 unsigned result_base
; /* only used during init */
162 unsigned num_counters
;
163 unsigned selectors
[SI_QUERY_MAX_COUNTERS
];
166 struct si_query_counter
{
169 unsigned stride
; /* in uint64s */
174 struct si_query_buffer buffer
;
176 /* Size of the results in memory, in bytes. */
177 unsigned result_size
;
180 unsigned num_counters
;
181 struct si_query_counter
*counters
;
182 struct si_query_group
*groups
;
185 static struct si_pc_block_base cik_CB
= {
188 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
190 .select0
= R_037000_CB_PERFCOUNTER_FILTER
,
191 .counter0_lo
= R_035018_CB_PERFCOUNTER0_LO
,
194 .layout
= SI_PC_MULTI_ALTERNATE
,
197 static unsigned cik_CPC_select
[] = {
198 R_036024_CPC_PERFCOUNTER0_SELECT
,
199 R_036010_CPC_PERFCOUNTER0_SELECT1
,
200 R_03600C_CPC_PERFCOUNTER1_SELECT
,
202 static struct si_pc_block_base cik_CPC
= {
206 .select
= cik_CPC_select
,
207 .counter0_lo
= R_034018_CPC_PERFCOUNTER0_LO
,
209 .layout
= SI_PC_MULTI_CUSTOM
| SI_PC_REG_REVERSE
,
212 static struct si_pc_block_base cik_CPF
= {
216 .select0
= R_03601C_CPF_PERFCOUNTER0_SELECT
,
217 .counter0_lo
= R_034028_CPF_PERFCOUNTER0_LO
,
219 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
222 static struct si_pc_block_base cik_CPG
= {
226 .select0
= R_036008_CPG_PERFCOUNTER0_SELECT
,
227 .counter0_lo
= R_034008_CPG_PERFCOUNTER0_LO
,
229 .layout
= SI_PC_MULTI_ALTERNATE
| SI_PC_REG_REVERSE
,
232 static struct si_pc_block_base cik_DB
= {
235 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
237 .select0
= R_037100_DB_PERFCOUNTER0_SELECT
,
238 .counter0_lo
= R_035100_DB_PERFCOUNTER0_LO
,
239 .num_multi
= 3, // really only 2, but there's a gap between registers
240 .layout
= SI_PC_MULTI_ALTERNATE
,
243 static struct si_pc_block_base cik_GDS
= {
247 .select0
= R_036A00_GDS_PERFCOUNTER0_SELECT
,
248 .counter0_lo
= R_034A00_GDS_PERFCOUNTER0_LO
,
250 .layout
= SI_PC_MULTI_TAIL
,
253 static unsigned cik_GRBM_counters
[] = {
254 R_034100_GRBM_PERFCOUNTER0_LO
,
255 R_03410C_GRBM_PERFCOUNTER1_LO
,
257 static struct si_pc_block_base cik_GRBM
= {
261 .select0
= R_036100_GRBM_PERFCOUNTER0_SELECT
,
262 .counters
= cik_GRBM_counters
,
265 static struct si_pc_block_base cik_GRBMSE
= {
269 .select0
= R_036108_GRBM_SE0_PERFCOUNTER_SELECT
,
270 .counter0_lo
= R_034114_GRBM_SE0_PERFCOUNTER_LO
,
273 static struct si_pc_block_base cik_IA
= {
277 .select0
= R_036210_IA_PERFCOUNTER0_SELECT
,
278 .counter0_lo
= R_034220_IA_PERFCOUNTER0_LO
,
280 .layout
= SI_PC_MULTI_TAIL
,
283 static struct si_pc_block_base cik_PA_SC
= {
286 .flags
= SI_PC_BLOCK_SE
,
288 .select0
= R_036500_PA_SC_PERFCOUNTER0_SELECT
,
289 .counter0_lo
= R_034500_PA_SC_PERFCOUNTER0_LO
,
291 .layout
= SI_PC_MULTI_ALTERNATE
,
294 /* According to docs, PA_SU counters are only 48 bits wide. */
295 static struct si_pc_block_base cik_PA_SU
= {
298 .flags
= SI_PC_BLOCK_SE
,
300 .select0
= R_036400_PA_SU_PERFCOUNTER0_SELECT
,
301 .counter0_lo
= R_034400_PA_SU_PERFCOUNTER0_LO
,
303 .layout
= SI_PC_MULTI_ALTERNATE
,
306 static struct si_pc_block_base cik_SPI
= {
309 .flags
= SI_PC_BLOCK_SE
,
311 .select0
= R_036600_SPI_PERFCOUNTER0_SELECT
,
312 .counter0_lo
= R_034604_SPI_PERFCOUNTER0_LO
,
314 .layout
= SI_PC_MULTI_BLOCK
,
317 static struct si_pc_block_base cik_SQ
= {
320 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER
,
322 .select0
= R_036700_SQ_PERFCOUNTER0_SELECT
,
323 .select_or
= S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15),
324 .counter0_lo
= R_034700_SQ_PERFCOUNTER0_LO
,
327 static struct si_pc_block_base cik_SX
= {
330 .flags
= SI_PC_BLOCK_SE
,
332 .select0
= R_036900_SX_PERFCOUNTER0_SELECT
,
333 .counter0_lo
= R_034900_SX_PERFCOUNTER0_LO
,
335 .layout
= SI_PC_MULTI_TAIL
,
338 static struct si_pc_block_base cik_TA
= {
341 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
343 .select0
= R_036B00_TA_PERFCOUNTER0_SELECT
,
344 .counter0_lo
= R_034B00_TA_PERFCOUNTER0_LO
,
346 .layout
= SI_PC_MULTI_ALTERNATE
,
349 static struct si_pc_block_base cik_TD
= {
352 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
354 .select0
= R_036C00_TD_PERFCOUNTER0_SELECT
,
355 .counter0_lo
= R_034C00_TD_PERFCOUNTER0_LO
,
357 .layout
= SI_PC_MULTI_ALTERNATE
,
360 static struct si_pc_block_base cik_TCA
= {
363 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
365 .select0
= R_036E40_TCA_PERFCOUNTER0_SELECT
,
366 .counter0_lo
= R_034E40_TCA_PERFCOUNTER0_LO
,
368 .layout
= SI_PC_MULTI_ALTERNATE
,
371 static struct si_pc_block_base cik_TCC
= {
374 .flags
= SI_PC_BLOCK_INSTANCE_GROUPS
,
376 .select0
= R_036E00_TCC_PERFCOUNTER0_SELECT
,
377 .counter0_lo
= R_034E00_TCC_PERFCOUNTER0_LO
,
379 .layout
= SI_PC_MULTI_ALTERNATE
,
382 static struct si_pc_block_base cik_TCP
= {
385 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
| SI_PC_BLOCK_SHADER_WINDOWED
,
387 .select0
= R_036D00_TCP_PERFCOUNTER0_SELECT
,
388 .counter0_lo
= R_034D00_TCP_PERFCOUNTER0_LO
,
390 .layout
= SI_PC_MULTI_ALTERNATE
,
393 static struct si_pc_block_base cik_VGT
= {
396 .flags
= SI_PC_BLOCK_SE
,
398 .select0
= R_036230_VGT_PERFCOUNTER0_SELECT
,
399 .counter0_lo
= R_034240_VGT_PERFCOUNTER0_LO
,
401 .layout
= SI_PC_MULTI_TAIL
,
404 static struct si_pc_block_base cik_WD
= {
408 .select0
= R_036200_WD_PERFCOUNTER0_SELECT
,
409 .counter0_lo
= R_034200_WD_PERFCOUNTER0_LO
,
412 static struct si_pc_block_base cik_MC
= {
416 .layout
= SI_PC_FAKE
,
419 static struct si_pc_block_base cik_SRBM
= {
423 .layout
= SI_PC_FAKE
,
426 static struct si_pc_block_base gfx10_CHA
= {
430 .select0
= R_037780_CHA_PERFCOUNTER0_SELECT
,
431 .counter0_lo
= R_035800_CHA_PERFCOUNTER0_LO
,
433 .layout
= SI_PC_MULTI_ALTERNATE
,
436 static struct si_pc_block_base gfx10_CHCG
= {
440 .select0
= R_036F18_CHCG_PERFCOUNTER0_SELECT
,
441 .counter0_lo
= R_034F20_CHCG_PERFCOUNTER0_LO
,
443 .layout
= SI_PC_MULTI_ALTERNATE
,
446 static struct si_pc_block_base gfx10_CHC
= {
450 .select0
= R_036F00_CHC_PERFCOUNTER0_SELECT
,
451 .counter0_lo
= R_034F00_CHC_PERFCOUNTER0_LO
,
453 .layout
= SI_PC_MULTI_ALTERNATE
,
456 static struct si_pc_block_base gfx10_GCR
= {
460 .select0
= R_037580_GCR_PERFCOUNTER0_SELECT
,
461 .counter0_lo
= R_035480_GCR_PERFCOUNTER0_LO
,
463 .layout
= SI_PC_MULTI_ALTERNATE
,
466 static struct si_pc_block_base gfx10_GE
= {
470 .select0
= R_036200_GE_PERFCOUNTER0_SELECT
,
471 .counter0_lo
= R_034200_GE_PERFCOUNTER0_LO
,
473 .layout
= SI_PC_MULTI_ALTERNATE
,
476 static struct si_pc_block_base gfx10_GL1A
= {
479 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER_WINDOWED
,
481 .select0
= R_037700_GL1A_PERFCOUNTER0_SELECT
,
482 .counter0_lo
= R_035700_GL1A_PERFCOUNTER0_LO
,
484 .layout
= SI_PC_MULTI_ALTERNATE
,
487 static struct si_pc_block_base gfx10_GL1C
= {
490 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER_WINDOWED
,
492 .select0
= R_036E80_GL1C_PERFCOUNTER0_SELECT
,
493 .counter0_lo
= R_034E80_GL1C_PERFCOUNTER0_LO
,
495 .layout
= SI_PC_MULTI_ALTERNATE
,
498 static struct si_pc_block_base gfx10_GL2A
= {
502 .select0
= R_036E40_GL2A_PERFCOUNTER0_SELECT
,
503 .counter0_lo
= R_034E40_GL2A_PERFCOUNTER0_LO
,
505 .layout
= SI_PC_MULTI_ALTERNATE
,
508 static struct si_pc_block_base gfx10_GL2C
= {
512 .select0
= R_036E00_GL2C_PERFCOUNTER0_SELECT
,
513 .counter0_lo
= R_034E00_GL2C_PERFCOUNTER0_LO
,
515 .layout
= SI_PC_MULTI_ALTERNATE
,
518 static unsigned gfx10_PA_PH_select
[] = {
519 R_037600_PA_PH_PERFCOUNTER0_SELECT
,
520 R_037604_PA_PH_PERFCOUNTER0_SELECT1
,
521 R_037608_PA_PH_PERFCOUNTER1_SELECT
,
522 R_037640_PA_PH_PERFCOUNTER1_SELECT1
,
523 R_03760C_PA_PH_PERFCOUNTER2_SELECT
,
524 R_037644_PA_PH_PERFCOUNTER2_SELECT1
,
525 R_037610_PA_PH_PERFCOUNTER3_SELECT
,
526 R_037648_PA_PH_PERFCOUNTER3_SELECT1
,
527 R_037614_PA_PH_PERFCOUNTER4_SELECT
,
528 R_037618_PA_PH_PERFCOUNTER5_SELECT
,
529 R_03761C_PA_PH_PERFCOUNTER6_SELECT
,
530 R_037620_PA_PH_PERFCOUNTER7_SELECT
,
532 static struct si_pc_block_base gfx10_PA_PH
= {
535 .flags
= SI_PC_BLOCK_SE
,
537 .select
= gfx10_PA_PH_select
,
538 .counter0_lo
= R_035600_PA_PH_PERFCOUNTER0_LO
,
540 .layout
= SI_PC_MULTI_CUSTOM
,
543 static struct si_pc_block_base gfx10_PA_SU
= {
546 .flags
= SI_PC_BLOCK_SE
,
548 .select0
= R_036400_PA_SU_PERFCOUNTER0_SELECT
,
549 .counter0_lo
= R_034400_PA_SU_PERFCOUNTER0_LO
,
551 .layout
= SI_PC_MULTI_ALTERNATE
,
554 static struct si_pc_block_base gfx10_RLC
= {
558 .select0
= R_037304_RLC_PERFCOUNTER0_SELECT
,
559 .counter0_lo
= R_035200_RLC_PERFCOUNTER0_LO
,
561 .layout
= SI_PC_MULTI_ALTERNATE
,
564 static struct si_pc_block_base gfx10_RMI
= {
566 /* Actually 4, but the 2nd counter is missing the secondary selector while
567 * the 3rd counter has it, which complicates the register layout. */
569 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_INSTANCE_GROUPS
,
571 .select0
= R_037400_RMI_PERFCOUNTER0_SELECT
,
572 .counter0_lo
= R_035300_RMI_PERFCOUNTER0_LO
,
574 .layout
= SI_PC_MULTI_ALTERNATE
,
577 static struct si_pc_block_base gfx10_UTCL1
= {
580 .flags
= SI_PC_BLOCK_SE
| SI_PC_BLOCK_SHADER_WINDOWED
,
582 .select0
= R_03758C_UTCL1_PERFCOUNTER0_SELECT
,
583 .counter0_lo
= R_035470_UTCL1_PERFCOUNTER0_LO
,
585 .layout
= SI_PC_MULTI_ALTERNATE
,
588 /* Both the number of instances and selectors varies between chips of the same
589 * class. We only differentiate by class here and simply expose the maximum
590 * number over all chips in a class.
592 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
593 * blindly once it believes it has identified the hardware, so the order of
594 * blocks here matters.
596 static struct si_pc_block_gfxdescr groups_CIK
[] = {
597 {&cik_CB
, 226}, {&cik_CPF
, 17}, {&cik_DB
, 257}, {&cik_GRBM
, 34}, {&cik_GRBMSE
, 15},
598 {&cik_PA_SU
, 153}, {&cik_PA_SC
, 395}, {&cik_SPI
, 186}, {&cik_SQ
, 252}, {&cik_SX
, 32},
599 {&cik_TA
, 111}, {&cik_TCA
, 39, 2}, {&cik_TCC
, 160}, {&cik_TD
, 55}, {&cik_TCP
, 154},
600 {&cik_GDS
, 121}, {&cik_VGT
, 140}, {&cik_IA
, 22}, {&cik_MC
, 22}, {&cik_SRBM
, 19},
601 {&cik_WD
, 22}, {&cik_CPG
, 46}, {&cik_CPC
, 22},
605 static struct si_pc_block_gfxdescr groups_VI
[] = {
606 {&cik_CB
, 405}, {&cik_CPF
, 19}, {&cik_DB
, 257}, {&cik_GRBM
, 34}, {&cik_GRBMSE
, 15},
607 {&cik_PA_SU
, 154}, {&cik_PA_SC
, 397}, {&cik_SPI
, 197}, {&cik_SQ
, 273}, {&cik_SX
, 34},
608 {&cik_TA
, 119}, {&cik_TCA
, 35, 2}, {&cik_TCC
, 192}, {&cik_TD
, 55}, {&cik_TCP
, 180},
609 {&cik_GDS
, 121}, {&cik_VGT
, 147}, {&cik_IA
, 24}, {&cik_MC
, 22}, {&cik_SRBM
, 27},
610 {&cik_WD
, 37}, {&cik_CPG
, 48}, {&cik_CPC
, 24},
614 static struct si_pc_block_gfxdescr groups_gfx9
[] = {
615 {&cik_CB
, 438}, {&cik_CPF
, 32}, {&cik_DB
, 328}, {&cik_GRBM
, 38}, {&cik_GRBMSE
, 16},
616 {&cik_PA_SU
, 292}, {&cik_PA_SC
, 491}, {&cik_SPI
, 196}, {&cik_SQ
, 374}, {&cik_SX
, 208},
617 {&cik_TA
, 119}, {&cik_TCA
, 35, 2}, {&cik_TCC
, 256}, {&cik_TD
, 57}, {&cik_TCP
, 85},
618 {&cik_GDS
, 121}, {&cik_VGT
, 148}, {&cik_IA
, 32}, {&cik_WD
, 58}, {&cik_CPG
, 59},
622 static struct si_pc_block_gfxdescr groups_gfx10
[] = {
654 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters
*pc
,
655 const struct si_pc_block
*block
)
657 return block
->b
->b
->flags
& SI_PC_BLOCK_SE_GROUPS
||
658 (block
->b
->b
->flags
& SI_PC_BLOCK_SE
&& pc
->separate_se
);
661 static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters
*pc
,
662 const struct si_pc_block
*block
)
664 return block
->b
->b
->flags
& SI_PC_BLOCK_INSTANCE_GROUPS
||
665 (block
->num_instances
> 1 && pc
->separate_instance
);
668 static struct si_pc_block
*lookup_counter(struct si_perfcounters
*pc
, unsigned index
,
669 unsigned *base_gid
, unsigned *sub_index
)
671 struct si_pc_block
*block
= pc
->blocks
;
675 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
676 unsigned total
= block
->num_groups
* block
->b
->selectors
;
684 *base_gid
+= block
->num_groups
;
690 static struct si_pc_block
*lookup_group(struct si_perfcounters
*pc
, unsigned *index
)
693 struct si_pc_block
*block
= pc
->blocks
;
695 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
, ++block
) {
696 if (*index
< block
->num_groups
)
698 *index
-= block
->num_groups
;
704 static void si_pc_emit_instance(struct si_context
*sctx
, int se
, int instance
)
706 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
707 unsigned value
= S_030800_SH_BROADCAST_WRITES(1);
710 value
|= S_030800_SE_INDEX(se
);
712 value
|= S_030800_SE_BROADCAST_WRITES(1);
715 if (sctx
->chip_class
>= GFX10
) {
716 /* TODO: Expose counters from each shader array separately if needed. */
717 value
|= S_030800_SA_BROADCAST_WRITES(1);
721 value
|= S_030800_INSTANCE_INDEX(instance
);
723 value
|= S_030800_INSTANCE_BROADCAST_WRITES(1);
726 radeon_set_uconfig_reg(cs
, R_030800_GRBM_GFX_INDEX
, value
);
729 static void si_pc_emit_shaders(struct si_context
*sctx
, unsigned shaders
)
731 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
733 radeon_set_uconfig_reg_seq(cs
, R_036780_SQ_PERFCOUNTER_CTRL
, 2);
734 radeon_emit(cs
, shaders
& 0x7f);
735 radeon_emit(cs
, 0xffffffff);
738 static void si_pc_emit_select(struct si_context
*sctx
, struct si_pc_block
*block
, unsigned count
,
741 struct si_pc_block_base
*regs
= block
->b
->b
;
742 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
744 unsigned layout_multi
= regs
->layout
& SI_PC_MULTI_MASK
;
747 assert(count
<= regs
->num_counters
);
749 if (regs
->layout
& SI_PC_FAKE
)
752 if (layout_multi
== SI_PC_MULTI_BLOCK
) {
753 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
755 dw
= count
+ regs
->num_prelude
;
756 if (count
>= regs
->num_multi
)
757 dw
+= regs
->num_multi
;
758 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, dw
);
759 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
761 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
762 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
764 if (count
< regs
->num_multi
) {
765 unsigned select1
= regs
->select0
+ 4 * regs
->num_multi
;
766 radeon_set_uconfig_reg_seq(cs
, select1
, count
);
769 for (idx
= 0; idx
< MIN2(count
, regs
->num_multi
); ++idx
)
772 if (count
> regs
->num_multi
) {
773 for (idx
= regs
->num_multi
; idx
< count
; ++idx
)
774 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
776 } else if (layout_multi
== SI_PC_MULTI_TAIL
) {
777 unsigned select1
, select1_count
;
779 assert(!(regs
->layout
& SI_PC_REG_REVERSE
));
781 radeon_set_uconfig_reg_seq(cs
, regs
->select0
, count
+ regs
->num_prelude
);
782 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
784 for (idx
= 0; idx
< count
; ++idx
)
785 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
787 select1
= regs
->select0
+ 4 * regs
->num_counters
;
788 select1_count
= MIN2(count
, regs
->num_multi
);
789 radeon_set_uconfig_reg_seq(cs
, select1
, select1_count
);
790 for (idx
= 0; idx
< select1_count
; ++idx
)
792 } else if (layout_multi
== SI_PC_MULTI_CUSTOM
) {
793 unsigned *reg
= regs
->select
;
794 for (idx
= 0; idx
< count
; ++idx
) {
795 radeon_set_uconfig_reg(cs
, *reg
++, selectors
[idx
] | regs
->select_or
);
796 if (idx
< regs
->num_multi
)
797 radeon_set_uconfig_reg(cs
, *reg
++, 0);
800 assert(layout_multi
== SI_PC_MULTI_ALTERNATE
);
802 unsigned reg_base
= regs
->select0
;
803 unsigned reg_count
= count
+ MIN2(count
, regs
->num_multi
);
804 reg_count
+= regs
->num_prelude
;
806 if (!(regs
->layout
& SI_PC_REG_REVERSE
)) {
807 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
809 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
811 for (idx
= 0; idx
< count
; ++idx
) {
812 radeon_emit(cs
, selectors
[idx
] | regs
->select_or
);
813 if (idx
< regs
->num_multi
)
817 reg_base
-= (reg_count
- 1) * 4;
818 radeon_set_uconfig_reg_seq(cs
, reg_base
, reg_count
);
820 for (idx
= count
; idx
> 0; --idx
) {
821 if (idx
<= regs
->num_multi
)
823 radeon_emit(cs
, selectors
[idx
- 1] | regs
->select_or
);
825 for (idx
= 0; idx
< regs
->num_prelude
; ++idx
)
831 static void si_pc_emit_start(struct si_context
*sctx
, struct si_resource
*buffer
, uint64_t va
)
833 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
835 si_cp_copy_data(sctx
, sctx
->gfx_cs
, COPY_DATA_DST_MEM
, buffer
, va
- buffer
->gpu_address
,
836 COPY_DATA_IMM
, NULL
, 1);
838 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
839 S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET
));
840 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
841 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_START
) | EVENT_INDEX(0));
842 radeon_set_uconfig_reg(cs
, R_036020_CP_PERFMON_CNTL
,
843 S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING
));
846 /* Note: The buffer was already added in si_pc_emit_start, so we don't have to
847 * do it again in here. */
848 static void si_pc_emit_stop(struct si_context
*sctx
, struct si_resource
*buffer
, uint64_t va
)
850 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
852 si_cp_release_mem(sctx
, cs
, V_028A90_BOTTOM_OF_PIPE_TS
, 0, EOP_DST_SEL_MEM
, EOP_INT_SEL_NONE
,
853 EOP_DATA_SEL_VALUE_32BIT
, buffer
, va
, 0, SI_NOT_QUERY
);
854 si_cp_wait_mem(sctx
, cs
, va
, 0, 0xffffffff, WAIT_REG_MEM_EQUAL
);
856 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
857 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE
) | EVENT_INDEX(0));
858 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
859 radeon_emit(cs
, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP
) | EVENT_INDEX(0));
860 radeon_set_uconfig_reg(
861 cs
, R_036020_CP_PERFMON_CNTL
,
862 S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING
) | S_036020_PERFMON_SAMPLE_ENABLE(1));
865 static void si_pc_emit_read(struct si_context
*sctx
, struct si_pc_block
*block
, unsigned count
,
868 struct si_pc_block_base
*regs
= block
->b
->b
;
869 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
871 unsigned reg
= regs
->counter0_lo
;
872 unsigned reg_delta
= 8;
874 if (!(regs
->layout
& SI_PC_FAKE
)) {
875 if (regs
->layout
& SI_PC_REG_REVERSE
)
876 reg_delta
= -reg_delta
;
878 for (idx
= 0; idx
< count
; ++idx
) {
880 reg
= regs
->counters
[idx
];
882 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
883 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_PERF
) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM
) |
884 COPY_DATA_COUNT_SEL
); /* 64 bits */
885 radeon_emit(cs
, reg
>> 2);
886 radeon_emit(cs
, 0); /* unused */
888 radeon_emit(cs
, va
>> 32);
889 va
+= sizeof(uint64_t);
893 for (idx
= 0; idx
< count
; ++idx
) {
894 radeon_emit(cs
, PKT3(PKT3_COPY_DATA
, 4, 0));
895 radeon_emit(cs
, COPY_DATA_SRC_SEL(COPY_DATA_IMM
) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM
) |
896 COPY_DATA_COUNT_SEL
);
897 radeon_emit(cs
, 0); /* immediate */
900 radeon_emit(cs
, va
>> 32);
901 va
+= sizeof(uint64_t);
906 static void si_pc_query_destroy(struct si_context
*sctx
, struct si_query
*squery
)
908 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
910 while (query
->groups
) {
911 struct si_query_group
*group
= query
->groups
;
912 query
->groups
= group
->next
;
916 FREE(query
->counters
);
918 si_query_buffer_destroy(sctx
->screen
, &query
->buffer
);
922 static void si_inhibit_clockgating(struct si_context
*sctx
, bool inhibit
)
924 if (sctx
->chip_class
>= GFX10
) {
925 radeon_set_uconfig_reg(sctx
->gfx_cs
, R_037390_RLC_PERFMON_CLK_CNTL
,
926 S_037390_PERFMON_CLOCK_STATE(inhibit
));
927 } else if (sctx
->chip_class
>= GFX8
) {
928 radeon_set_uconfig_reg(sctx
->gfx_cs
, R_0372FC_RLC_PERFMON_CLK_CNTL
,
929 S_0372FC_PERFMON_CLOCK_STATE(inhibit
));
933 static void si_pc_query_resume(struct si_context
*sctx
, struct si_query
*squery
)
935 struct si_query_hw *hwquery,
936 struct si_resource *buffer, uint64_t va)*/
938 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
940 int current_instance
= -1;
942 if (!si_query_buffer_alloc(sctx
, &query
->buffer
, NULL
, query
->result_size
))
944 si_need_gfx_cs_space(sctx
);
947 si_pc_emit_shaders(sctx
, query
->shaders
);
949 si_inhibit_clockgating(sctx
, true);
951 for (struct si_query_group
*group
= query
->groups
; group
; group
= group
->next
) {
952 struct si_pc_block
*block
= group
->block
;
954 if (group
->se
!= current_se
|| group
->instance
!= current_instance
) {
955 current_se
= group
->se
;
956 current_instance
= group
->instance
;
957 si_pc_emit_instance(sctx
, group
->se
, group
->instance
);
960 si_pc_emit_select(sctx
, block
, group
->num_counters
, group
->selectors
);
963 if (current_se
!= -1 || current_instance
!= -1)
964 si_pc_emit_instance(sctx
, -1, -1);
966 uint64_t va
= query
->buffer
.buf
->gpu_address
+ query
->buffer
.results_end
;
967 si_pc_emit_start(sctx
, query
->buffer
.buf
, va
);
970 static void si_pc_query_suspend(struct si_context
*sctx
, struct si_query
*squery
)
972 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
974 if (!query
->buffer
.buf
)
977 uint64_t va
= query
->buffer
.buf
->gpu_address
+ query
->buffer
.results_end
;
978 query
->buffer
.results_end
+= query
->result_size
;
980 si_pc_emit_stop(sctx
, query
->buffer
.buf
, va
);
982 for (struct si_query_group
*group
= query
->groups
; group
; group
= group
->next
) {
983 struct si_pc_block
*block
= group
->block
;
984 unsigned se
= group
->se
>= 0 ? group
->se
: 0;
985 unsigned se_end
= se
+ 1;
987 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && (group
->se
< 0))
988 se_end
= sctx
->screen
->info
.max_se
;
991 unsigned instance
= group
->instance
>= 0 ? group
->instance
: 0;
994 si_pc_emit_instance(sctx
, se
, instance
);
995 si_pc_emit_read(sctx
, block
, group
->num_counters
, va
);
996 va
+= sizeof(uint64_t) * group
->num_counters
;
997 } while (group
->instance
< 0 && ++instance
< block
->num_instances
);
998 } while (++se
< se_end
);
1001 si_pc_emit_instance(sctx
, -1, -1);
1003 si_inhibit_clockgating(sctx
, false);
1006 static bool si_pc_query_begin(struct si_context
*ctx
, struct si_query
*squery
)
1008 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
1010 si_query_buffer_reset(ctx
, &query
->buffer
);
1012 list_addtail(&query
->b
.active_list
, &ctx
->active_queries
);
1013 ctx
->num_cs_dw_queries_suspend
+= query
->b
.num_cs_dw_suspend
;
1015 si_pc_query_resume(ctx
, squery
);
1020 static bool si_pc_query_end(struct si_context
*ctx
, struct si_query
*squery
)
1022 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
1024 si_pc_query_suspend(ctx
, squery
);
1026 list_del(&squery
->active_list
);
1027 ctx
->num_cs_dw_queries_suspend
-= squery
->num_cs_dw_suspend
;
1029 return query
->buffer
.buf
!= NULL
;
1032 static void si_pc_query_add_result(struct si_query_pc
*query
, void *buffer
,
1033 union pipe_query_result
*result
)
1035 uint64_t *results
= buffer
;
1038 for (i
= 0; i
< query
->num_counters
; ++i
) {
1039 struct si_query_counter
*counter
= &query
->counters
[i
];
1041 for (j
= 0; j
< counter
->qwords
; ++j
) {
1042 uint32_t value
= results
[counter
->base
+ j
* counter
->stride
];
1043 result
->batch
[i
].u64
+= value
;
1048 static bool si_pc_query_get_result(struct si_context
*sctx
, struct si_query
*squery
, bool wait
,
1049 union pipe_query_result
*result
)
1051 struct si_query_pc
*query
= (struct si_query_pc
*)squery
;
1053 memset(result
, 0, sizeof(result
->batch
[0]) * query
->num_counters
);
1055 for (struct si_query_buffer
*qbuf
= &query
->buffer
; qbuf
; qbuf
= qbuf
->previous
) {
1056 unsigned usage
= PIPE_TRANSFER_READ
| (wait
? 0 : PIPE_TRANSFER_DONTBLOCK
);
1057 unsigned results_base
= 0;
1060 if (squery
->b
.flushed
)
1061 map
= sctx
->ws
->buffer_map(qbuf
->buf
->buf
, NULL
, usage
);
1063 map
= si_buffer_map_sync_with_rings(sctx
, qbuf
->buf
, usage
);
1068 while (results_base
!= qbuf
->results_end
) {
1069 si_pc_query_add_result(query
, map
+ results_base
, result
);
1070 results_base
+= query
->result_size
;
1077 static const struct si_query_ops batch_query_ops
= {
1078 .destroy
= si_pc_query_destroy
,
1079 .begin
= si_pc_query_begin
,
1080 .end
= si_pc_query_end
,
1081 .get_result
= si_pc_query_get_result
,
1083 .suspend
= si_pc_query_suspend
,
1084 .resume
= si_pc_query_resume
,
1087 static struct si_query_group
*get_group_state(struct si_screen
*screen
, struct si_query_pc
*query
,
1088 struct si_pc_block
*block
, unsigned sub_gid
)
1090 struct si_query_group
*group
= query
->groups
;
1093 if (group
->block
== block
&& group
->sub_gid
== sub_gid
)
1095 group
= group
->next
;
1098 group
= CALLOC_STRUCT(si_query_group
);
1102 group
->block
= block
;
1103 group
->sub_gid
= sub_gid
;
1105 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
1106 unsigned sub_gids
= block
->num_instances
;
1109 unsigned query_shaders
;
1111 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
))
1112 sub_gids
= sub_gids
* screen
->info
.max_se
;
1113 shader_id
= sub_gid
/ sub_gids
;
1114 sub_gid
= sub_gid
% sub_gids
;
1116 shaders
= si_pc_shader_type_bits
[shader_id
];
1118 query_shaders
= query
->shaders
& ~SI_PC_SHADERS_WINDOWING
;
1119 if (query_shaders
&& query_shaders
!= shaders
) {
1120 fprintf(stderr
, "si_perfcounter: incompatible shader groups\n");
1124 query
->shaders
= shaders
;
1127 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER_WINDOWED
&& !query
->shaders
) {
1128 // A non-zero value in query->shaders ensures that the shader
1129 // masking is reset unless the user explicitly requests one.
1130 query
->shaders
= SI_PC_SHADERS_WINDOWING
;
1133 if (si_pc_block_has_per_se_groups(screen
->perfcounters
, block
)) {
1134 group
->se
= sub_gid
/ block
->num_instances
;
1135 sub_gid
= sub_gid
% block
->num_instances
;
1140 if (si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
)) {
1141 group
->instance
= sub_gid
;
1143 group
->instance
= -1;
1146 group
->next
= query
->groups
;
1147 query
->groups
= group
;
1152 struct pipe_query
*si_create_batch_query(struct pipe_context
*ctx
, unsigned num_queries
,
1153 unsigned *query_types
)
1155 struct si_screen
*screen
= (struct si_screen
*)ctx
->screen
;
1156 struct si_perfcounters
*pc
= screen
->perfcounters
;
1157 struct si_pc_block
*block
;
1158 struct si_query_group
*group
;
1159 struct si_query_pc
*query
;
1160 unsigned base_gid
, sub_gid
, sub_index
;
1166 query
= CALLOC_STRUCT(si_query_pc
);
1170 query
->b
.ops
= &batch_query_ops
;
1172 query
->num_counters
= num_queries
;
1174 /* Collect selectors per group */
1175 for (i
= 0; i
< num_queries
; ++i
) {
1178 if (query_types
[i
] < SI_QUERY_FIRST_PERFCOUNTER
)
1182 lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
, &base_gid
, &sub_index
);
1186 sub_gid
= sub_index
/ block
->b
->selectors
;
1187 sub_index
= sub_index
% block
->b
->selectors
;
1189 group
= get_group_state(screen
, query
, block
, sub_gid
);
1193 if (group
->num_counters
>= block
->b
->b
->num_counters
) {
1194 fprintf(stderr
, "perfcounter group %s: too many selected\n", block
->b
->b
->name
);
1197 group
->selectors
[group
->num_counters
] = sub_index
;
1198 ++group
->num_counters
;
1201 /* Compute result bases and CS size per group */
1202 query
->b
.num_cs_dw_suspend
= pc
->num_stop_cs_dwords
;
1203 query
->b
.num_cs_dw_suspend
+= pc
->num_instance_cs_dwords
;
1206 for (group
= query
->groups
; group
; group
= group
->next
) {
1207 struct si_pc_block
*block
= group
->block
;
1209 unsigned instances
= 1;
1211 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
1212 instances
= screen
->info
.max_se
;
1213 if (group
->instance
< 0)
1214 instances
*= block
->num_instances
;
1216 group
->result_base
= i
;
1217 query
->result_size
+= sizeof(uint64_t) * instances
* group
->num_counters
;
1218 i
+= instances
* group
->num_counters
;
1220 read_dw
= 6 * group
->num_counters
;
1221 query
->b
.num_cs_dw_suspend
+= instances
* read_dw
;
1222 query
->b
.num_cs_dw_suspend
+= instances
* pc
->num_instance_cs_dwords
;
1225 if (query
->shaders
) {
1226 if (query
->shaders
== SI_PC_SHADERS_WINDOWING
)
1227 query
->shaders
= 0xffffffff;
1230 /* Map user-supplied query array to result indices */
1231 query
->counters
= CALLOC(num_queries
, sizeof(*query
->counters
));
1232 for (i
= 0; i
< num_queries
; ++i
) {
1233 struct si_query_counter
*counter
= &query
->counters
[i
];
1234 struct si_pc_block
*block
;
1237 lookup_counter(pc
, query_types
[i
] - SI_QUERY_FIRST_PERFCOUNTER
, &base_gid
, &sub_index
);
1239 sub_gid
= sub_index
/ block
->b
->selectors
;
1240 sub_index
= sub_index
% block
->b
->selectors
;
1242 group
= get_group_state(screen
, query
, block
, sub_gid
);
1243 assert(group
!= NULL
);
1245 for (j
= 0; j
< group
->num_counters
; ++j
) {
1246 if (group
->selectors
[j
] == sub_index
)
1250 counter
->base
= group
->result_base
+ j
;
1251 counter
->stride
= group
->num_counters
;
1253 counter
->qwords
= 1;
1254 if ((block
->b
->b
->flags
& SI_PC_BLOCK_SE
) && group
->se
< 0)
1255 counter
->qwords
= screen
->info
.max_se
;
1256 if (group
->instance
< 0)
1257 counter
->qwords
*= block
->num_instances
;
1260 return (struct pipe_query
*)query
;
1263 si_pc_query_destroy((struct si_context
*)ctx
, &query
->b
);
1267 static bool si_init_block_names(struct si_screen
*screen
, struct si_pc_block
*block
)
1269 bool per_instance_groups
= si_pc_block_has_per_instance_groups(screen
->perfcounters
, block
);
1270 bool per_se_groups
= si_pc_block_has_per_se_groups(screen
->perfcounters
, block
);
1272 unsigned groups_shader
= 1, groups_se
= 1, groups_instance
= 1;
1277 if (per_instance_groups
)
1278 groups_instance
= block
->num_instances
;
1280 groups_se
= screen
->info
.max_se
;
1281 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1282 groups_shader
= ARRAY_SIZE(si_pc_shader_type_bits
);
1284 namelen
= strlen(block
->b
->b
->name
);
1285 block
->group_name_stride
= namelen
+ 1;
1286 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1287 block
->group_name_stride
+= 3;
1288 if (per_se_groups
) {
1289 assert(groups_se
<= 10);
1290 block
->group_name_stride
+= 1;
1292 if (per_instance_groups
)
1293 block
->group_name_stride
+= 1;
1295 if (per_instance_groups
) {
1296 assert(groups_instance
<= 100);
1297 block
->group_name_stride
+= 2;
1300 block
->group_names
= MALLOC(block
->num_groups
* block
->group_name_stride
);
1301 if (!block
->group_names
)
1304 groupname
= block
->group_names
;
1305 for (i
= 0; i
< groups_shader
; ++i
) {
1306 const char *shader_suffix
= si_pc_shader_type_suffixes
[i
];
1307 unsigned shaderlen
= strlen(shader_suffix
);
1308 for (j
= 0; j
< groups_se
; ++j
) {
1309 for (k
= 0; k
< groups_instance
; ++k
) {
1310 strcpy(groupname
, block
->b
->b
->name
);
1311 p
= groupname
+ namelen
;
1313 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
) {
1314 strcpy(p
, shader_suffix
);
1318 if (per_se_groups
) {
1319 p
+= sprintf(p
, "%d", j
);
1320 if (per_instance_groups
)
1324 if (per_instance_groups
)
1325 p
+= sprintf(p
, "%d", k
);
1327 groupname
+= block
->group_name_stride
;
1332 assert(block
->b
->selectors
<= 1000);
1333 block
->selector_name_stride
= block
->group_name_stride
+ 4;
1334 block
->selector_names
=
1335 MALLOC(block
->num_groups
* block
->b
->selectors
* block
->selector_name_stride
);
1336 if (!block
->selector_names
)
1339 groupname
= block
->group_names
;
1340 p
= block
->selector_names
;
1341 for (i
= 0; i
< block
->num_groups
; ++i
) {
1342 for (j
= 0; j
< block
->b
->selectors
; ++j
) {
1343 sprintf(p
, "%s_%03d", groupname
, j
);
1344 p
+= block
->selector_name_stride
;
1346 groupname
+= block
->group_name_stride
;
1352 int si_get_perfcounter_info(struct si_screen
*screen
, unsigned index
,
1353 struct pipe_driver_query_info
*info
)
1355 struct si_perfcounters
*pc
= screen
->perfcounters
;
1356 struct si_pc_block
*block
;
1357 unsigned base_gid
, sub
;
1363 unsigned bid
, num_queries
= 0;
1365 for (bid
= 0; bid
< pc
->num_blocks
; ++bid
) {
1366 num_queries
+= pc
->blocks
[bid
].b
->selectors
* pc
->blocks
[bid
].num_groups
;
1372 block
= lookup_counter(pc
, index
, &base_gid
, &sub
);
1376 if (!block
->selector_names
) {
1377 if (!si_init_block_names(screen
, block
))
1380 info
->name
= block
->selector_names
+ sub
* block
->selector_name_stride
;
1381 info
->query_type
= SI_QUERY_FIRST_PERFCOUNTER
+ index
;
1382 info
->max_value
.u64
= 0;
1383 info
->type
= PIPE_DRIVER_QUERY_TYPE_UINT64
;
1384 info
->result_type
= PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE
;
1385 info
->group_id
= base_gid
+ sub
/ block
->b
->selectors
;
1386 info
->flags
= PIPE_DRIVER_QUERY_FLAG_BATCH
;
1387 if (sub
> 0 && sub
+ 1 < block
->b
->selectors
* block
->num_groups
)
1388 info
->flags
|= PIPE_DRIVER_QUERY_FLAG_DONT_LIST
;
1392 int si_get_perfcounter_group_info(struct si_screen
*screen
, unsigned index
,
1393 struct pipe_driver_query_group_info
*info
)
1395 struct si_perfcounters
*pc
= screen
->perfcounters
;
1396 struct si_pc_block
*block
;
1402 return pc
->num_groups
;
1404 block
= lookup_group(pc
, &index
);
1408 if (!block
->group_names
) {
1409 if (!si_init_block_names(screen
, block
))
1412 info
->name
= block
->group_names
+ index
* block
->group_name_stride
;
1413 info
->num_queries
= block
->b
->selectors
;
1414 info
->max_active_queries
= block
->b
->b
->num_counters
;
1418 void si_destroy_perfcounters(struct si_screen
*screen
)
1420 struct si_perfcounters
*pc
= screen
->perfcounters
;
1426 for (i
= 0; i
< pc
->num_blocks
; ++i
) {
1427 FREE(pc
->blocks
[i
].group_names
);
1428 FREE(pc
->blocks
[i
].selector_names
);
1432 screen
->perfcounters
= NULL
;
1435 void si_init_perfcounters(struct si_screen
*screen
)
1437 struct si_perfcounters
*pc
;
1438 const struct si_pc_block_gfxdescr
*blocks
;
1439 unsigned num_blocks
;
1442 switch (screen
->info
.chip_class
) {
1444 blocks
= groups_CIK
;
1445 num_blocks
= ARRAY_SIZE(groups_CIK
);
1449 num_blocks
= ARRAY_SIZE(groups_VI
);
1452 blocks
= groups_gfx9
;
1453 num_blocks
= ARRAY_SIZE(groups_gfx9
);
1457 blocks
= groups_gfx10
;
1458 num_blocks
= ARRAY_SIZE(groups_gfx10
);
1462 return; /* not implemented */
1465 screen
->perfcounters
= pc
= CALLOC_STRUCT(si_perfcounters
);
1469 pc
->num_stop_cs_dwords
= 14 + si_cp_write_fence_dwords(screen
);
1470 pc
->num_instance_cs_dwords
= 3;
1472 pc
->separate_se
= debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
1473 pc
->separate_instance
= debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
1475 pc
->blocks
= CALLOC(num_blocks
, sizeof(struct si_pc_block
));
1478 pc
->num_blocks
= num_blocks
;
1480 for (i
= 0; i
< num_blocks
; ++i
) {
1481 struct si_pc_block
*block
= &pc
->blocks
[i
];
1482 block
->b
= &blocks
[i
];
1483 block
->num_instances
= MAX2(1, block
->b
->instances
);
1485 if (!strcmp(block
->b
->b
->name
, "CB") ||
1486 !strcmp(block
->b
->b
->name
, "DB") ||
1487 !strcmp(block
->b
->b
->name
, "RMI"))
1488 block
->num_instances
= screen
->info
.max_se
;
1489 else if (!strcmp(block
->b
->b
->name
, "TCC"))
1490 block
->num_instances
= screen
->info
.num_tcc_blocks
;
1491 else if (!strcmp(block
->b
->b
->name
, "IA"))
1492 block
->num_instances
= MAX2(1, screen
->info
.max_se
/ 2);
1493 else if (!strcmp(block
->b
->b
->name
, "TA") ||
1494 !strcmp(block
->b
->b
->name
, "TCP") ||
1495 !strcmp(block
->b
->b
->name
, "TD")) {
1496 block
->num_instances
= MAX2(1, screen
->info
.max_good_cu_per_sa
);
1499 if (si_pc_block_has_per_instance_groups(pc
, block
)) {
1500 block
->num_groups
= block
->num_instances
;
1502 block
->num_groups
= 1;
1505 if (si_pc_block_has_per_se_groups(pc
, block
))
1506 block
->num_groups
*= screen
->info
.max_se
;
1507 if (block
->b
->b
->flags
& SI_PC_BLOCK_SHADER
)
1508 block
->num_groups
*= ARRAY_SIZE(si_pc_shader_type_bits
);
1510 pc
->num_groups
+= block
->num_groups
;
1516 si_destroy_perfcounters(screen
);