radeonsi: factor si_query_buffer logic out of si_query_hw
[mesa.git] / src / gallium / drivers / radeonsi / si_perfcounter.c
1 /*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "si_build_pm4.h"
26 #include "si_query.h"
27 #include "util/u_memory.h"
28
29
/* Properties of a hardware counter block that determine how its groups are
 * exposed to the state tracker and how its counters are programmed. */
enum si_pc_block_flags {
	/* This block is part of the shader engine */
	SI_PC_BLOCK_SE = (1 << 0),

	/* Expose per-instance groups instead of summing all instances (within
	 * an SE). */
	SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),

	/* Expose per-SE groups instead of summing instances across SEs. */
	SI_PC_BLOCK_SE_GROUPS = (1 << 2),

	/* Shader block */
	SI_PC_BLOCK_SHADER = (1 << 3),

	/* Non-shader block with perfcounters windowed by shaders. */
	SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
};
47
/* Describes how a block's selector and counter registers are laid out.
 * The low bits (SI_PC_MULTI_MASK) select one of the "multi" arrangements;
 * SI_PC_REG_REVERSE and SI_PC_FAKE are independent modifier bits. */
enum si_pc_reg_layout {
	/* All secondary selector dwords follow as one block after the primary
	 * selector dwords for the counters that have secondary selectors.
	 */
	SI_PC_MULTI_BLOCK = 0,

	/* Each secondary selector dword follows immediately afters the
	 * corresponding primary.
	 */
	SI_PC_MULTI_ALTERNATE = 1,

	/* All secondary selector dwords follow as one block after all primary
	 * selector dwords.
	 */
	SI_PC_MULTI_TAIL = 2,

	/* Free-form arrangement of selector registers. */
	SI_PC_MULTI_CUSTOM = 3,

	SI_PC_MULTI_MASK = 3,

	/* Registers are laid out in decreasing rather than increasing order. */
	SI_PC_REG_REVERSE = 4,

	/* Block has no real registers; reads emit immediate zeros instead. */
	SI_PC_FAKE = 8,
};
74
/* Static, chip-independent description of a counter block's register
 * interface. */
struct si_pc_block_base {
	const char *name;
	unsigned num_counters;	/* max counters selectable at once */
	unsigned flags;		/* mask of SI_PC_BLOCK_* */

	unsigned select_or;	/* bits OR'ed into every selector value */
	unsigned select0;	/* first selector register (regular layouts) */
	unsigned counter0_lo;	/* first counter data register (LO dword) */
	unsigned *select;	/* explicit selector regs (SI_PC_MULTI_CUSTOM) */
	unsigned *counters;	/* explicit counter regs; overrides counter0_lo */
	unsigned num_multi;	/* counters that also have a secondary selector */
	unsigned num_prelude;	/* zero dwords written before the selectors */
	unsigned layout;	/* SI_PC_MULTI_* | SI_PC_REG_REVERSE | SI_PC_FAKE */
};

/* Per-chip-class description: selector/instance counts vary per class. */
struct si_pc_block_gfxdescr {
	struct si_pc_block_base *b;
	unsigned selectors;
	unsigned instances;
};

/* Runtime state of one counter block on the current chip. */
struct si_pc_block {
	const struct si_pc_block_gfxdescr *b;
	unsigned num_instances;

	unsigned num_groups;
	char *group_names;		/* packed, group_name_stride bytes apart */
	unsigned group_name_stride;

	char *selector_names;		/* packed, selector_name_stride bytes apart */
	unsigned selector_name_stride;
};
107
/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
 * performance counter group IDs.
 */
static const char * const si_pc_shader_type_suffixes[] = {
	"", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS"
};

/* Per-stage enable bits, indexed in the same order as the suffixes above.
 * Entry 0 ("" = no suffix) enables all stages. */
static const unsigned si_pc_shader_type_bits[] = {
	0x7f,
	S_036780_ES_EN(1),
	S_036780_GS_EN(1),
	S_036780_VS_EN(1),
	S_036780_PS_EN(1),
	S_036780_LS_EN(1),
	S_036780_HS_EN(1),
	S_036780_CS_EN(1),
};
125
/* Max counters per HW block */
#define SI_QUERY_MAX_COUNTERS 16

/* Sentinel value in si_query_pc::shaders: the mask was only set to enable
 * shader windowing, not explicitly requested by the user. */
#define SI_PC_SHADERS_WINDOWING (1 << 31)

/* One programmed counter context: a (block, SE, instance) tuple together
 * with the selector values written into it. */
struct si_query_group {
	struct si_query_group *next;
	struct si_pc_block *block;
	unsigned sub_gid; /* only used during init */
	unsigned result_base; /* only used during init */
	int se;		/* -1 = broadcast across all SEs */
	int instance;	/* -1 = broadcast across all instances */
	unsigned num_counters;
	unsigned selectors[SI_QUERY_MAX_COUNTERS];
};

/* Maps one user-visible query result onto the raw result buffer. */
struct si_query_counter {
	unsigned base;		/* first qword in the result buffer */
	unsigned qwords;	/* number of partial results to accumulate */
	unsigned stride; /* in uint64s */
};

/* A batch perfcounter query, built on top of the generic si_query_hw. */
struct si_query_pc {
	struct si_query_hw b;

	unsigned shaders;	/* shader stage mask (see si_pc_shader_type_bits) */
	unsigned num_counters;
	struct si_query_counter *counters;
	struct si_query_group *groups;
};
156
157
/* Register descriptions of all counter blocks (CIK and newer).  The layout
 * fields drive si_pc_emit_select() and si_pc_emit_read(); the register
 * offsets must match the hardware exactly. */
static struct si_pc_block_base cik_CB = {
	.name = "CB",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_037000_CB_PERFCOUNTER_FILTER,
	.counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
	.num_multi = 1,
	.num_prelude = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

/* CPC selector registers are not laid out regularly -> custom list. */
static unsigned cik_CPC_select[] = {
	R_036024_CPC_PERFCOUNTER0_SELECT,
	R_036010_CPC_PERFCOUNTER0_SELECT1,
	R_03600C_CPC_PERFCOUNTER1_SELECT,
};
static struct si_pc_block_base cik_CPC = {
	.name = "CPC",
	.num_counters = 2,

	.select = cik_CPC_select,
	.counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPF = {
	.name = "CPF",
	.num_counters = 2,

	.select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPG = {
	.name = "CPG",
	.num_counters = 2,

	.select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_DB = {
	.name = "DB",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_037100_DB_PERFCOUNTER0_SELECT,
	.counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
	.num_multi = 3, // really only 2, but there's a gap between registers
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_GDS = {
	.name = "GDS",
	.num_counters = 4,

	.select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_TAIL,
};

/* GRBM counter data registers are not evenly spaced -> explicit list. */
static unsigned cik_GRBM_counters[] = {
	R_034100_GRBM_PERFCOUNTER0_LO,
	R_03410C_GRBM_PERFCOUNTER1_LO,
};
static struct si_pc_block_base cik_GRBM = {
	.name = "GRBM",
	.num_counters = 2,

	.select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
	.counters = cik_GRBM_counters,
};

static struct si_pc_block_base cik_GRBMSE = {
	.name = "GRBMSE",
	.num_counters = 4,

	.select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
	.counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
};

static struct si_pc_block_base cik_IA = {
	.name = "IA",
	.num_counters = 4,

	.select0 = R_036210_IA_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_PA_SC = {
	.name = "PA_SC",
	.num_counters = 8,
	.flags = SI_PC_BLOCK_SE,

	.select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

/* According to docs, PA_SU counters are only 48 bits wide. */
static struct si_pc_block_base cik_PA_SU = {
	.name = "PA_SU",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE,

	.select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_SPI = {
	.name = "SPI",
	.num_counters = 6,
	.flags = SI_PC_BLOCK_SE,

	.select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
	.num_multi = 4,
	.layout = SI_PC_MULTI_BLOCK,
};

static struct si_pc_block_base cik_SQ = {
	.name = "SQ",
	.num_counters = 16,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER,

	.select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
	.select_or = S_036700_SQC_BANK_MASK(15) |
		     S_036700_SQC_CLIENT_MASK(15) |
		     S_036700_SIMD_MASK(15),
	.counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
};

static struct si_pc_block_base cik_SX = {
	.name = "SX",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE,

	.select0 = R_036900_SX_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_TA = {
	.name = "TA",
	.num_counters = 2,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

	.select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TD = {
	.name = "TD",
	.num_counters = 2,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

	.select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCA = {
	.name = "TCA",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCC = {
	.name = "TCC",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCP = {
	.name = "TCP",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

	.select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_VGT = {
	.name = "VGT",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE,

	.select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_WD = {
	.name = "WD",
	.num_counters = 4,

	.select0 = R_036200_WD_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
};

/* MC and SRBM have no directly accessible counter registers; they are
 * exposed as fake blocks whose reads return zeros. */
static struct si_pc_block_base cik_MC = {
	.name = "MC",
	.num_counters = 4,

	.layout = SI_PC_FAKE,
};

static struct si_pc_block_base cik_SRBM = {
	.name = "SRBM",
	.num_counters = 2,

	.layout = SI_PC_FAKE,
};
400
/* Both the number of instances and selectors varies between chips of the same
 * class. We only differentiate by class here and simply expose the maximum
 * number over all chips in a class.
 *
 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
 * blindly once it believes it has identified the hardware, so the order of
 * blocks here matters.
 *
 * Entry format: { block, num_selectors[, num_instances] }; see
 * struct si_pc_block_gfxdescr.
 */
static struct si_pc_block_gfxdescr groups_CIK[] = {
	{ &cik_CB, 226},
	{ &cik_CPF, 17 },
	{ &cik_DB, 257},
	{ &cik_GRBM, 34 },
	{ &cik_GRBMSE, 15 },
	{ &cik_PA_SU, 153 },
	{ &cik_PA_SC, 395 },
	{ &cik_SPI, 186 },
	{ &cik_SQ, 252 },
	{ &cik_SX, 32 },
	{ &cik_TA, 111, 11 },
	{ &cik_TCA, 39, 2 },
	{ &cik_TCC, 160},
	{ &cik_TD, 55, 11 },
	{ &cik_TCP, 154, 11 },
	{ &cik_GDS, 121 },
	{ &cik_VGT, 140 },
	{ &cik_IA, 22 },
	{ &cik_MC, 22 },
	{ &cik_SRBM, 19 },
	{ &cik_WD, 22 },
	{ &cik_CPG, 46 },
	{ &cik_CPC, 22 },

};

static struct si_pc_block_gfxdescr groups_VI[] = {
	{ &cik_CB, 405},
	{ &cik_CPF, 19 },
	{ &cik_DB, 257},
	{ &cik_GRBM, 34 },
	{ &cik_GRBMSE, 15 },
	{ &cik_PA_SU, 154 },
	{ &cik_PA_SC, 397 },
	{ &cik_SPI, 197 },
	{ &cik_SQ, 273 },
	{ &cik_SX, 34 },
	{ &cik_TA, 119, 16 },
	{ &cik_TCA, 35, 2 },
	{ &cik_TCC, 192},
	{ &cik_TD, 55, 16 },
	{ &cik_TCP, 180, 16 },
	{ &cik_GDS, 121 },
	{ &cik_VGT, 147 },
	{ &cik_IA, 24 },
	{ &cik_MC, 22 },
	{ &cik_SRBM, 27 },
	{ &cik_WD, 37 },
	{ &cik_CPG, 48 },
	{ &cik_CPC, 24 },

};

/* Note: gfx9 has no MC/SRBM entries. */
static struct si_pc_block_gfxdescr groups_gfx9[] = {
	{ &cik_CB, 438},
	{ &cik_CPF, 32 },
	{ &cik_DB, 328},
	{ &cik_GRBM, 38 },
	{ &cik_GRBMSE, 16 },
	{ &cik_PA_SU, 292 },
	{ &cik_PA_SC, 491 },
	{ &cik_SPI, 196 },
	{ &cik_SQ, 374 },
	{ &cik_SX, 208 },
	{ &cik_TA, 119, 16 },
	{ &cik_TCA, 35, 2 },
	{ &cik_TCC, 256},
	{ &cik_TD, 57, 16 },
	{ &cik_TCP, 85, 16 },
	{ &cik_GDS, 121 },
	{ &cik_VGT, 148 },
	{ &cik_IA, 32 },
	{ &cik_WD, 58 },
	{ &cik_CPG, 59 },
	{ &cik_CPC, 35 },
};
486
487 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc,
488 const struct si_pc_block *block)
489 {
490 return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS ||
491 (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se);
492 }
493
494 static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc,
495 const struct si_pc_block *block)
496 {
497 return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS ||
498 (block->num_instances > 1 && pc->separate_instance);
499 }
500
501 static struct si_pc_block *
502 lookup_counter(struct si_perfcounters *pc, unsigned index,
503 unsigned *base_gid, unsigned *sub_index)
504 {
505 struct si_pc_block *block = pc->blocks;
506 unsigned bid;
507
508 *base_gid = 0;
509 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
510 unsigned total = block->num_groups * block->b->selectors;
511
512 if (index < total) {
513 *sub_index = index;
514 return block;
515 }
516
517 index -= total;
518 *base_gid += block->num_groups;
519 }
520
521 return NULL;
522 }
523
524 static struct si_pc_block *
525 lookup_group(struct si_perfcounters *pc, unsigned *index)
526 {
527 unsigned bid;
528 struct si_pc_block *block = pc->blocks;
529
530 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
531 if (*index < block->num_groups)
532 return block;
533 *index -= block->num_groups;
534 }
535
536 return NULL;
537 }
538
539 static void si_pc_emit_instance(struct si_context *sctx,
540 int se, int instance)
541 {
542 struct radeon_cmdbuf *cs = sctx->gfx_cs;
543 unsigned value = S_030800_SH_BROADCAST_WRITES(1);
544
545 if (se >= 0) {
546 value |= S_030800_SE_INDEX(se);
547 } else {
548 value |= S_030800_SE_BROADCAST_WRITES(1);
549 }
550
551 if (instance >= 0) {
552 value |= S_030800_INSTANCE_INDEX(instance);
553 } else {
554 value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
555 }
556
557 radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
558 }
559
/* Program the shader-stage mask used to window shader(-windowed) block
 * counters.  'shaders' is a combination of si_pc_shader_type_bits values. */
static void si_pc_emit_shaders(struct si_context *sctx,
			       unsigned shaders)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;

	radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
	radeon_emit(cs, shaders & 0x7f); /* stage enable bits only */
	radeon_emit(cs, 0xffffffff);     /* register following CTRL: all bits set */
}
569
/* Emit the selector register writes for 'count' counters of a block.
 *
 * The emission order is dictated by the block's layout (see
 * enum si_pc_reg_layout); for every layout the sequence of dwords must
 * exactly match the hardware register order, so the branches below are
 * deliberately explicit. */
static void si_pc_emit_select(struct si_context *sctx,
			      struct si_pc_block *block,
			      unsigned count, unsigned *selectors)
{
	struct si_pc_block_base *regs = block->b->b;
	struct radeon_cmdbuf *cs = sctx->gfx_cs;
	unsigned idx;
	unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
	unsigned dw;

	assert(count <= regs->num_counters);

	/* Fake blocks (MC, SRBM) have no selector registers at all. */
	if (regs->layout & SI_PC_FAKE)
		return;

	if (layout_multi == SI_PC_MULTI_BLOCK) {
		/* Primaries, then one block of secondaries (zeroed) right
		 * after the primaries that have them. */
		assert(!(regs->layout & SI_PC_REG_REVERSE));

		dw = count + regs->num_prelude;
		if (count >= regs->num_multi)
			dw += regs->num_multi;
		radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
		for (idx = 0; idx < regs->num_prelude; ++idx)
			radeon_emit(cs, 0);
		for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
			radeon_emit(cs, selectors[idx] | regs->select_or);

		if (count < regs->num_multi) {
			/* Secondaries don't directly follow the emitted
			 * primaries; start a new register sequence. */
			unsigned select1 =
				regs->select0 + 4 * regs->num_multi;
			radeon_set_uconfig_reg_seq(cs, select1, count);
		}

		/* Zero the secondary selectors. */
		for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
			radeon_emit(cs, 0);

		/* Remaining primaries (those without a secondary). */
		if (count > regs->num_multi) {
			for (idx = regs->num_multi; idx < count; ++idx)
				radeon_emit(cs, selectors[idx] | regs->select_or);
		}
	} else if (layout_multi == SI_PC_MULTI_TAIL) {
		/* All primaries first, all secondaries after the full set of
		 * num_counters primary registers. */
		unsigned select1, select1_count;

		assert(!(regs->layout & SI_PC_REG_REVERSE));

		radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
		for (idx = 0; idx < regs->num_prelude; ++idx)
			radeon_emit(cs, 0);
		for (idx = 0; idx < count; ++idx)
			radeon_emit(cs, selectors[idx] | regs->select_or);

		select1 = regs->select0 + 4 * regs->num_counters;
		select1_count = MIN2(count, regs->num_multi);
		radeon_set_uconfig_reg_seq(cs, select1, select1_count);
		for (idx = 0; idx < select1_count; ++idx)
			radeon_emit(cs, 0);
	} else if (layout_multi == SI_PC_MULTI_CUSTOM) {
		/* Irregular register addresses: take them one by one from
		 * the block's explicit select[] list. */
		unsigned *reg = regs->select;
		for (idx = 0; idx < count; ++idx) {
			radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
			if (idx < regs->num_multi)
				radeon_set_uconfig_reg(cs, *reg++, 0);
		}
	} else {
		/* Primary/secondary pairs alternate in register space. */
		assert(layout_multi == SI_PC_MULTI_ALTERNATE);

		unsigned reg_base = regs->select0;
		unsigned reg_count = count + MIN2(count, regs->num_multi);
		reg_count += regs->num_prelude;

		if (!(regs->layout & SI_PC_REG_REVERSE)) {
			radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

			for (idx = 0; idx < regs->num_prelude; ++idx)
				radeon_emit(cs, 0);
			for (idx = 0; idx < count; ++idx) {
				radeon_emit(cs, selectors[idx] | regs->select_or);
				if (idx < regs->num_multi)
					radeon_emit(cs, 0);
			}
		} else {
			/* Registers run downwards: select0 is the HIGHEST
			 * address, so rebase and emit in reverse order. */
			reg_base -= (reg_count - 1) * 4;
			radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

			for (idx = count; idx > 0; --idx) {
				if (idx <= regs->num_multi)
					radeon_emit(cs, 0);
				radeon_emit(cs, selectors[idx - 1] | regs->select_or);
			}
			for (idx = 0; idx < regs->num_prelude; ++idx)
				radeon_emit(cs, 0);
		}
	}
}
664
/* Emit commands that start counting: write the sentinel value 1 at 'va'
 * (si_pc_emit_stop later overwrites it with 0 at bottom-of-pipe and waits on
 * it), reset the perfmon state, and start counting. */
static void si_pc_emit_start(struct si_context *sctx,
			     struct r600_resource *buffer, uint64_t va)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;

	radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer,
				  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);

	/* *va = 1 (immediate copy through the CP). */
	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
			COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM));
	radeon_emit(cs, 1); /* immediate */
	radeon_emit(cs, 0); /* unused */
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);

	/* Reset counters, then kick them off. */
	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
			       S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
			       S_036020_PERFMON_STATE(V_036020_START_COUNTING));
}
688
/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again in here. */
/* Emit commands that stop counting: overwrite the sentinel at 'va' with 0 at
 * bottom-of-pipe and wait for it, guaranteeing all prior work has finished
 * before the counters are sampled and stopped. */
static void si_pc_emit_stop(struct si_context *sctx,
			    struct r600_resource *buffer, uint64_t va)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;

	si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
			  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
			  EOP_DATA_SEL_VALUE_32BIT,
			  buffer, va, 0, SI_NOT_QUERY);
	si_cp_wait_mem(sctx, va, 0, 0xffffffff, 0);

	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
			       S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
			       S_036020_PERFMON_SAMPLE_ENABLE(1));
}
710
/* Copy 'count' 64-bit counter values of a block into the result buffer at
 * 'va'.  For fake blocks (SI_PC_FAKE), zeros are written instead so the
 * result layout stays uniform. */
static void si_pc_emit_read(struct si_context *sctx,
			    struct si_pc_block *block,
			    unsigned count, uint64_t va)
{
	struct si_pc_block_base *regs = block->b->b;
	struct radeon_cmdbuf *cs = sctx->gfx_cs;
	unsigned idx;
	unsigned reg = regs->counter0_lo;
	unsigned reg_delta = 8; /* LO/HI pair per counter = 8 bytes */

	if (!(regs->layout & SI_PC_FAKE)) {
		if (regs->layout & SI_PC_REG_REVERSE)
			reg_delta = -reg_delta; /* unsigned wraparound; reg walks down */

		for (idx = 0; idx < count; ++idx) {
			/* Irregularly spaced counters use the explicit list. */
			if (regs->counters)
				reg = regs->counters[idx];

			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
					COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) |
					COPY_DATA_COUNT_SEL); /* 64 bits */
			radeon_emit(cs, reg >> 2);
			radeon_emit(cs, 0); /* unused */
			radeon_emit(cs, va);
			radeon_emit(cs, va >> 32);
			va += sizeof(uint64_t);
			reg += reg_delta;
		}
	} else {
		/* Fake block: fill the result slots with immediate zeros. */
		for (idx = 0; idx < count; ++idx) {
			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
					COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) |
					COPY_DATA_COUNT_SEL);
			radeon_emit(cs, 0); /* immediate */
			radeon_emit(cs, 0);
			radeon_emit(cs, va);
			radeon_emit(cs, va >> 32);
			va += sizeof(uint64_t);
		}
	}
}
754
755 static void si_pc_query_destroy(struct si_screen *sscreen,
756 struct si_query *rquery)
757 {
758 struct si_query_pc *query = (struct si_query_pc *)rquery;
759
760 while (query->groups) {
761 struct si_query_group *group = query->groups;
762 query->groups = group->next;
763 FREE(group);
764 }
765
766 FREE(query->counters);
767
768 si_query_hw_destroy(sscreen, rquery);
769 }
770
/* si_query_hw_ops::prepare_buffer hook.  Perfcounter results are fully
 * overwritten by the GPU, so no CPU-side initialization is needed. */
static bool si_pc_query_prepare_buffer(struct si_context *ctx,
				       struct si_query_buffer *qbuf)
{
	/* no-op */
	return true;
}
777
/* si_query_hw_ops::emit_start hook: program shader windowing, all group
 * selectors (switching GRBM_GFX_INDEX only when the SE/instance target
 * changes), restore broadcast, then start counting. */
static void si_pc_query_emit_start(struct si_context *sctx,
				   struct si_query_hw *hwquery,
				   struct r600_resource *buffer, uint64_t va)
{
	struct si_query_pc *query = (struct si_query_pc *)hwquery;
	struct si_query_group *group;
	int current_se = -1;
	int current_instance = -1;

	if (query->shaders)
		si_pc_emit_shaders(sctx, query->shaders);

	for (group = query->groups; group; group = group->next) {
		struct si_pc_block *block = group->block;

		/* Only re-program GRBM_GFX_INDEX when the target changes. */
		if (group->se != current_se || group->instance != current_instance) {
			current_se = group->se;
			current_instance = group->instance;
			si_pc_emit_instance(sctx, group->se, group->instance);
		}

		si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
	}

	/* Leave GRBM_GFX_INDEX in broadcast mode. */
	if (current_se != -1 || current_instance != -1)
		si_pc_emit_instance(sctx, -1, -1);

	si_pc_emit_start(sctx, buffer, va);
}
807
/* si_query_hw_ops::emit_stop hook: stop/sample the counters, then read each
 * group's counters back into the result buffer.  Groups with se/instance set
 * to -1 were counted in broadcast mode, so the read loop expands them to
 * every SE resp. every instance, advancing 'va' per (se, instance) pair. */
static void si_pc_query_emit_stop(struct si_context *sctx,
				  struct si_query_hw *hwquery,
				  struct r600_resource *buffer, uint64_t va)
{
	struct si_query_pc *query = (struct si_query_pc *)hwquery;
	struct si_query_group *group;

	si_pc_emit_stop(sctx, buffer, va);

	for (group = query->groups; group; group = group->next) {
		struct si_pc_block *block = group->block;
		unsigned se = group->se >= 0 ? group->se : 0;
		unsigned se_end = se + 1;

		/* Broadcast across SEs only applies to SE-wide blocks. */
		if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
			se_end = sctx->screen->info.max_se;

		do {
			unsigned instance = group->instance >= 0 ? group->instance : 0;

			do {
				si_pc_emit_instance(sctx, se, instance);
				si_pc_emit_read(sctx, block, group->num_counters, va);
				va += sizeof(uint64_t) * group->num_counters;
			} while (group->instance < 0 && ++instance < block->num_instances);
		} while (++se < se_end);
	}

	/* Restore broadcast mode. */
	si_pc_emit_instance(sctx, -1, -1);
}
838
/* si_query_hw_ops::clear_result hook: zero one batch slot per counter. */
static void si_pc_query_clear_result(struct si_query_hw *hwquery,
				     union pipe_query_result *result)
{
	struct si_query_pc *query = (struct si_query_pc *)hwquery;

	memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
}
846
/* si_query_hw_ops::add_result hook: accumulate the raw per-(se,instance)
 * partial results of each counter into its batch slot. */
static void si_pc_query_add_result(struct si_screen *screen,
				   struct si_query_hw *hwquery,
				   void *buffer,
				   union pipe_query_result *result)
{
	struct si_query_pc *query = (struct si_query_pc *)hwquery;
	uint64_t *results = buffer;
	unsigned i, j;

	for (i = 0; i < query->num_counters; ++i) {
		struct si_query_counter *counter = &query->counters[i];

		for (j = 0; j < counter->qwords; ++j) {
			/* Deliberately keep only the low 32 bits of each
			 * qword before summing; presumably the hardware
			 * counters are only 32 bits valid despite the 64-bit
			 * copy in si_pc_emit_read — TODO confirm. */
			uint32_t value = results[counter->base + j * counter->stride];
			result->batch[i].u64 += value;
		}
	}
}
865
/* Generic query vtable: everything except destroy is handled by the common
 * si_query_hw implementation. */
static struct si_query_ops batch_query_ops = {
	.destroy = si_pc_query_destroy,
	.begin = si_query_hw_begin,
	.end = si_query_hw_end,
	.get_result = si_query_hw_get_result,

	.suspend = si_query_hw_suspend,
	.resume = si_query_hw_resume,
};

/* Hooks the common hw-query code calls back into for perfcounter specifics. */
static struct si_query_hw_ops batch_query_hw_ops = {
	.prepare_buffer = si_pc_query_prepare_buffer,
	.emit_start = si_pc_query_emit_start,
	.emit_stop = si_pc_query_emit_stop,
	.clear_result = si_pc_query_clear_result,
	.add_result = si_pc_query_add_result,
};
883
/* Find the query's group for (block, sub_gid), creating it on first use.
 *
 * For shader blocks, sub_gid additionally encodes the shader stage (major
 * index) on top of the SE/instance (minor index); the stage is decoded here
 * and folded into query->shaders.  Returns NULL on allocation failure or
 * when two counters request incompatible shader stage masks. */
static struct si_query_group *get_group_state(struct si_screen *screen,
					      struct si_query_pc *query,
					      struct si_pc_block *block,
					      unsigned sub_gid)
{
	struct si_query_group *group = query->groups;

	while (group) {
		if (group->block == block && group->sub_gid == sub_gid)
			return group;
		group = group->next;
	}

	group = CALLOC_STRUCT(si_query_group);
	if (!group)
		return NULL;

	group->block = block;
	group->sub_gid = sub_gid;

	if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
		/* Split sub_gid into shader stage (high) and SE/instance (low). */
		unsigned sub_gids = block->num_instances;
		unsigned shader_id;
		unsigned shaders;
		unsigned query_shaders;

		if (si_pc_block_has_per_se_groups(screen->perfcounters, block))
			sub_gids = sub_gids * screen->info.max_se;
		shader_id = sub_gid / sub_gids;
		sub_gid = sub_gid % sub_gids;

		shaders = si_pc_shader_type_bits[shader_id];

		/* All shader-block counters in one query must agree on the
		 * stage mask (ignoring the windowing sentinel bit). */
		query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
		if (query_shaders && query_shaders != shaders) {
			fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
			FREE(group);
			return NULL;
		}
		query->shaders = shaders;
	}

	if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
		// A non-zero value in query->shaders ensures that the shader
		// masking is reset unless the user explicitly requests one.
		query->shaders = SI_PC_SHADERS_WINDOWING;
	}

	/* Decode SE and instance indices; -1 means "sum via broadcast". */
	if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) {
		group->se = sub_gid / block->num_instances;
		sub_gid = sub_gid % block->num_instances;
	} else {
		group->se = -1;
	}

	if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) {
		group->instance = sub_gid;
	} else {
		group->instance = -1;
	}

	group->next = query->groups;
	query->groups = group;

	return group;
}
950
951 struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
952 unsigned num_queries,
953 unsigned *query_types)
954 {
955 struct si_screen *screen =
956 (struct si_screen *)ctx->screen;
957 struct si_perfcounters *pc = screen->perfcounters;
958 struct si_pc_block *block;
959 struct si_query_group *group;
960 struct si_query_pc *query;
961 unsigned base_gid, sub_gid, sub_index;
962 unsigned i, j;
963
964 if (!pc)
965 return NULL;
966
967 query = CALLOC_STRUCT(si_query_pc);
968 if (!query)
969 return NULL;
970
971 query->b.b.ops = &batch_query_ops;
972 query->b.ops = &batch_query_hw_ops;
973
974 query->num_counters = num_queries;
975
976 /* Collect selectors per group */
977 for (i = 0; i < num_queries; ++i) {
978 unsigned sub_gid;
979
980 if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
981 goto error;
982
983 block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
984 &base_gid, &sub_index);
985 if (!block)
986 goto error;
987
988 sub_gid = sub_index / block->b->selectors;
989 sub_index = sub_index % block->b->selectors;
990
991 group = get_group_state(screen, query, block, sub_gid);
992 if (!group)
993 goto error;
994
995 if (group->num_counters >= block->b->b->num_counters) {
996 fprintf(stderr,
997 "perfcounter group %s: too many selected\n",
998 block->b->b->name);
999 goto error;
1000 }
1001 group->selectors[group->num_counters] = sub_index;
1002 ++group->num_counters;
1003 }
1004
1005 /* Compute result bases and CS size per group */
1006 query->b.b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
1007 query->b.b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
1008
1009 i = 0;
1010 for (group = query->groups; group; group = group->next) {
1011 struct si_pc_block *block = group->block;
1012 unsigned read_dw;
1013 unsigned instances = 1;
1014
1015 if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
1016 instances = screen->info.max_se;
1017 if (group->instance < 0)
1018 instances *= block->num_instances;
1019
1020 group->result_base = i;
1021 query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
1022 i += instances * group->num_counters;
1023
1024 read_dw = 6 * group->num_counters;
1025 query->b.b.num_cs_dw_suspend += instances * read_dw;
1026 query->b.b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
1027 }
1028
1029 if (query->shaders) {
1030 if (query->shaders == SI_PC_SHADERS_WINDOWING)
1031 query->shaders = 0xffffffff;
1032 }
1033
1034 /* Map user-supplied query array to result indices */
1035 query->counters = CALLOC(num_queries, sizeof(*query->counters));
1036 for (i = 0; i < num_queries; ++i) {
1037 struct si_query_counter *counter = &query->counters[i];
1038 struct si_pc_block *block;
1039
1040 block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
1041 &base_gid, &sub_index);
1042
1043 sub_gid = sub_index / block->b->selectors;
1044 sub_index = sub_index % block->b->selectors;
1045
1046 group = get_group_state(screen, query, block, sub_gid);
1047 assert(group != NULL);
1048
1049 for (j = 0; j < group->num_counters; ++j) {
1050 if (group->selectors[j] == sub_index)
1051 break;
1052 }
1053
1054 counter->base = group->result_base + j;
1055 counter->stride = group->num_counters;
1056
1057 counter->qwords = 1;
1058 if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
1059 counter->qwords = screen->info.max_se;
1060 if (group->instance < 0)
1061 counter->qwords *= block->num_instances;
1062 }
1063
1064 return (struct pipe_query *)query;
1065
1066 error:
1067 si_pc_query_destroy(screen, &query->b.b);
1068 return NULL;
1069 }
1070
/* Build the packed group and selector name tables for a block.
 *
 * Group names are "<block>[<shader suffix>][<se>][_]<instance>" depending on
 * the block's flags; selector names append "_%03d".  Names are stored as
 * fixed-stride packed strings (group_name_stride / selector_name_stride).
 * Returns false on allocation failure; partially allocated tables are left
 * for the caller to clean up. */
static bool si_init_block_names(struct si_screen *screen,
				struct si_pc_block *block)
{
	bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
	bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
	unsigned i, j, k;
	unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
	unsigned namelen;
	char *groupname;
	char *p;

	if (per_instance_groups)
		groups_instance = block->num_instances;
	if (per_se_groups)
		groups_se = screen->info.max_se;
	if (block->b->b->flags & SI_PC_BLOCK_SHADER)
		groups_shader = ARRAY_SIZE(si_pc_shader_type_bits);

	/* Compute the worst-case stride: name + suffix + digits + NUL. */
	namelen = strlen(block->b->b->name);
	block->group_name_stride = namelen + 1;
	if (block->b->b->flags & SI_PC_BLOCK_SHADER)
		block->group_name_stride += 3;
	if (per_se_groups) {
		assert(groups_se <= 10);	/* one digit for the SE index */
		block->group_name_stride += 1;

		if (per_instance_groups)
			block->group_name_stride += 1;	/* '_' separator */
	}
	if (per_instance_groups) {
		assert(groups_instance <= 100);	/* two digits max */
		block->group_name_stride += 2;
	}

	block->group_names = MALLOC(block->num_groups * block->group_name_stride);
	if (!block->group_names)
		return false;

	/* Group order must match how group ids are decoded elsewhere:
	 * shader stage is the major index, then SE, then instance. */
	groupname = block->group_names;
	for (i = 0; i < groups_shader; ++i) {
		const char *shader_suffix = si_pc_shader_type_suffixes[i];
		unsigned shaderlen = strlen(shader_suffix);
		for (j = 0; j < groups_se; ++j) {
			for (k = 0; k < groups_instance; ++k) {
				strcpy(groupname, block->b->b->name);
				p = groupname + namelen;

				if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
					strcpy(p, shader_suffix);
					p += shaderlen;
				}

				if (per_se_groups) {
					p += sprintf(p, "%d", j);
					if (per_instance_groups)
						*p++ = '_';
				}

				if (per_instance_groups)
					p += sprintf(p, "%d", k);

				groupname += block->group_name_stride;
			}
		}
	}

	assert(block->b->selectors <= 1000);	/* three digits in "%03d" */
	block->selector_name_stride = block->group_name_stride + 4;
	block->selector_names = MALLOC(block->num_groups * block->b->selectors *
				       block->selector_name_stride);
	if (!block->selector_names)
		return false;

	groupname = block->group_names;
	p = block->selector_names;
	for (i = 0; i < block->num_groups; ++i) {
		for (j = 0; j < block->b->selectors; ++j) {
			sprintf(p, "%s_%03d", groupname, j);
			p += block->selector_name_stride;
		}
		groupname += block->group_name_stride;
	}

	return true;
}
1156
1157 int si_get_perfcounter_info(struct si_screen *screen,
1158 unsigned index,
1159 struct pipe_driver_query_info *info)
1160 {
1161 struct si_perfcounters *pc = screen->perfcounters;
1162 struct si_pc_block *block;
1163 unsigned base_gid, sub;
1164
1165 if (!pc)
1166 return 0;
1167
1168 if (!info) {
1169 unsigned bid, num_queries = 0;
1170
1171 for (bid = 0; bid < pc->num_blocks; ++bid) {
1172 num_queries += pc->blocks[bid].b->selectors *
1173 pc->blocks[bid].num_groups;
1174 }
1175
1176 return num_queries;
1177 }
1178
1179 block = lookup_counter(pc, index, &base_gid, &sub);
1180 if (!block)
1181 return 0;
1182
1183 if (!block->selector_names) {
1184 if (!si_init_block_names(screen, block))
1185 return 0;
1186 }
1187 info->name = block->selector_names + sub * block->selector_name_stride;
1188 info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
1189 info->max_value.u64 = 0;
1190 info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
1191 info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
1192 info->group_id = base_gid + sub / block->b->selectors;
1193 info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
1194 if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
1195 info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
1196 return 1;
1197 }
1198
1199 int si_get_perfcounter_group_info(struct si_screen *screen,
1200 unsigned index,
1201 struct pipe_driver_query_group_info *info)
1202 {
1203 struct si_perfcounters *pc = screen->perfcounters;
1204 struct si_pc_block *block;
1205
1206 if (!pc)
1207 return 0;
1208
1209 if (!info)
1210 return pc->num_groups;
1211
1212 block = lookup_group(pc, &index);
1213 if (!block)
1214 return 0;
1215
1216 if (!block->group_names) {
1217 if (!si_init_block_names(screen, block))
1218 return 0;
1219 }
1220 info->name = block->group_names + index * block->group_name_stride;
1221 info->num_queries = block->b->selectors;
1222 info->max_active_queries = block->b->b->num_counters;
1223 return 1;
1224 }
1225
1226 void si_destroy_perfcounters(struct si_screen *screen)
1227 {
1228 struct si_perfcounters *pc = screen->perfcounters;
1229 unsigned i;
1230
1231 if (!pc)
1232 return;
1233
1234 for (i = 0; i < pc->num_blocks; ++i) {
1235 FREE(pc->blocks[i].group_names);
1236 FREE(pc->blocks[i].selector_names);
1237 }
1238 FREE(pc->blocks);
1239 FREE(pc);
1240 screen->perfcounters = NULL;
1241 }
1242
1243 void si_init_perfcounters(struct si_screen *screen)
1244 {
1245 struct si_perfcounters *pc;
1246 const struct si_pc_block_gfxdescr *blocks;
1247 unsigned num_blocks;
1248 unsigned i;
1249
1250 switch (screen->info.chip_class) {
1251 case CIK:
1252 blocks = groups_CIK;
1253 num_blocks = ARRAY_SIZE(groups_CIK);
1254 break;
1255 case VI:
1256 blocks = groups_VI;
1257 num_blocks = ARRAY_SIZE(groups_VI);
1258 break;
1259 case GFX9:
1260 blocks = groups_gfx9;
1261 num_blocks = ARRAY_SIZE(groups_gfx9);
1262 break;
1263 case SI:
1264 default:
1265 return; /* not implemented */
1266 }
1267
1268 if (screen->info.max_sh_per_se != 1) {
1269 /* This should not happen on non-SI chips. */
1270 fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
1271 "supported (inaccurate performance counters)\n",
1272 screen->info.max_sh_per_se);
1273 }
1274
1275 screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
1276 if (!pc)
1277 return;
1278
1279 pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
1280 pc->num_instance_cs_dwords = 3;
1281
1282 pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
1283 pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
1284
1285 pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block));
1286 if (!pc->blocks)
1287 goto error;
1288 pc->num_blocks = num_blocks;
1289
1290 for (i = 0; i < num_blocks; ++i) {
1291 struct si_pc_block *block = &pc->blocks[i];
1292 block->b = &blocks[i];
1293 block->num_instances = block->b->instances;
1294
1295 if (!strcmp(block->b->b->name, "CB") ||
1296 !strcmp(block->b->b->name, "DB"))
1297 block->num_instances = screen->info.max_se;
1298 else if (!strcmp(block->b->b->name, "TCC"))
1299 block->num_instances = screen->info.num_tcc_blocks;
1300 else if (!strcmp(block->b->b->name, "IA"))
1301 block->num_instances = MAX2(1, screen->info.max_se / 2);
1302
1303 if (si_pc_block_has_per_instance_groups(pc, block)) {
1304 block->num_groups = block->num_instances;
1305 } else {
1306 block->num_groups = 1;
1307 }
1308
1309 if (si_pc_block_has_per_se_groups(pc, block))
1310 block->num_groups *= screen->info.max_se;
1311 if (block->b->b->flags & SI_PC_BLOCK_SHADER)
1312 block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits);
1313
1314 pc->num_groups += block->num_groups;
1315 }
1316
1317 return;
1318
1319 error:
1320 si_destroy_perfcounters(screen);
1321 }