69e149c76b618d448cec2bda4af71c619b72bbe8
[mesa.git] / src / gallium / drivers / radeonsi / si_perfcounter.c
1 /*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "si_build_pm4.h"
26 #include "si_query.h"
27 #include "util/u_memory.h"
28
29
/* Properties of a hardware performance counter block. */
enum si_pc_block_flags {
	/* This block is part of the shader engine */
	SI_PC_BLOCK_SE = (1 << 0),

	/* Expose per-instance groups instead of summing all instances (within
	 * an SE). */
	SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),

	/* Expose per-SE groups instead of summing instances across SEs. */
	SI_PC_BLOCK_SE_GROUPS = (1 << 2),

	/* Shader block */
	SI_PC_BLOCK_SHADER = (1 << 3),

	/* Non-shader block with perfcounters windowed by shaders. */
	SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
};
47
/* How a block's selector and counter registers are arranged.
 * The low bits (SI_PC_MULTI_MASK) choose the placement of the secondary
 * (SELECT1) registers; the remaining bits are independent modifiers. */
enum si_pc_reg_layout {
	/* All secondary selector dwords follow as one block after the primary
	 * selector dwords for the counters that have secondary selectors.
	 */
	SI_PC_MULTI_BLOCK = 0,

	/* Each secondary selector dword follows immediately afters the
	 * corresponding primary.
	 */
	SI_PC_MULTI_ALTERNATE = 1,

	/* All secondary selector dwords follow as one block after all primary
	 * selector dwords.
	 */
	SI_PC_MULTI_TAIL = 2,

	/* Free-form arrangement of selector registers. */
	SI_PC_MULTI_CUSTOM = 3,

	SI_PC_MULTI_MASK = 3,

	/* Registers are laid out in decreasing rather than increasing order. */
	SI_PC_REG_REVERSE = 4,

	/* Block has no real registers; reads return zero (see MC, SRBM). */
	SI_PC_FAKE = 8,
};
74
/* Static, chip-independent description of one hardware counter block's
 * register layout. */
struct si_pc_block_base {
	const char *name;
	unsigned num_counters;
	unsigned flags; /* mask of enum si_pc_block_flags */

	unsigned select_or;   /* bits OR'd into every selector value written */
	unsigned select0;     /* first selector register (unless SI_PC_MULTI_CUSTOM) */
	unsigned counter0_lo; /* first counter data register (LO half) */
	unsigned *select;     /* explicit selector registers for SI_PC_MULTI_CUSTOM */
	unsigned *counters;   /* explicit counter registers (overrides counter0_lo stride) */
	unsigned num_multi;   /* counters that also have a secondary (SELECT1) register */
	unsigned num_prelude; /* zero dwords emitted before the selectors */
	unsigned layout;      /* mask of enum si_pc_reg_layout */
};

/* Per-chip-class instantiation of a block: how many selectors and
 * instances the block has on that class. */
struct si_pc_block_gfxdescr {
	struct si_pc_block_base *b;
	unsigned selectors;
	unsigned instances;
};

/* Runtime state of one counter block, including the exposed group and
 * selector name tables (built by si_init_block_names). */
struct si_pc_block {
	const struct si_pc_block_gfxdescr *b;
	unsigned num_instances;

	unsigned num_groups;
	char *group_names;           /* num_groups fixed-stride strings */
	unsigned group_name_stride;

	char *selector_names;        /* num_groups * selectors fixed-stride strings */
	unsigned selector_name_stride;
};
107
/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
 * performance counter group IDs.
 */
static const char * const si_pc_shader_type_suffixes[] = {
	"", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS"
};

/* Shader-stage enable bits for SQ_PERFCOUNTER_CTRL, indexed in the same
 * order as si_pc_shader_type_suffixes (index 0 = all stages, 0x7f). */
static const unsigned si_pc_shader_type_bits[] = {
	0x7f,
	S_036780_ES_EN(1),
	S_036780_GS_EN(1),
	S_036780_VS_EN(1),
	S_036780_PS_EN(1),
	S_036780_LS_EN(1),
	S_036780_HS_EN(1),
	S_036780_CS_EN(1),
};
125
/* Max counters per HW block */
#define SI_QUERY_MAX_COUNTERS 16

/* Marker stored in si_query_pc::shaders when shader windowing is needed
 * but the user did not request an explicit stage mask
 * (see get_group_state / si_create_batch_query). */
#define SI_PC_SHADERS_WINDOWING (1 << 31)

/* One configured (block, SE, instance) combination of a batch query. */
struct si_query_group {
	struct si_query_group *next;
	struct si_pc_block *block;
	unsigned sub_gid; /* only used during init */
	unsigned result_base; /* only used during init */
	int se;       /* -1 = broadcast / sum over all SEs */
	int instance; /* -1 = broadcast / sum over all instances */
	unsigned num_counters;
	unsigned selectors[SI_QUERY_MAX_COUNTERS];
};

/* Maps one user-visible counter to its slots in the result buffer. */
struct si_query_counter {
	unsigned base;   /* first result slot */
	unsigned qwords; /* number of slots to sum (SEs * instances) */
	unsigned stride; /* in uint64s */
};

/* A batch query sampling several performance counters at once. */
struct si_query_pc {
	struct si_query_hw b;

	unsigned shaders; /* SQ_PERFCOUNTER_CTRL stage mask or SI_PC_SHADERS_WINDOWING */
	unsigned num_counters;
	struct si_query_counter *counters;
	struct si_query_group *groups; /* linked list built by get_group_state */
};
156
157
/* Register layout descriptions for each hardware counter block.
 * The "cik_" prefix is historical: these tables are shared by the CIK, VI
 * and gfx9 group lists below. */
static struct si_pc_block_base cik_CB = {
	.name = "CB",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_037000_CB_PERFCOUNTER_FILTER,
	.counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
	.num_multi = 1,
	.num_prelude = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

/* CPC selector registers are not evenly spaced: list them explicitly. */
static unsigned cik_CPC_select[] = {
	R_036024_CPC_PERFCOUNTER0_SELECT,
	R_036010_CPC_PERFCOUNTER0_SELECT1,
	R_03600C_CPC_PERFCOUNTER1_SELECT,
};
static struct si_pc_block_base cik_CPC = {
	.name = "CPC",
	.num_counters = 2,

	.select = cik_CPC_select,
	.counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPF = {
	.name = "CPF",
	.num_counters = 2,

	.select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPG = {
	.name = "CPG",
	.num_counters = 2,

	.select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_DB = {
	.name = "DB",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_037100_DB_PERFCOUNTER0_SELECT,
	.counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
	.num_multi = 3, // really only 2, but there's a gap between registers
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_GDS = {
	.name = "GDS",
	.num_counters = 4,

	.select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_TAIL,
};

/* GRBM counter registers are not at a uniform stride: list them. */
static unsigned cik_GRBM_counters[] = {
	R_034100_GRBM_PERFCOUNTER0_LO,
	R_03410C_GRBM_PERFCOUNTER1_LO,
};
static struct si_pc_block_base cik_GRBM = {
	.name = "GRBM",
	.num_counters = 2,

	.select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
	.counters = cik_GRBM_counters,
};

static struct si_pc_block_base cik_GRBMSE = {
	.name = "GRBMSE",
	.num_counters = 4,

	.select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
	.counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
};

static struct si_pc_block_base cik_IA = {
	.name = "IA",
	.num_counters = 4,

	.select0 = R_036210_IA_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_PA_SC = {
	.name = "PA_SC",
	.num_counters = 8,
	.flags = SI_PC_BLOCK_SE,

	.select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

/* According to docs, PA_SU counters are only 48 bits wide. */
static struct si_pc_block_base cik_PA_SU = {
	.name = "PA_SU",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE,

	.select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_SPI = {
	.name = "SPI",
	.num_counters = 6,
	.flags = SI_PC_BLOCK_SE,

	.select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
	.num_multi = 4,
	.layout = SI_PC_MULTI_BLOCK,
};

static struct si_pc_block_base cik_SQ = {
	.name = "SQ",
	.num_counters = 16,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER,

	.select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
	/* Always count across all banks, clients and SIMDs. */
	.select_or = S_036700_SQC_BANK_MASK(15) |
		     S_036700_SQC_CLIENT_MASK(15) |
		     S_036700_SIMD_MASK(15),
	.counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
};

static struct si_pc_block_base cik_SX = {
	.name = "SX",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE,

	.select0 = R_036900_SX_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_TA = {
	.name = "TA",
	.num_counters = 2,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

	.select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TD = {
	.name = "TD",
	.num_counters = 2,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

	.select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCA = {
	.name = "TCA",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCC = {
	.name = "TCC",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCP = {
	.name = "TCP",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

	.select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_VGT = {
	.name = "VGT",
	.num_counters = 4,
	.flags = SI_PC_BLOCK_SE,

	.select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_WD = {
	.name = "WD",
	.num_counters = 4,

	.select0 = R_036200_WD_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
};

/* MC and SRBM are exposed as "fake" blocks: they have no registers the CP
 * can read, so their results are written as zeros (see si_pc_emit_read). */
static struct si_pc_block_base cik_MC = {
	.name = "MC",
	.num_counters = 4,

	.layout = SI_PC_FAKE,
};

static struct si_pc_block_base cik_SRBM = {
	.name = "SRBM",
	.num_counters = 2,

	.layout = SI_PC_FAKE,
};
400
/* Both the number of instances and selectors varies between chips of the same
 * class. We only differentiate by class here and simply expose the maximum
 * number over all chips in a class.
 *
 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
 * blindly once it believes it has identified the hardware, so the order of
 * blocks here matters.
 */
static struct si_pc_block_gfxdescr groups_CIK[] = {
	{ &cik_CB, 226},
	{ &cik_CPF, 17 },
	{ &cik_DB, 257},
	{ &cik_GRBM, 34 },
	{ &cik_GRBMSE, 15 },
	{ &cik_PA_SU, 153 },
	{ &cik_PA_SC, 395 },
	{ &cik_SPI, 186 },
	{ &cik_SQ, 252 },
	{ &cik_SX, 32 },
	{ &cik_TA, 111, 11 },
	{ &cik_TCA, 39, 2 },
	{ &cik_TCC, 160},
	{ &cik_TD, 55, 11 },
	{ &cik_TCP, 154, 11 },
	{ &cik_GDS, 121 },
	{ &cik_VGT, 140 },
	{ &cik_IA, 22 },
	{ &cik_MC, 22 },
	{ &cik_SRBM, 19 },
	{ &cik_WD, 22 },
	{ &cik_CPG, 46 },
	{ &cik_CPC, 22 },

};

static struct si_pc_block_gfxdescr groups_VI[] = {
	{ &cik_CB, 405},
	{ &cik_CPF, 19 },
	{ &cik_DB, 257},
	{ &cik_GRBM, 34 },
	{ &cik_GRBMSE, 15 },
	{ &cik_PA_SU, 154 },
	{ &cik_PA_SC, 397 },
	{ &cik_SPI, 197 },
	{ &cik_SQ, 273 },
	{ &cik_SX, 34 },
	{ &cik_TA, 119, 16 },
	{ &cik_TCA, 35, 2 },
	{ &cik_TCC, 192},
	{ &cik_TD, 55, 16 },
	{ &cik_TCP, 180, 16 },
	{ &cik_GDS, 121 },
	{ &cik_VGT, 147 },
	{ &cik_IA, 24 },
	{ &cik_MC, 22 },
	{ &cik_SRBM, 27 },
	{ &cik_WD, 37 },
	{ &cik_CPG, 48 },
	{ &cik_CPC, 24 },

};

/* Note: unlike CIK/VI, gfx9 does not expose the MC and SRBM fake blocks. */
static struct si_pc_block_gfxdescr groups_gfx9[] = {
	{ &cik_CB, 438},
	{ &cik_CPF, 32 },
	{ &cik_DB, 328},
	{ &cik_GRBM, 38 },
	{ &cik_GRBMSE, 16 },
	{ &cik_PA_SU, 292 },
	{ &cik_PA_SC, 491 },
	{ &cik_SPI, 196 },
	{ &cik_SQ, 374 },
	{ &cik_SX, 208 },
	{ &cik_TA, 119, 16 },
	{ &cik_TCA, 35, 2 },
	{ &cik_TCC, 256},
	{ &cik_TD, 57, 16 },
	{ &cik_TCP, 85, 16 },
	{ &cik_GDS, 121 },
	{ &cik_VGT, 148 },
	{ &cik_IA, 32 },
	{ &cik_WD, 58 },
	{ &cik_CPG, 59 },
	{ &cik_CPC, 35 },
};
486
487 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc,
488 const struct si_pc_block *block)
489 {
490 return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS ||
491 (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se);
492 }
493
494 static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc,
495 const struct si_pc_block *block)
496 {
497 return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS ||
498 (block->num_instances > 1 && pc->separate_instance);
499 }
500
501 static struct si_pc_block *
502 lookup_counter(struct si_perfcounters *pc, unsigned index,
503 unsigned *base_gid, unsigned *sub_index)
504 {
505 struct si_pc_block *block = pc->blocks;
506 unsigned bid;
507
508 *base_gid = 0;
509 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
510 unsigned total = block->num_groups * block->b->selectors;
511
512 if (index < total) {
513 *sub_index = index;
514 return block;
515 }
516
517 index -= total;
518 *base_gid += block->num_groups;
519 }
520
521 return NULL;
522 }
523
524 static struct si_pc_block *
525 lookup_group(struct si_perfcounters *pc, unsigned *index)
526 {
527 unsigned bid;
528 struct si_pc_block *block = pc->blocks;
529
530 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
531 if (*index < block->num_groups)
532 return block;
533 *index -= block->num_groups;
534 }
535
536 return NULL;
537 }
538
539 static void si_pc_emit_instance(struct si_context *sctx,
540 int se, int instance)
541 {
542 struct radeon_cmdbuf *cs = sctx->gfx_cs;
543 unsigned value = S_030800_SH_BROADCAST_WRITES(1);
544
545 if (se >= 0) {
546 value |= S_030800_SE_INDEX(se);
547 } else {
548 value |= S_030800_SE_BROADCAST_WRITES(1);
549 }
550
551 if (instance >= 0) {
552 value |= S_030800_INSTANCE_INDEX(instance);
553 } else {
554 value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
555 }
556
557 radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
558 }
559
/* Select which shader stages are observed by shader-windowed counters.
 * Only the low 7 bits of 'shaders' are written to SQ_PERFCOUNTER_CTRL. */
static void si_pc_emit_shaders(struct si_context *sctx,
			       unsigned shaders)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;

	radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
	radeon_emit(cs, shaders & 0x7f);
	radeon_emit(cs, 0xffffffff); /* next register in sequence: all bits set */
}
569
/* Program a block's selector registers for the given counters.
 * The register arrangement varies per block, as described by the
 * SI_PC_MULTI_* part of the layout (see enum si_pc_reg_layout). */
static void si_pc_emit_select(struct si_context *sctx,
			      struct si_pc_block *block,
			      unsigned count, unsigned *selectors)
{
	struct si_pc_block_base *regs = block->b->b;
	struct radeon_cmdbuf *cs = sctx->gfx_cs;
	unsigned idx;
	unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
	unsigned dw;

	assert(count <= regs->num_counters);

	/* Fake blocks have no selector registers at all. */
	if (regs->layout & SI_PC_FAKE)
		return;

	if (layout_multi == SI_PC_MULTI_BLOCK) {
		/* SELECT1 dwords are one block right after the primaries of
		 * the multi-selector counters. */
		assert(!(regs->layout & SI_PC_REG_REVERSE));

		dw = count + regs->num_prelude;
		if (count >= regs->num_multi)
			dw += regs->num_multi;
		radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
		for (idx = 0; idx < regs->num_prelude; ++idx)
			radeon_emit(cs, 0);
		for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
			radeon_emit(cs, selectors[idx] | regs->select_or);

		if (count < regs->num_multi) {
			/* The SELECT1 registers are not contiguous with what
			 * we wrote so far; start a new register sequence. */
			unsigned select1 =
				regs->select0 + 4 * regs->num_multi;
			radeon_set_uconfig_reg_seq(cs, select1, count);
		}

		/* Zero out the secondary selectors. */
		for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
			radeon_emit(cs, 0);

		if (count > regs->num_multi) {
			/* Remaining single-selector counters. */
			for (idx = regs->num_multi; idx < count; ++idx)
				radeon_emit(cs, selectors[idx] | regs->select_or);
		}
	} else if (layout_multi == SI_PC_MULTI_TAIL) {
		/* All SELECT1 dwords follow after all primary dwords. */
		unsigned select1, select1_count;

		assert(!(regs->layout & SI_PC_REG_REVERSE));

		radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
		for (idx = 0; idx < regs->num_prelude; ++idx)
			radeon_emit(cs, 0);
		for (idx = 0; idx < count; ++idx)
			radeon_emit(cs, selectors[idx] | regs->select_or);

		select1 = regs->select0 + 4 * regs->num_counters;
		select1_count = MIN2(count, regs->num_multi);
		radeon_set_uconfig_reg_seq(cs, select1, select1_count);
		for (idx = 0; idx < select1_count; ++idx)
			radeon_emit(cs, 0);
	} else if (layout_multi == SI_PC_MULTI_CUSTOM) {
		/* Free-form arrangement: registers listed in regs->select,
		 * primary and secondary interleaved. */
		unsigned *reg = regs->select;
		for (idx = 0; idx < count; ++idx) {
			radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
			if (idx < regs->num_multi)
				radeon_set_uconfig_reg(cs, *reg++, 0);
		}
	} else {
		assert(layout_multi == SI_PC_MULTI_ALTERNATE);

		/* Each SELECT1 dword immediately follows its primary. */
		unsigned reg_base = regs->select0;
		unsigned reg_count = count + MIN2(count, regs->num_multi);
		reg_count += regs->num_prelude;

		if (!(regs->layout & SI_PC_REG_REVERSE)) {
			radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

			for (idx = 0; idx < regs->num_prelude; ++idx)
				radeon_emit(cs, 0);
			for (idx = 0; idx < count; ++idx) {
				radeon_emit(cs, selectors[idx] | regs->select_or);
				if (idx < regs->num_multi)
					radeon_emit(cs, 0);
			}
		} else {
			/* Registers run in decreasing order: select0 is the
			 * highest address, so rebase the sequence to its
			 * start and emit the dwords in reverse. */
			reg_base -= (reg_count - 1) * 4;
			radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

			for (idx = count; idx > 0; --idx) {
				if (idx <= regs->num_multi)
					radeon_emit(cs, 0);
				radeon_emit(cs, selectors[idx - 1] | regs->select_or);
			}
			for (idx = 0; idx < regs->num_prelude; ++idx)
				radeon_emit(cs, 0);
		}
	}
}
664
/* Reset the perfmon state machine and start counting.
 * Also writes the immediate value 1 to *va: si_pc_emit_stop later writes 0
 * there from the bottom of the pipe and waits for it, so the slot acts as a
 * completion fence. */
static void si_pc_emit_start(struct si_context *sctx,
			     struct r600_resource *buffer, uint64_t va)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;

	radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer,
				  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);

	/* Arm the fence slot: *va = 1. */
	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
			COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM));
	radeon_emit(cs, 1); /* immediate */
	radeon_emit(cs, 0); /* unused */
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);

	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
			       S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
			       S_036020_PERFMON_STATE(V_036020_START_COUNTING));
}
688
/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again in here. */
static void si_pc_emit_stop(struct si_context *sctx,
			    struct r600_resource *buffer, uint64_t va)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;

	/* Wait for prior work to drain: write 0 to *va at bottom-of-pipe,
	 * then wait until the CP sees the value. */
	si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
			  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
			  EOP_DATA_SEL_VALUE_32BIT,
			  buffer, va, 0, SI_NOT_QUERY);
	si_cp_wait_mem(sctx, va, 0, 0xffffffff, 0);

	/* Latch the counters into their data registers, then stop counting. */
	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
			       S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
			       S_036020_PERFMON_SAMPLE_ENABLE(1));
}
710
/* Copy a block's sampled counter values into memory at va, one uint64_t
 * slot per counter. Fake blocks (SI_PC_FAKE) just write zeros. */
static void si_pc_emit_read(struct si_context *sctx,
			    struct si_pc_block *block,
			    unsigned count, uint64_t va)
{
	struct si_pc_block_base *regs = block->b->b;
	struct radeon_cmdbuf *cs = sctx->gfx_cs;
	unsigned idx;
	unsigned reg = regs->counter0_lo;
	unsigned reg_delta = 8; /* each counter is a LO/HI register pair */

	if (!(regs->layout & SI_PC_FAKE)) {
		if (regs->layout & SI_PC_REG_REVERSE)
			reg_delta = -reg_delta; /* unsigned wraparound: step downwards */

		for (idx = 0; idx < count; ++idx) {
			if (regs->counters)
				reg = regs->counters[idx]; /* irregular register list */

			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
					COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) |
					COPY_DATA_COUNT_SEL); /* 64 bits */
			radeon_emit(cs, reg >> 2);
			radeon_emit(cs, 0); /* unused */
			radeon_emit(cs, va);
			radeon_emit(cs, va >> 32);
			va += sizeof(uint64_t);
			reg += reg_delta;
		}
	} else {
		/* No real registers: fill the result slots with zeros. */
		for (idx = 0; idx < count; ++idx) {
			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
					COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) |
					COPY_DATA_COUNT_SEL);
			radeon_emit(cs, 0); /* immediate */
			radeon_emit(cs, 0);
			radeon_emit(cs, va);
			radeon_emit(cs, va >> 32);
			va += sizeof(uint64_t);
		}
	}
}
754
755 static void si_pc_query_destroy(struct si_screen *sscreen,
756 struct si_query *rquery)
757 {
758 struct si_query_pc *query = (struct si_query_pc *)rquery;
759
760 while (query->groups) {
761 struct si_query_group *group = query->groups;
762 query->groups = group->next;
763 FREE(group);
764 }
765
766 FREE(query->counters);
767
768 si_query_hw_destroy(sscreen, rquery);
769 }
770
/* Nothing to pre-initialize: the fence slot and every result slot are
 * fully written by si_pc_emit_start / si_pc_emit_read. */
static bool si_pc_query_prepare_buffer(struct si_screen *screen,
				       struct si_query_hw *hwquery,
				       struct r600_resource *buffer)
{
	/* no-op */
	return true;
}
778
/* Program all groups' selectors and start counting.
 * GRBM_GFX_INDEX is only rewritten when the targeted SE/instance changes,
 * and restored to broadcast before the counters are started. */
static void si_pc_query_emit_start(struct si_context *sctx,
				   struct si_query_hw *hwquery,
				   struct r600_resource *buffer, uint64_t va)
{
	struct si_query_pc *query = (struct si_query_pc *)hwquery;
	struct si_query_group *group;
	int current_se = -1;
	int current_instance = -1;

	if (query->shaders)
		si_pc_emit_shaders(sctx, query->shaders);

	for (group = query->groups; group; group = group->next) {
		struct si_pc_block *block = group->block;

		if (group->se != current_se || group->instance != current_instance) {
			current_se = group->se;
			current_instance = group->instance;
			si_pc_emit_instance(sctx, group->se, group->instance);
		}

		si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
	}

	/* Return to broadcast writes if any group narrowed the index. */
	if (current_se != -1 || current_instance != -1)
		si_pc_emit_instance(sctx, -1, -1);

	si_pc_emit_start(sctx, buffer, va);
}
808
/* Stop counting and read all groups' counters into the result buffer.
 * Groups with a negative se/instance (broadcast) are read once per
 * SE/instance; the write order here defines the result layout consumed by
 * si_pc_query_add_result. */
static void si_pc_query_emit_stop(struct si_context *sctx,
				  struct si_query_hw *hwquery,
				  struct r600_resource *buffer, uint64_t va)
{
	struct si_query_pc *query = (struct si_query_pc *)hwquery;
	struct si_query_group *group;

	si_pc_emit_stop(sctx, buffer, va);

	for (group = query->groups; group; group = group->next) {
		struct si_pc_block *block = group->block;
		unsigned se = group->se >= 0 ? group->se : 0;
		unsigned se_end = se + 1;

		/* Negative se on an SE block: iterate over every SE. */
		if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
			se_end = sctx->screen->info.max_se;

		do {
			unsigned instance = group->instance >= 0 ? group->instance : 0;

			do {
				si_pc_emit_instance(sctx, se, instance);
				si_pc_emit_read(sctx, block, group->num_counters, va);
				va += sizeof(uint64_t) * group->num_counters;
			} while (group->instance < 0 && ++instance < block->num_instances);
		} while (++se < se_end);
	}

	/* Restore broadcast writes. */
	si_pc_emit_instance(sctx, -1, -1);
}
839
/* Zero the accumulated per-counter results (result->batch array). */
static void si_pc_query_clear_result(struct si_query_hw *hwquery,
				     union pipe_query_result *result)
{
	struct si_query_pc *query = (struct si_query_pc *)hwquery;

	memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
}
847
/* Accumulate one readback buffer into the per-counter results. Each counter
 * sums 'qwords' slots spaced 'stride' uint64s apart (one slot per
 * SE/instance read in si_pc_query_emit_stop). Note that only the low
 * 32 bits of each 64-bit slot are added. */
static void si_pc_query_add_result(struct si_screen *screen,
				   struct si_query_hw *hwquery,
				   void *buffer,
				   union pipe_query_result *result)
{
	struct si_query_pc *query = (struct si_query_pc *)hwquery;
	uint64_t *results = buffer;
	unsigned i, j;

	for (i = 0; i < query->num_counters; ++i) {
		struct si_query_counter *counter = &query->counters[i];

		for (j = 0; j < counter->qwords; ++j) {
			/* Deliberate truncation to the low 32 bits of the slot. */
			uint32_t value = results[counter->base + j * counter->stride];
			result->batch[i].u64 += value;
		}
	}
}
866
/* Vtable plugging perfcounter batch queries into the generic si_query
 * machinery; only destroy is specialized. */
static struct si_query_ops batch_query_ops = {
	.destroy = si_pc_query_destroy,
	.begin = si_query_hw_begin,
	.end = si_query_hw_end,
	.get_result = si_query_hw_get_result
};

/* Hardware-query hooks for perfcounter batch queries. */
static struct si_query_hw_ops batch_query_hw_ops = {
	.prepare_buffer = si_pc_query_prepare_buffer,
	.emit_start = si_pc_query_emit_start,
	.emit_stop = si_pc_query_emit_stop,
	.clear_result = si_pc_query_clear_result,
	.add_result = si_pc_query_add_result,
};
881
/* Find the query's group for (block, sub_gid), creating it on first use.
 * Creation decodes sub_gid into shader stage, SE and instance (matching the
 * group ordering built by si_init_block_names) and updates query->shaders.
 * Returns NULL on a shader-mask conflict or allocation failure. */
static struct si_query_group *get_group_state(struct si_screen *screen,
					      struct si_query_pc *query,
					      struct si_pc_block *block,
					      unsigned sub_gid)
{
	struct si_query_group *group = query->groups;

	while (group) {
		if (group->block == block && group->sub_gid == sub_gid)
			return group;
		group = group->next;
	}

	group = CALLOC_STRUCT(si_query_group);
	if (!group)
		return NULL;

	group->block = block;
	group->sub_gid = sub_gid;

	if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
		/* Shader blocks encode the shader stage in the most
		 * significant part of sub_gid. */
		unsigned sub_gids = block->num_instances;
		unsigned shader_id;
		unsigned shaders;
		unsigned query_shaders;

		if (si_pc_block_has_per_se_groups(screen->perfcounters, block))
			sub_gids = sub_gids * screen->info.max_se;
		shader_id = sub_gid / sub_gids;
		sub_gid = sub_gid % sub_gids;

		shaders = si_pc_shader_type_bits[shader_id];

		/* There is only one global shader mask, so all shader-block
		 * counters of a query must agree on it. */
		query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
		if (query_shaders && query_shaders != shaders) {
			fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
			FREE(group);
			return NULL;
		}
		query->shaders = shaders;
	}

	if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
		// A non-zero value in query->shaders ensures that the shader
		// masking is reset unless the user explicitly requests one.
		query->shaders = SI_PC_SHADERS_WINDOWING;
	}

	if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) {
		group->se = sub_gid / block->num_instances;
		sub_gid = sub_gid % block->num_instances;
	} else {
		group->se = -1; /* broadcast / sum over SEs */
	}

	if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) {
		group->instance = sub_gid;
	} else {
		group->instance = -1; /* broadcast / sum over instances */
	}

	group->next = query->groups;
	query->groups = group;

	return group;
}
948
949 struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
950 unsigned num_queries,
951 unsigned *query_types)
952 {
953 struct si_screen *screen =
954 (struct si_screen *)ctx->screen;
955 struct si_perfcounters *pc = screen->perfcounters;
956 struct si_pc_block *block;
957 struct si_query_group *group;
958 struct si_query_pc *query;
959 unsigned base_gid, sub_gid, sub_index;
960 unsigned i, j;
961
962 if (!pc)
963 return NULL;
964
965 query = CALLOC_STRUCT(si_query_pc);
966 if (!query)
967 return NULL;
968
969 query->b.b.ops = &batch_query_ops;
970 query->b.ops = &batch_query_hw_ops;
971
972 query->num_counters = num_queries;
973
974 /* Collect selectors per group */
975 for (i = 0; i < num_queries; ++i) {
976 unsigned sub_gid;
977
978 if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
979 goto error;
980
981 block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
982 &base_gid, &sub_index);
983 if (!block)
984 goto error;
985
986 sub_gid = sub_index / block->b->selectors;
987 sub_index = sub_index % block->b->selectors;
988
989 group = get_group_state(screen, query, block, sub_gid);
990 if (!group)
991 goto error;
992
993 if (group->num_counters >= block->b->b->num_counters) {
994 fprintf(stderr,
995 "perfcounter group %s: too many selected\n",
996 block->b->b->name);
997 goto error;
998 }
999 group->selectors[group->num_counters] = sub_index;
1000 ++group->num_counters;
1001 }
1002
1003 /* Compute result bases and CS size per group */
1004 query->b.num_cs_dw_end = pc->num_stop_cs_dwords;
1005 query->b.num_cs_dw_end += pc->num_instance_cs_dwords;
1006
1007 i = 0;
1008 for (group = query->groups; group; group = group->next) {
1009 struct si_pc_block *block = group->block;
1010 unsigned read_dw;
1011 unsigned instances = 1;
1012
1013 if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
1014 instances = screen->info.max_se;
1015 if (group->instance < 0)
1016 instances *= block->num_instances;
1017
1018 group->result_base = i;
1019 query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
1020 i += instances * group->num_counters;
1021
1022 read_dw = 6 * group->num_counters;
1023 query->b.num_cs_dw_end += instances * read_dw;
1024 query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords;
1025 }
1026
1027 if (query->shaders) {
1028 if (query->shaders == SI_PC_SHADERS_WINDOWING)
1029 query->shaders = 0xffffffff;
1030 }
1031
1032 /* Map user-supplied query array to result indices */
1033 query->counters = CALLOC(num_queries, sizeof(*query->counters));
1034 for (i = 0; i < num_queries; ++i) {
1035 struct si_query_counter *counter = &query->counters[i];
1036 struct si_pc_block *block;
1037
1038 block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
1039 &base_gid, &sub_index);
1040
1041 sub_gid = sub_index / block->b->selectors;
1042 sub_index = sub_index % block->b->selectors;
1043
1044 group = get_group_state(screen, query, block, sub_gid);
1045 assert(group != NULL);
1046
1047 for (j = 0; j < group->num_counters; ++j) {
1048 if (group->selectors[j] == sub_index)
1049 break;
1050 }
1051
1052 counter->base = group->result_base + j;
1053 counter->stride = group->num_counters;
1054
1055 counter->qwords = 1;
1056 if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
1057 counter->qwords = screen->info.max_se;
1058 if (group->instance < 0)
1059 counter->qwords *= block->num_instances;
1060 }
1061
1062 if (!si_query_hw_init(screen, &query->b))
1063 goto error;
1064
1065 return (struct pipe_query *)query;
1066
1067 error:
1068 si_pc_query_destroy(screen, &query->b.b);
1069 return NULL;
1070 }
1071
/* Build the user-visible group and selector name tables of a block.
 * Group names are "<block>[<shader suffix>][<se>[_]][<instance>]" and each
 * selector name is "<group>_NNN". Returns false on allocation failure. */
static bool si_init_block_names(struct si_screen *screen,
				struct si_pc_block *block)
{
	bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
	bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
	unsigned i, j, k;
	unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
	unsigned namelen;
	char *groupname;
	char *p;

	if (per_instance_groups)
		groups_instance = block->num_instances;
	if (per_se_groups)
		groups_se = screen->info.max_se;
	if (block->b->b->flags & SI_PC_BLOCK_SHADER)
		groups_shader = ARRAY_SIZE(si_pc_shader_type_bits);

	/* Fixed stride of the group-name table: block name + NUL, plus an
	 * optional shader suffix (up to 3 chars), 1-digit SE, separator and
	 * 2-digit instance. */
	namelen = strlen(block->b->b->name);
	block->group_name_stride = namelen + 1;
	if (block->b->b->flags & SI_PC_BLOCK_SHADER)
		block->group_name_stride += 3;
	if (per_se_groups) {
		assert(groups_se <= 10);
		block->group_name_stride += 1;

		if (per_instance_groups)
			block->group_name_stride += 1;
	}
	if (per_instance_groups) {
		assert(groups_instance <= 100);
		block->group_name_stride += 2;
	}

	block->group_names = MALLOC(block->num_groups * block->group_name_stride);
	if (!block->group_names)
		return false;

	/* Fill the group names; the iteration order must match the sub_gid
	 * decoding done in get_group_state (shader, then SE, then instance). */
	groupname = block->group_names;
	for (i = 0; i < groups_shader; ++i) {
		const char *shader_suffix = si_pc_shader_type_suffixes[i];
		unsigned shaderlen = strlen(shader_suffix);
		for (j = 0; j < groups_se; ++j) {
			for (k = 0; k < groups_instance; ++k) {
				strcpy(groupname, block->b->b->name);
				p = groupname + namelen;

				if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
					strcpy(p, shader_suffix);
					p += shaderlen;
				}

				if (per_se_groups) {
					p += sprintf(p, "%d", j);
					if (per_instance_groups)
						*p++ = '_';
				}

				if (per_instance_groups)
					p += sprintf(p, "%d", k);

				groupname += block->group_name_stride;
			}
		}
	}

	/* Selector names: "_NNN" suffix (3 digits + '_') per selector. */
	assert(block->b->selectors <= 1000);
	block->selector_name_stride = block->group_name_stride + 4;
	block->selector_names = MALLOC(block->num_groups * block->b->selectors *
				       block->selector_name_stride);
	if (!block->selector_names)
		return false;

	groupname = block->group_names;
	p = block->selector_names;
	for (i = 0; i < block->num_groups; ++i) {
		for (j = 0; j < block->b->selectors; ++j) {
			sprintf(p, "%s_%03d", groupname, j);
			p += block->selector_name_stride;
		}
		groupname += block->group_name_stride;
	}

	return true;
}
1157
1158 int si_get_perfcounter_info(struct si_screen *screen,
1159 unsigned index,
1160 struct pipe_driver_query_info *info)
1161 {
1162 struct si_perfcounters *pc = screen->perfcounters;
1163 struct si_pc_block *block;
1164 unsigned base_gid, sub;
1165
1166 if (!pc)
1167 return 0;
1168
1169 if (!info) {
1170 unsigned bid, num_queries = 0;
1171
1172 for (bid = 0; bid < pc->num_blocks; ++bid) {
1173 num_queries += pc->blocks[bid].b->selectors *
1174 pc->blocks[bid].num_groups;
1175 }
1176
1177 return num_queries;
1178 }
1179
1180 block = lookup_counter(pc, index, &base_gid, &sub);
1181 if (!block)
1182 return 0;
1183
1184 if (!block->selector_names) {
1185 if (!si_init_block_names(screen, block))
1186 return 0;
1187 }
1188 info->name = block->selector_names + sub * block->selector_name_stride;
1189 info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
1190 info->max_value.u64 = 0;
1191 info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
1192 info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
1193 info->group_id = base_gid + sub / block->b->selectors;
1194 info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
1195 if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
1196 info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
1197 return 1;
1198 }
1199
1200 int si_get_perfcounter_group_info(struct si_screen *screen,
1201 unsigned index,
1202 struct pipe_driver_query_group_info *info)
1203 {
1204 struct si_perfcounters *pc = screen->perfcounters;
1205 struct si_pc_block *block;
1206
1207 if (!pc)
1208 return 0;
1209
1210 if (!info)
1211 return pc->num_groups;
1212
1213 block = lookup_group(pc, &index);
1214 if (!block)
1215 return 0;
1216
1217 if (!block->group_names) {
1218 if (!si_init_block_names(screen, block))
1219 return 0;
1220 }
1221 info->name = block->group_names + index * block->group_name_stride;
1222 info->num_queries = block->b->selectors;
1223 info->max_active_queries = block->b->b->num_counters;
1224 return 1;
1225 }
1226
1227 void si_destroy_perfcounters(struct si_screen *screen)
1228 {
1229 struct si_perfcounters *pc = screen->perfcounters;
1230 unsigned i;
1231
1232 if (!pc)
1233 return;
1234
1235 for (i = 0; i < pc->num_blocks; ++i) {
1236 FREE(pc->blocks[i].group_names);
1237 FREE(pc->blocks[i].selector_names);
1238 }
1239 FREE(pc->blocks);
1240 FREE(pc);
1241 screen->perfcounters = NULL;
1242 }
1243
1244 void si_init_perfcounters(struct si_screen *screen)
1245 {
1246 struct si_perfcounters *pc;
1247 const struct si_pc_block_gfxdescr *blocks;
1248 unsigned num_blocks;
1249 unsigned i;
1250
1251 switch (screen->info.chip_class) {
1252 case CIK:
1253 blocks = groups_CIK;
1254 num_blocks = ARRAY_SIZE(groups_CIK);
1255 break;
1256 case VI:
1257 blocks = groups_VI;
1258 num_blocks = ARRAY_SIZE(groups_VI);
1259 break;
1260 case GFX9:
1261 blocks = groups_gfx9;
1262 num_blocks = ARRAY_SIZE(groups_gfx9);
1263 break;
1264 case SI:
1265 default:
1266 return; /* not implemented */
1267 }
1268
1269 if (screen->info.max_sh_per_se != 1) {
1270 /* This should not happen on non-SI chips. */
1271 fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
1272 "supported (inaccurate performance counters)\n",
1273 screen->info.max_sh_per_se);
1274 }
1275
1276 screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
1277 if (!pc)
1278 return;
1279
1280 pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
1281 pc->num_instance_cs_dwords = 3;
1282
1283 pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
1284 pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
1285
1286 pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block));
1287 if (!pc->blocks)
1288 goto error;
1289 pc->num_blocks = num_blocks;
1290
1291 for (i = 0; i < num_blocks; ++i) {
1292 struct si_pc_block *block = &pc->blocks[i];
1293 block->b = &blocks[i];
1294 block->num_instances = block->b->instances;
1295
1296 if (!strcmp(block->b->b->name, "CB") ||
1297 !strcmp(block->b->b->name, "DB"))
1298 block->num_instances = screen->info.max_se;
1299 else if (!strcmp(block->b->b->name, "TCC"))
1300 block->num_instances = screen->info.num_tcc_blocks;
1301 else if (!strcmp(block->b->b->name, "IA"))
1302 block->num_instances = MAX2(1, screen->info.max_se / 2);
1303
1304 if (si_pc_block_has_per_instance_groups(pc, block)) {
1305 block->num_groups = block->num_instances;
1306 } else {
1307 block->num_groups = 1;
1308 }
1309
1310 if (si_pc_block_has_per_se_groups(pc, block))
1311 block->num_groups *= screen->info.max_se;
1312 if (block->b->b->flags & SI_PC_BLOCK_SHADER)
1313 block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits);
1314
1315 pc->num_groups += block->num_groups;
1316 }
1317
1318 return;
1319
1320 error:
1321 si_destroy_perfcounters(screen);
1322 }