ac,radeonsi: start adding support for gfx10.3
[mesa.git] / src / gallium / drivers / radeonsi / si_perfcounter.c
1 /*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "si_build_pm4.h"
26 #include "si_query.h"
27 #include "util/u_memory.h"
28
/* Properties describing how a hardware counter block is replicated across
 * the chip and how its instances are exposed to the user. */
enum si_pc_block_flags
{
   /* This block is part of the shader engine */
   SI_PC_BLOCK_SE = (1 << 0),

   /* Expose per-instance groups instead of summing all instances (within
    * an SE). */
   SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),

   /* Expose per-SE groups instead of summing instances across SEs. */
   SI_PC_BLOCK_SE_GROUPS = (1 << 2),

   /* Shader block */
   SI_PC_BLOCK_SHADER = (1 << 3),

   /* Non-shader block with perfcounters windowed by shaders. */
   SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
};
47
/* Describes how a block's SELECT/SELECT1 registers are arranged in the
 * register space; consumed by si_pc_emit_select() and si_pc_emit_read(). */
enum si_pc_reg_layout
{
   /* All secondary selector dwords follow as one block after the primary
    * selector dwords for the counters that have secondary selectors.
    *
    * Example:
    *    PERFCOUNTER0_SELECT
    *    PERFCOUNTER1_SELECT
    *    PERFCOUNTER0_SELECT1
    *    PERFCOUNTER1_SELECT1
    *    PERFCOUNTER2_SELECT
    *    PERFCOUNTER3_SELECT
    */
   SI_PC_MULTI_BLOCK = 0,

   /* Each secondary selector dword follows immediately after the
    * corresponding primary.
    *
    * Example:
    *    PERFCOUNTER0_SELECT
    *    PERFCOUNTER0_SELECT1
    *    PERFCOUNTER1_SELECT
    *    PERFCOUNTER1_SELECT1
    *    PERFCOUNTER2_SELECT
    *    PERFCOUNTER3_SELECT
    */
   SI_PC_MULTI_ALTERNATE = 1,

   /* All secondary selector dwords follow as one block after all primary
    * selector dwords.
    *
    * Example:
    *    PERFCOUNTER0_SELECT
    *    PERFCOUNTER1_SELECT
    *    PERFCOUNTER2_SELECT
    *    PERFCOUNTER3_SELECT
    *    PERFCOUNTER0_SELECT1
    *    PERFCOUNTER1_SELECT1
    */
   SI_PC_MULTI_TAIL = 2,

   /* Free-form arrangement of selector registers (listed explicitly in
    * si_pc_block_base::select). */
   SI_PC_MULTI_CUSTOM = 3,

   /* Mask extracting one of the four layouts above from the layout field. */
   SI_PC_MULTI_MASK = 3,

   /* Registers are laid out in decreasing rather than increasing order. */
   SI_PC_REG_REVERSE = 4,

   /* Block has no real registers; results are faked as zero (MC, SRBM). */
   SI_PC_FAKE = 8,
};
99
/* Chip-family-level description of one counter block's register layout.
 * Shared by all chips whose registers for that block are identical. */
struct si_pc_block_base {
   const char *name;      /* group-name prefix exposed to the user */
   unsigned num_counters; /* number of counter slots in the block */
   unsigned flags;        /* mask of si_pc_block_flags */

   unsigned select_or;   /* bits OR'd into every SELECT value written */
   unsigned select0;     /* first SELECT register, when they are contiguous */
   unsigned counter0_lo; /* first result (…_LO) register */
   unsigned *select;     /* explicit SELECT register list (SI_PC_MULTI_CUSTOM) */
   unsigned *counters;   /* explicit result register list; overrides counter0_lo */
   unsigned num_multi;   /* counters that also have a secondary SELECT1 */
   unsigned num_prelude; /* zero dwords emitted before the selects (e.g. CB filter) */
   unsigned layout;      /* si_pc_reg_layout bits */
};
114
/* Per-GPU-generation instantiation of a block: how many selector values
 * (event IDs) and hardware instances that generation exposes. */
struct si_pc_block_gfxdescr {
   struct si_pc_block_base *b;
   unsigned selectors; /* number of valid selector values */
   unsigned instances; /* instance count; 0 in the tables presumably means a
                        * default applied during init — not visible here */
};
120
/* Runtime (per-screen) state of a counter block, including the expanded
 * group/selector name strings exposed through the query API.
 * NOTE(review): the name arrays are filled in outside this view. */
struct si_pc_block {
   const struct si_pc_block_gfxdescr *b;
   unsigned num_instances; /* instances present on this specific chip */

   unsigned num_groups;        /* exposed groups (after per-SE/instance expansion) */
   char *group_names;          /* flat array of fixed-stride group name strings */
   unsigned group_name_stride; /* bytes between consecutive group names */

   char *selector_names;
   unsigned selector_name_stride;
};
132
/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
 * performance counter group IDs.
 */
static const char *const si_pc_shader_type_suffixes[] = {"", "_ES", "_GS", "_VS",
                                                         "_PS", "_LS", "_HS", "_CS"};

/* SQ_PERFCOUNTER_CTRL stage-enable masks, index-matched to the suffixes
 * above; index 0 ("") enables all stages at once. */
static const unsigned si_pc_shader_type_bits[] = {
   0x7f,
   S_036780_ES_EN(1),
   S_036780_GS_EN(1),
   S_036780_VS_EN(1),
   S_036780_PS_EN(1),
   S_036780_LS_EN(1),
   S_036780_HS_EN(1),
   S_036780_CS_EN(1),
};

/* Max counters per HW block */
#define SI_QUERY_MAX_COUNTERS 16

/* Sentinel stored in si_query_pc::shaders when only shader-windowed blocks
 * are used: ensures shader masking is reset to "all stages" unless the user
 * explicitly selected a stage (see get_group_state). */
#define SI_PC_SHADERS_WINDOWING (1u << 31)
154
/* One (block, SE, instance) combination activated by a query, with the
 * event selectors programmed into that block's counters. */
struct si_query_group {
   struct si_query_group *next;
   struct si_pc_block *block;
   unsigned sub_gid;     /* only used during init */
   unsigned result_base; /* only used during init */
   int se;               /* shader engine index, or -1 for broadcast/sum */
   int instance;         /* block instance index, or -1 for broadcast/sum */
   unsigned num_counters;
   unsigned selectors[SI_QUERY_MAX_COUNTERS];
};
165
/* Maps one user-visible counter to its samples in the result buffer:
 * `qwords` values starting at `base`, `stride` apart (summed on read). */
struct si_query_counter {
   unsigned base;
   unsigned qwords;
   unsigned stride; /* in uint64s */
};
171
/* A batched performance-counter query (see batch_query_ops). */
struct si_query_pc {
   struct si_query b;             /* base class; must be first */
   struct si_query_buffer buffer; /* GPU memory receiving counter samples */

   /* Size of the results in memory, in bytes. */
   unsigned result_size;

   unsigned shaders; /* shader-stage mask, or SI_PC_SHADERS_WINDOWING */
   unsigned num_counters;
   struct si_query_counter *counters;
   struct si_query_group *groups;
};
184
/* --------------------------------------------------------------------------
 * Counter-block register descriptions introduced with GFX7 (CIK). The
 * "cik_" blocks are reused by later generations whose register layout for
 * that block is unchanged. Do NOT reorder or edit values casually: they
 * encode the hardware register map.
 * -------------------------------------------------------------------------- */

static struct si_pc_block_base cik_CB = {
   .name = "CB",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

   /* select0 points at CB_PERFCOUNTER_FILTER; the single prelude zero
    * clears the filter before the selects. */
   .select0 = R_037000_CB_PERFCOUNTER_FILTER,
   .counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
   .num_multi = 1,
   .num_prelude = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

/* CPC select registers are not laid out in any regular pattern. */
static unsigned cik_CPC_select[] = {
   R_036024_CPC_PERFCOUNTER0_SELECT,
   R_036010_CPC_PERFCOUNTER0_SELECT1,
   R_03600C_CPC_PERFCOUNTER1_SELECT,
};
static struct si_pc_block_base cik_CPC = {
   .name = "CPC",
   .num_counters = 2,

   .select = cik_CPC_select,
   .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPF = {
   .name = "CPF",
   .num_counters = 2,

   .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPG = {
   .name = "CPG",
   .num_counters = 2,

   .select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_DB = {
   .name = "DB",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_037100_DB_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
   .num_multi = 3, // really only 2, but there's a gap between registers
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_GDS = {
   .name = "GDS",
   .num_counters = 4,

   .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_TAIL,
};

/* GRBM result registers are not evenly spaced, so list them explicitly. */
static unsigned cik_GRBM_counters[] = {
   R_034100_GRBM_PERFCOUNTER0_LO,
   R_03410C_GRBM_PERFCOUNTER1_LO,
};
static struct si_pc_block_base cik_GRBM = {
   .name = "GRBM",
   .num_counters = 2,

   .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
   .counters = cik_GRBM_counters,
};

static struct si_pc_block_base cik_GRBMSE = {
   .name = "GRBMSE",
   .num_counters = 4,

   .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
   .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
};

static struct si_pc_block_base cik_IA = {
   .name = "IA",
   .num_counters = 4,

   .select0 = R_036210_IA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_PA_SC = {
   .name = "PA_SC",
   .num_counters = 8,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

/* According to docs, PA_SU counters are only 48 bits wide. */
static struct si_pc_block_base cik_PA_SU = {
   .name = "PA_SU",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_SPI = {
   .name = "SPI",
   .num_counters = 6,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_BLOCK,
};

static struct si_pc_block_base cik_SQ = {
   .name = "SQ",
   .num_counters = 16,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER,

   .select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
   /* Count events from all SQC banks/clients and all SIMDs. */
   .select_or = S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15),
   .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
};

static struct si_pc_block_base cik_SX = {
   .name = "SX",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036900_SX_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_TA = {
   .name = "TA",
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TD = {
   .name = "TD",
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCA = {
   .name = "TCA",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCC = {
   .name = "TCC",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCP = {
   .name = "TCP",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_VGT = {
   .name = "VGT",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_WD = {
   .name = "WD",
   .num_counters = 4,

   .select0 = R_036200_WD_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
};

/* MC and SRBM have no accessible counter registers here; results are faked
 * as zero so the group IDs stay stable (GPUPerfStudio compatibility). */
static struct si_pc_block_base cik_MC = {
   .name = "MC",
   .num_counters = 4,

   .layout = SI_PC_FAKE,
};

static struct si_pc_block_base cik_SRBM = {
   .name = "SRBM",
   .num_counters = 2,

   .layout = SI_PC_FAKE,
};
425
/* --------------------------------------------------------------------------
 * Counter blocks new in (or re-laid-out for) GFX10.
 * -------------------------------------------------------------------------- */

static struct si_pc_block_base gfx10_CHA = {
   .name = "CHA",
   .num_counters = 4,

   .select0 = R_037780_CHA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035800_CHA_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_CHCG = {
   .name = "CHCG",
   .num_counters = 4,

   .select0 = R_036F18_CHCG_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034F20_CHCG_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_CHC = {
   .name = "CHC",
   .num_counters = 4,

   .select0 = R_036F00_CHC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034F00_CHC_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GCR = {
   .name = "GCR",
   .num_counters = 2,

   .select0 = R_037580_GCR_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035480_GCR_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GE = {
   .name = "GE",
   .num_counters = 12,

   .select0 = R_036200_GE_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034200_GE_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL1A = {
   .name = "GL1A",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_037700_GL1A_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035700_GL1A_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL1C = {
   .name = "GL1C",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036E80_GL1C_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E80_GL1C_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL2A = {
   .name = "GL2A",
   .num_counters = 4,

   .select0 = R_036E40_GL2A_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E40_GL2A_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL2C = {
   .name = "GL2C",
   .num_counters = 4,

   .select0 = R_036E00_GL2C_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E00_GL2C_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

/* PA_PH SELECT/SELECT1 registers are scattered; list them explicitly in
 * (SELECT[, SELECT1]) order as si_pc_emit_select's CUSTOM path expects. */
static unsigned gfx10_PA_PH_select[] = {
   R_037600_PA_PH_PERFCOUNTER0_SELECT,
   R_037604_PA_PH_PERFCOUNTER0_SELECT1,
   R_037608_PA_PH_PERFCOUNTER1_SELECT,
   R_037640_PA_PH_PERFCOUNTER1_SELECT1,
   R_03760C_PA_PH_PERFCOUNTER2_SELECT,
   R_037644_PA_PH_PERFCOUNTER2_SELECT1,
   R_037610_PA_PH_PERFCOUNTER3_SELECT,
   R_037648_PA_PH_PERFCOUNTER3_SELECT1,
   R_037614_PA_PH_PERFCOUNTER4_SELECT,
   R_037618_PA_PH_PERFCOUNTER5_SELECT,
   R_03761C_PA_PH_PERFCOUNTER6_SELECT,
   R_037620_PA_PH_PERFCOUNTER7_SELECT,
};
static struct si_pc_block_base gfx10_PA_PH = {
   .name = "PA_PH",
   .num_counters = 8,
   .flags = SI_PC_BLOCK_SE,

   .select = gfx10_PA_PH_select,
   .counter0_lo = R_035600_PA_PH_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_CUSTOM,
};

static struct si_pc_block_base gfx10_PA_SU = {
   .name = "PA_SU",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_RLC = {
   .name = "RLC",
   .num_counters = 2,

   .select0 = R_037304_RLC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035200_RLC_PERFCOUNTER0_LO,
   .num_multi = 0,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_RMI = {
   .name = "RMI",
   /* Actually 4, but the 2nd counter is missing the secondary selector while
    * the 3rd counter has it, which complicates the register layout. */
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_037400_RMI_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035300_RMI_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_UTCL1 = {
   .name = "UTCL1",
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_03758C_UTCL1_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035470_UTCL1_PERFCOUNTER0_LO,
   .num_multi = 0,
   .layout = SI_PC_MULTI_ALTERNATE,
};
587
/* Both the number of instances and selectors varies between chips of the same
 * class. We only differentiate by class here and simply expose the maximum
 * number over all chips in a class.
 *
 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
 * blindly once it believes it has identified the hardware, so the order of
 * blocks here matters.
 *
 * Table entry format: { block descriptor, selector count[, instance count] }.
 */
static struct si_pc_block_gfxdescr groups_CIK[] = {
   {&cik_CB, 226}, {&cik_CPF, 17}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15},
   {&cik_PA_SU, 153}, {&cik_PA_SC, 395}, {&cik_SPI, 186}, {&cik_SQ, 252}, {&cik_SX, 32},
   {&cik_TA, 111}, {&cik_TCA, 39, 2}, {&cik_TCC, 160}, {&cik_TD, 55}, {&cik_TCP, 154},
   {&cik_GDS, 121}, {&cik_VGT, 140}, {&cik_IA, 22}, {&cik_MC, 22}, {&cik_SRBM, 19},
   {&cik_WD, 22}, {&cik_CPG, 46}, {&cik_CPC, 22},

};

static struct si_pc_block_gfxdescr groups_VI[] = {
   {&cik_CB, 405}, {&cik_CPF, 19}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15},
   {&cik_PA_SU, 154}, {&cik_PA_SC, 397}, {&cik_SPI, 197}, {&cik_SQ, 273}, {&cik_SX, 34},
   {&cik_TA, 119}, {&cik_TCA, 35, 2}, {&cik_TCC, 192}, {&cik_TD, 55}, {&cik_TCP, 180},
   {&cik_GDS, 121}, {&cik_VGT, 147}, {&cik_IA, 24}, {&cik_MC, 22}, {&cik_SRBM, 27},
   {&cik_WD, 37}, {&cik_CPG, 48}, {&cik_CPC, 24},

};

static struct si_pc_block_gfxdescr groups_gfx9[] = {
   {&cik_CB, 438}, {&cik_CPF, 32}, {&cik_DB, 328}, {&cik_GRBM, 38}, {&cik_GRBMSE, 16},
   {&cik_PA_SU, 292}, {&cik_PA_SC, 491}, {&cik_SPI, 196}, {&cik_SQ, 374}, {&cik_SX, 208},
   {&cik_TA, 119}, {&cik_TCA, 35, 2}, {&cik_TCC, 256}, {&cik_TD, 57}, {&cik_TCP, 85},
   {&cik_GDS, 121}, {&cik_VGT, 148}, {&cik_IA, 32}, {&cik_WD, 58}, {&cik_CPG, 59},
   {&cik_CPC, 35},
};

static struct si_pc_block_gfxdescr groups_gfx10[] = {
   {&cik_CB, 461},
   {&gfx10_CHA, 45},
   {&gfx10_CHCG, 35},
   {&gfx10_CHC, 35},
   {&cik_CPC, 47},
   {&cik_CPF, 40},
   {&cik_CPG, 82},
   {&cik_DB, 370},
   {&gfx10_GCR, 94},
   {&cik_GDS, 123},
   {&gfx10_GE, 315},
   {&gfx10_GL1A, 36},
   {&gfx10_GL1C, 64},
   {&gfx10_GL2A, 91},
   {&gfx10_GL2C, 235},
   {&cik_GRBM, 47},
   {&cik_GRBMSE, 19},
   {&gfx10_PA_PH, 960},
   {&cik_PA_SC, 552},
   {&gfx10_PA_SU, 266},
   {&gfx10_RLC, 7},
   {&gfx10_RMI, 258},
   {&cik_SPI, 329},
   {&cik_SQ, 509},
   {&cik_SX, 225},
   {&cik_TA, 226},
   {&cik_TCP, 77},
   {&cik_TD, 61},
   {&gfx10_UTCL1, 15},
};
653
654 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc,
655 const struct si_pc_block *block)
656 {
657 return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS ||
658 (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se);
659 }
660
661 static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc,
662 const struct si_pc_block *block)
663 {
664 return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS ||
665 (block->num_instances > 1 && pc->separate_instance);
666 }
667
668 static struct si_pc_block *lookup_counter(struct si_perfcounters *pc, unsigned index,
669 unsigned *base_gid, unsigned *sub_index)
670 {
671 struct si_pc_block *block = pc->blocks;
672 unsigned bid;
673
674 *base_gid = 0;
675 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
676 unsigned total = block->num_groups * block->b->selectors;
677
678 if (index < total) {
679 *sub_index = index;
680 return block;
681 }
682
683 index -= total;
684 *base_gid += block->num_groups;
685 }
686
687 return NULL;
688 }
689
690 static struct si_pc_block *lookup_group(struct si_perfcounters *pc, unsigned *index)
691 {
692 unsigned bid;
693 struct si_pc_block *block = pc->blocks;
694
695 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
696 if (*index < block->num_groups)
697 return block;
698 *index -= block->num_groups;
699 }
700
701 return NULL;
702 }
703
704 static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
705 {
706 struct radeon_cmdbuf *cs = sctx->gfx_cs;
707 unsigned value = S_030800_SH_BROADCAST_WRITES(1);
708
709 if (se >= 0) {
710 value |= S_030800_SE_INDEX(se);
711 } else {
712 value |= S_030800_SE_BROADCAST_WRITES(1);
713 }
714
715 if (sctx->chip_class >= GFX10) {
716 /* TODO: Expose counters from each shader array separately if needed. */
717 value |= S_030800_SA_BROADCAST_WRITES(1);
718 }
719
720 if (instance >= 0) {
721 value |= S_030800_INSTANCE_INDEX(instance);
722 } else {
723 value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
724 }
725
726 radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
727 }
728
/* Window SQ performance counters to the given shader-stage mask (low 7 bits,
 * see si_pc_shader_type_bits) by programming SQ_PERFCOUNTER_CTRL and the
 * register immediately following it (written all-ones; presumably a
 * CU/SIMD mask — confirm against the register spec). */
static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
   radeon_emit(cs, shaders & 0x7f);
   radeon_emit(cs, 0xffffffff);
}
737
/* Program a block's *_PERFCOUNTER*_SELECT registers with the given event
 * selectors, following the block's register layout (enum si_pc_reg_layout).
 * Counters with a secondary SELECT1 register get that register cleared to 0.
 * Writes must land on the SE/instance previously chosen via
 * si_pc_emit_instance(). */
static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count,
                              unsigned *selectors)
{
   struct si_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned idx;
   unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
   unsigned dw;

   assert(count <= regs->num_counters);

   /* Fake blocks (MC, SRBM) have no selector registers to program. */
   if (regs->layout & SI_PC_FAKE)
      return;

   if (layout_multi == SI_PC_MULTI_BLOCK) {
      assert(!(regs->layout & SI_PC_REG_REVERSE));

      /* When all counters with SELECT1 are used, the SELECT1 block sits
       * inside the contiguous range and one register sequence covers
       * everything; otherwise the used SELECT1 registers need their own
       * write (second reg_seq below). */
      dw = count + regs->num_prelude;
      if (count >= regs->num_multi)
         dw += regs->num_multi;
      radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
      for (idx = 0; idx < regs->num_prelude; ++idx)
         radeon_emit(cs, 0);
      for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
         radeon_emit(cs, selectors[idx] | regs->select_or);

      if (count < regs->num_multi) {
         unsigned select1 = regs->select0 + 4 * regs->num_multi;
         radeon_set_uconfig_reg_seq(cs, select1, count);
      }

      /* Zero the SELECT1 dwords (belongs to whichever reg_seq is open). */
      for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
         radeon_emit(cs, 0);

      if (count > regs->num_multi) {
         for (idx = regs->num_multi; idx < count; ++idx)
            radeon_emit(cs, selectors[idx] | regs->select_or);
      }
   } else if (layout_multi == SI_PC_MULTI_TAIL) {
      unsigned select1, select1_count;

      assert(!(regs->layout & SI_PC_REG_REVERSE));

      /* All primary selects first... */
      radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
      for (idx = 0; idx < regs->num_prelude; ++idx)
         radeon_emit(cs, 0);
      for (idx = 0; idx < count; ++idx)
         radeon_emit(cs, selectors[idx] | regs->select_or);

      /* ...then the SELECT1 block, which starts after ALL primary selects. */
      select1 = regs->select0 + 4 * regs->num_counters;
      select1_count = MIN2(count, regs->num_multi);
      radeon_set_uconfig_reg_seq(cs, select1, select1_count);
      for (idx = 0; idx < select1_count; ++idx)
         radeon_emit(cs, 0);
   } else if (layout_multi == SI_PC_MULTI_CUSTOM) {
      /* regs->select lists SELECT (and, for the first num_multi counters,
       * SELECT1) registers explicitly, in interleaved order. */
      unsigned *reg = regs->select;
      for (idx = 0; idx < count; ++idx) {
         radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
         if (idx < regs->num_multi)
            radeon_set_uconfig_reg(cs, *reg++, 0);
      }
   } else {
      assert(layout_multi == SI_PC_MULTI_ALTERNATE);

      unsigned reg_base = regs->select0;
      unsigned reg_count = count + MIN2(count, regs->num_multi);
      reg_count += regs->num_prelude;

      if (!(regs->layout & SI_PC_REG_REVERSE)) {
         radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

         for (idx = 0; idx < regs->num_prelude; ++idx)
            radeon_emit(cs, 0);
         for (idx = 0; idx < count; ++idx) {
            radeon_emit(cs, selectors[idx] | regs->select_or);
            if (idx < regs->num_multi)
               radeon_emit(cs, 0);
         }
      } else {
         /* Registers descend from select0; rebase the sequence at the lowest
          * address and emit the dwords in reversed order. */
         reg_base -= (reg_count - 1) * 4;
         radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

         for (idx = count; idx > 0; --idx) {
            if (idx <= regs->num_multi)
               radeon_emit(cs, 0);
            radeon_emit(cs, selectors[idx - 1] | regs->select_or);
         }
         for (idx = 0; idx < regs->num_prelude; ++idx)
            radeon_emit(cs, 0);
      }
   }
}
830
/* Reset and start all programmed performance counters.
 *
 * The fence dword at va is set to 1 here; si_pc_emit_stop() later overwrites
 * it with 0 via an end-of-pipe event and busy-waits on it, so the pair forms
 * a start/stop fence protocol. This also adds the buffer to the CS, which is
 * why si_pc_emit_stop() doesn't have to. */
static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
                   COPY_DATA_IMM, NULL, 1);

   /* Reset counters first, then kick them off. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_START_COUNTING));
}
845
/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again in here.
 *
 * Stops all counters: writes 0 to the fence dword at va at bottom-of-pipe,
 * waits for it (ensuring all prior work finished counting), then samples and
 * stops the counters so they can be read back via si_pc_emit_read(). */
static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);

   /* Latch the current counter values, then stop counting. */
   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(
      cs, R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1));
}
864
/* Copy `count` 64-bit counter results of a block from their *_LO/*_HI
 * registers into memory at va, one uint64_t per counter. Fake blocks
 * (SI_PC_FAKE) get zeros written instead so result layout stays uniform. */
static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count,
                            uint64_t va)
{
   struct si_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned idx;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8; /* LO/HI register pairs are 8 bytes apart */

   if (!(regs->layout & SI_PC_FAKE)) {
      /* Unsigned negation: reg += reg_delta wraps around to step downwards. */
      if (regs->layout & SI_PC_REG_REVERSE)
         reg_delta = -reg_delta;

      for (idx = 0; idx < count; ++idx) {
         /* Irregularly spaced result registers are listed explicitly. */
         if (regs->counters)
            reg = regs->counters[idx];

         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_COUNT_SEL); /* 64 bits */
         radeon_emit(cs, reg >> 2);
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         va += sizeof(uint64_t);
         reg += reg_delta;
      }
   } else {
      /* Fake block: fill the result slots with immediate zeros. */
      for (idx = 0; idx < count; ++idx) {
         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_COUNT_SEL);
         radeon_emit(cs, 0); /* immediate */
         radeon_emit(cs, 0);
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         va += sizeof(uint64_t);
      }
   }
}
905
906 static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
907 {
908 struct si_query_pc *query = (struct si_query_pc *)squery;
909
910 while (query->groups) {
911 struct si_query_group *group = query->groups;
912 query->groups = group->next;
913 FREE(group);
914 }
915
916 FREE(query->counters);
917
918 si_query_buffer_destroy(sctx->screen, &query->buffer);
919 FREE(query);
920 }
921
/* (Re-)start a performance-counter query: allocate result memory, set up
 * shader windowing, program every group's counter selects on its target
 * SE/instance, and start the counters. */
static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;
   int current_se = -1;
   int current_instance = -1;

   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
      return;
   si_need_gfx_cs_space(sctx);

   if (query->shaders)
      si_pc_emit_shaders(sctx, query->shaders);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct si_pc_block *block = group->block;

      /* Only re-program GRBM_GFX_INDEX when the target SE/instance changes
       * between consecutive groups. */
      if (group->se != current_se || group->instance != current_instance) {
         current_se = group->se;
         current_instance = group->instance;
         si_pc_emit_instance(sctx, group->se, group->instance);
      }

      si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
   }

   /* Restore broadcast mode before starting. */
   if (current_se != -1 || current_instance != -1)
      si_pc_emit_instance(sctx, -1, -1);

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   si_pc_emit_start(sctx, query->buffer.buf, va);
}
956
/* Stop the counters and read all results into the query buffer. For groups
 * with se/instance == -1 (summed), every SE/instance is read individually
 * and the values are added up later in si_pc_query_add_result(). */
static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   if (!query->buffer.buf)
      return;

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->buffer.results_end += query->result_size;

   si_pc_emit_stop(sctx, query->buffer.buf, va);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct si_pc_block *block = group->block;
      unsigned se = group->se >= 0 ? group->se : 0;
      unsigned se_end = se + 1;

      /* A per-SE block with se == -1 must be sampled on every SE. */
      if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
         se_end = sctx->screen->info.max_se;

      do {
         unsigned instance = group->instance >= 0 ? group->instance : 0;

         do {
            si_pc_emit_instance(sctx, se, instance);
            si_pc_emit_read(sctx, block, group->num_counters, va);
            va += sizeof(uint64_t) * group->num_counters;
         } while (group->instance < 0 && ++instance < block->num_instances);
      } while (++se < se_end);
   }

   /* Back to broadcast mode. */
   si_pc_emit_instance(sctx, -1, -1);
}
990
/* Begin a perf-counter query: reset previous results, register the query as
 * active (so it gets suspended/resumed around CS flushes), and start it. */
static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_query_buffer_reset(ctx, &query->buffer);

   list_addtail(&query->b.active_list, &ctx->active_queries);
   ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;

   si_pc_query_resume(ctx, squery);

   return true;
}
1004
1005 static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
1006 {
1007 struct si_query_pc *query = (struct si_query_pc *)squery;
1008
1009 si_pc_query_suspend(ctx, squery);
1010
1011 list_del(&squery->active_list);
1012 ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;
1013
1014 return query->buffer.buf != NULL;
1015 }
1016
1017 static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
1018 union pipe_query_result *result)
1019 {
1020 uint64_t *results = buffer;
1021 unsigned i, j;
1022
1023 for (i = 0; i < query->num_counters; ++i) {
1024 struct si_query_counter *counter = &query->counters[i];
1025
1026 for (j = 0; j < counter->qwords; ++j) {
1027 uint32_t value = results[counter->base + j * counter->stride];
1028 result->batch[i].u64 += value;
1029 }
1030 }
1031 }
1032
/* Map all result buffers and sum every recorded snapshot into `result`.
 * Returns false if a buffer could not be mapped (e.g. non-blocking map while
 * the GPU is still writing and wait == false). */
static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);

   /* Walk the chain of result buffers (one extra buffer per overflow). */
   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
      unsigned results_base = 0;
      void *map;

      /* If the query was already flushed, a plain map suffices; otherwise
       * synchronize with in-flight work that may still write the buffer. */
      if (squery->b.flushed)
         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      /* Each suspend appended one result_size-sized snapshot. */
      while (results_base != qbuf->results_end) {
         si_pc_query_add_result(query, map + results_base, result);
         results_base += query->result_size;
      }
   }

   return true;
}
1061
/* Query vtable used by batched performance counter queries
 * created in si_create_batch_query. */
static const struct si_query_ops batch_query_ops = {
   .destroy = si_pc_query_destroy,
   .begin = si_pc_query_begin,
   .end = si_pc_query_end,
   .get_result = si_pc_query_get_result,

   .suspend = si_pc_query_suspend,
   .resume = si_pc_query_resume,
};
1071
/* Find or create the si_query_group for (block, sub_gid) within \p query.
 *
 * sub_gid encodes, from most to least significant digit: the shader type
 * (shader blocks only), the SE index (per-SE groups only), and the
 * instance index (per-instance groups only); the fields are peeled off
 * in that order below.
 *
 * Returns NULL on allocation failure or when the requested shader group
 * is incompatible with the shader mask already chosen for this query.
 */
static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
                                              struct si_pc_block *block, unsigned sub_gid)
{
   struct si_query_group *group = query->groups;

   /* Reuse an existing group for the same (block, sub_gid). */
   while (group) {
      if (group->block == block && group->sub_gid == sub_gid)
         return group;
      group = group->next;
   }

   group = CALLOC_STRUCT(si_query_group);
   if (!group)
      return NULL;

   group->block = block;
   group->sub_gid = sub_gid;

   if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
      /* Peel the shader type off the top of sub_gid. */
      unsigned sub_gids = block->num_instances;
      unsigned shader_id;
      unsigned shaders;
      unsigned query_shaders;

      if (si_pc_block_has_per_se_groups(screen->perfcounters, block))
         sub_gids = sub_gids * screen->info.max_se;
      shader_id = sub_gid / sub_gids;
      sub_gid = sub_gid % sub_gids;

      shaders = si_pc_shader_type_bits[shader_id];

      /* All shader blocks in one batch query must share one shader mask. */
      query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
      if (query_shaders && query_shaders != shaders) {
         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
         FREE(group);
         return NULL;
      }
      query->shaders = shaders;
   }

   if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
      // A non-zero value in query->shaders ensures that the shader
      // masking is reset unless the user explicitly requests one.
      query->shaders = SI_PC_SHADERS_WINDOWING;
   }

   /* Next, peel off the SE index if this block exposes per-SE groups;
    * se == -1 means "sum over all SEs". */
   if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) {
      group->se = sub_gid / block->num_instances;
      sub_gid = sub_gid % block->num_instances;
   } else {
      group->se = -1;
   }

   /* What remains of sub_gid is the instance index;
    * instance == -1 means "sum over all instances". */
   if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) {
      group->instance = sub_gid;
   } else {
      group->instance = -1;
   }

   /* Prepend to the query's group list. */
   group->next = query->groups;
   query->groups = group;

   return group;
}
1136
1137 struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
1138 unsigned *query_types)
1139 {
1140 struct si_screen *screen = (struct si_screen *)ctx->screen;
1141 struct si_perfcounters *pc = screen->perfcounters;
1142 struct si_pc_block *block;
1143 struct si_query_group *group;
1144 struct si_query_pc *query;
1145 unsigned base_gid, sub_gid, sub_index;
1146 unsigned i, j;
1147
1148 if (!pc)
1149 return NULL;
1150
1151 query = CALLOC_STRUCT(si_query_pc);
1152 if (!query)
1153 return NULL;
1154
1155 query->b.ops = &batch_query_ops;
1156
1157 query->num_counters = num_queries;
1158
1159 /* Collect selectors per group */
1160 for (i = 0; i < num_queries; ++i) {
1161 unsigned sub_gid;
1162
1163 if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
1164 goto error;
1165
1166 block =
1167 lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
1168 if (!block)
1169 goto error;
1170
1171 sub_gid = sub_index / block->b->selectors;
1172 sub_index = sub_index % block->b->selectors;
1173
1174 group = get_group_state(screen, query, block, sub_gid);
1175 if (!group)
1176 goto error;
1177
1178 if (group->num_counters >= block->b->b->num_counters) {
1179 fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
1180 goto error;
1181 }
1182 group->selectors[group->num_counters] = sub_index;
1183 ++group->num_counters;
1184 }
1185
1186 /* Compute result bases and CS size per group */
1187 query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
1188 query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
1189
1190 i = 0;
1191 for (group = query->groups; group; group = group->next) {
1192 struct si_pc_block *block = group->block;
1193 unsigned read_dw;
1194 unsigned instances = 1;
1195
1196 if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
1197 instances = screen->info.max_se;
1198 if (group->instance < 0)
1199 instances *= block->num_instances;
1200
1201 group->result_base = i;
1202 query->result_size += sizeof(uint64_t) * instances * group->num_counters;
1203 i += instances * group->num_counters;
1204
1205 read_dw = 6 * group->num_counters;
1206 query->b.num_cs_dw_suspend += instances * read_dw;
1207 query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
1208 }
1209
1210 if (query->shaders) {
1211 if (query->shaders == SI_PC_SHADERS_WINDOWING)
1212 query->shaders = 0xffffffff;
1213 }
1214
1215 /* Map user-supplied query array to result indices */
1216 query->counters = CALLOC(num_queries, sizeof(*query->counters));
1217 for (i = 0; i < num_queries; ++i) {
1218 struct si_query_counter *counter = &query->counters[i];
1219 struct si_pc_block *block;
1220
1221 block =
1222 lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
1223
1224 sub_gid = sub_index / block->b->selectors;
1225 sub_index = sub_index % block->b->selectors;
1226
1227 group = get_group_state(screen, query, block, sub_gid);
1228 assert(group != NULL);
1229
1230 for (j = 0; j < group->num_counters; ++j) {
1231 if (group->selectors[j] == sub_index)
1232 break;
1233 }
1234
1235 counter->base = group->result_base + j;
1236 counter->stride = group->num_counters;
1237
1238 counter->qwords = 1;
1239 if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
1240 counter->qwords = screen->info.max_se;
1241 if (group->instance < 0)
1242 counter->qwords *= block->num_instances;
1243 }
1244
1245 return (struct pipe_query *)query;
1246
1247 error:
1248 si_pc_query_destroy((struct si_context *)ctx, &query->b);
1249 return NULL;
1250 }
1251
/* Build the human-readable group and selector name tables for \p block.
 *
 * Group names have the form NAME[shader-suffix][SE][_][instance]; selector
 * names append "_NNN". Both tables use fixed strides so lookups are simple
 * pointer arithmetic; the strides are sized below to fit the widest name.
 * The tables are freed in si_destroy_perfcounters.
 * Returns false on allocation failure.
 */
static bool si_init_block_names(struct si_screen *screen, struct si_pc_block *block)
{
   bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
   bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
   unsigned i, j, k;
   unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
   unsigned namelen;
   char *groupname;
   char *p;

   if (per_instance_groups)
      groups_instance = block->num_instances;
   if (per_se_groups)
      groups_se = screen->info.max_se;
   if (block->b->b->flags & SI_PC_BLOCK_SHADER)
      groups_shader = ARRAY_SIZE(si_pc_shader_type_bits);

   /* Compute the fixed per-name stride, including the NUL terminator.
    * NOTE(review): the +3 assumes shader suffixes are at most 3 chars —
    * verify against si_pc_shader_type_suffixes if it changes. */
   namelen = strlen(block->b->b->name);
   block->group_name_stride = namelen + 1;
   if (block->b->b->flags & SI_PC_BLOCK_SHADER)
      block->group_name_stride += 3;
   if (per_se_groups) {
      assert(groups_se <= 10); /* one decimal digit for the SE index */
      block->group_name_stride += 1;

      if (per_instance_groups)
         block->group_name_stride += 1; /* '_' separator */
   }
   if (per_instance_groups) {
      assert(groups_instance <= 100); /* two digits for the instance index */
      block->group_name_stride += 2;
   }

   block->group_names = MALLOC(block->num_groups * block->group_name_stride);
   if (!block->group_names)
      return false;

   /* Emit group names in shader-major, SE, instance order — this must
    * match the sub_gid decomposition in get_group_state. */
   groupname = block->group_names;
   for (i = 0; i < groups_shader; ++i) {
      const char *shader_suffix = si_pc_shader_type_suffixes[i];
      unsigned shaderlen = strlen(shader_suffix);
      for (j = 0; j < groups_se; ++j) {
         for (k = 0; k < groups_instance; ++k) {
            strcpy(groupname, block->b->b->name);
            p = groupname + namelen;

            if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
               strcpy(p, shader_suffix);
               p += shaderlen;
            }

            if (per_se_groups) {
               p += sprintf(p, "%d", j);
               if (per_instance_groups)
                  *p++ = '_';
            }

            if (per_instance_groups)
               p += sprintf(p, "%d", k);

            groupname += block->group_name_stride;
         }
      }
   }

   /* Selector names: "<group>_NNN" with a zero-padded 3-digit index. */
   assert(block->b->selectors <= 1000);
   block->selector_name_stride = block->group_name_stride + 4;
   block->selector_names =
      MALLOC(block->num_groups * block->b->selectors * block->selector_name_stride);
   if (!block->selector_names)
      return false;

   groupname = block->group_names;
   p = block->selector_names;
   for (i = 0; i < block->num_groups; ++i) {
      for (j = 0; j < block->b->selectors; ++j) {
         sprintf(p, "%s_%03d", groupname, j);
         p += block->selector_name_stride;
      }
      groupname += block->group_name_stride;
   }

   return true;
}
1336
1337 int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
1338 struct pipe_driver_query_info *info)
1339 {
1340 struct si_perfcounters *pc = screen->perfcounters;
1341 struct si_pc_block *block;
1342 unsigned base_gid, sub;
1343
1344 if (!pc)
1345 return 0;
1346
1347 if (!info) {
1348 unsigned bid, num_queries = 0;
1349
1350 for (bid = 0; bid < pc->num_blocks; ++bid) {
1351 num_queries += pc->blocks[bid].b->selectors * pc->blocks[bid].num_groups;
1352 }
1353
1354 return num_queries;
1355 }
1356
1357 block = lookup_counter(pc, index, &base_gid, &sub);
1358 if (!block)
1359 return 0;
1360
1361 if (!block->selector_names) {
1362 if (!si_init_block_names(screen, block))
1363 return 0;
1364 }
1365 info->name = block->selector_names + sub * block->selector_name_stride;
1366 info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
1367 info->max_value.u64 = 0;
1368 info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
1369 info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
1370 info->group_id = base_gid + sub / block->b->selectors;
1371 info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
1372 if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
1373 info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
1374 return 1;
1375 }
1376
1377 int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
1378 struct pipe_driver_query_group_info *info)
1379 {
1380 struct si_perfcounters *pc = screen->perfcounters;
1381 struct si_pc_block *block;
1382
1383 if (!pc)
1384 return 0;
1385
1386 if (!info)
1387 return pc->num_groups;
1388
1389 block = lookup_group(pc, &index);
1390 if (!block)
1391 return 0;
1392
1393 if (!block->group_names) {
1394 if (!si_init_block_names(screen, block))
1395 return 0;
1396 }
1397 info->name = block->group_names + index * block->group_name_stride;
1398 info->num_queries = block->b->selectors;
1399 info->max_active_queries = block->b->b->num_counters;
1400 return 1;
1401 }
1402
1403 void si_destroy_perfcounters(struct si_screen *screen)
1404 {
1405 struct si_perfcounters *pc = screen->perfcounters;
1406 unsigned i;
1407
1408 if (!pc)
1409 return;
1410
1411 for (i = 0; i < pc->num_blocks; ++i) {
1412 FREE(pc->blocks[i].group_names);
1413 FREE(pc->blocks[i].selector_names);
1414 }
1415 FREE(pc->blocks);
1416 FREE(pc);
1417 screen->perfcounters = NULL;
1418 }
1419
/* Allocate and initialize screen->perfcounters with the block list for
 * this chip generation.
 *
 * Does nothing on unsupported chips or allocation failure: perfcounters
 * stays NULL and the query entry points above report zero counters.
 */
void si_init_perfcounters(struct si_screen *screen)
{
   struct si_perfcounters *pc;
   const struct si_pc_block_gfxdescr *blocks;
   unsigned num_blocks;
   unsigned i;

   /* Pick the static block-description table for this generation. */
   switch (screen->info.chip_class) {
   case GFX7:
      blocks = groups_CIK;
      num_blocks = ARRAY_SIZE(groups_CIK);
      break;
   case GFX8:
      blocks = groups_VI;
      num_blocks = ARRAY_SIZE(groups_VI);
      break;
   case GFX9:
      blocks = groups_gfx9;
      num_blocks = ARRAY_SIZE(groups_gfx9);
      break;
   case GFX10:
   case GFX10_3:
      /* gfx10.3 reuses the gfx10 block list. */
      blocks = groups_gfx10;
      num_blocks = ARRAY_SIZE(groups_gfx10);
      break;
   case GFX6:
   default:
      return; /* not implemented */
   }

   screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
   if (!pc)
      return;

   /* Fixed CS-space estimates used when sizing suspend emissions. */
   pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
   pc->num_instance_cs_dwords = 3;

   /* Debug switches: expose per-SE / per-instance groups individually
    * instead of summing them. */
   pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
   pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);

   pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block));
   if (!pc->blocks)
      goto error;
   pc->num_blocks = num_blocks;

   for (i = 0; i < num_blocks; ++i) {
      struct si_pc_block *block = &pc->blocks[i];
      block->b = &blocks[i];
      block->num_instances = MAX2(1, block->b->instances);

      /* Override instance counts for blocks whose count depends on the
       * chip configuration (SE count, TCC count, CU count) rather than
       * the static table. */
      if (!strcmp(block->b->b->name, "CB") ||
          !strcmp(block->b->b->name, "DB") ||
          !strcmp(block->b->b->name, "RMI"))
         block->num_instances = screen->info.max_se;
      else if (!strcmp(block->b->b->name, "TCC"))
         block->num_instances = screen->info.num_tcc_blocks;
      else if (!strcmp(block->b->b->name, "IA"))
         block->num_instances = MAX2(1, screen->info.max_se / 2);
      else if (!strcmp(block->b->b->name, "TA") ||
               !strcmp(block->b->b->name, "TCP") ||
               !strcmp(block->b->b->name, "TD")) {
         block->num_instances = MAX2(1, screen->info.max_good_cu_per_sa);
      }

      /* Exposed group count = instances x SEs x shader types, depending
       * on block flags and the debug options above. */
      if (si_pc_block_has_per_instance_groups(pc, block)) {
         block->num_groups = block->num_instances;
      } else {
         block->num_groups = 1;
      }

      if (si_pc_block_has_per_se_groups(pc, block))
         block->num_groups *= screen->info.max_se;
      if (block->b->b->flags & SI_PC_BLOCK_SHADER)
         block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits);

      pc->num_groups += block->num_groups;
   }

   return;

error:
   si_destroy_perfcounters(screen);
}