radeonsi: re-order the SQ_xx performance counter blocks
[mesa.git] / src/gallium/drivers/radeonsi/si_perfcounter.c
/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *  Nicolai Hähnle <nicolai.haehnle@amd.com>
 *
 */

#include "radeon/r600_cs.h"
#include "radeon/r600_query.h"
#include "radeon/r600_pipe_common.h"
#include "util/u_memory.h"

#include "si_pipe.h"
#include "sid.h"

enum si_pc_reg_layout {
	/* All secondary selector dwords follow as one block after the primary
	 * selector dwords for the counters that have secondary selectors.
	 */
	SI_PC_MULTI_BLOCK = 0,

	/* Each secondary selector dword follows immediately after the
	 * corresponding primary.
	 */
	SI_PC_MULTI_ALTERNATE = 1,

	/* All secondary selector dwords follow as one block after all primary
	 * selector dwords.
	 */
	SI_PC_MULTI_TAIL = 2,

	/* Free-form arrangement of selector registers. */
	SI_PC_MULTI_CUSTOM = 3,

	SI_PC_MULTI_MASK = 3,

	/* Registers are laid out in decreasing rather than increasing order. */
	SI_PC_REG_REVERSE = 4,
};

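/* Illustrative note (not used by the driver): for a block with four counters,
 * of which the first two have secondary ("multi") selectors, and no prelude,
 * the selector registers are expected in the following order, where Pn is the
 * primary selector of counter n and Sn its secondary selector:
 *
 *   SI_PC_MULTI_BLOCK:      P0 P1 S0 S1 P2 P3
 *   SI_PC_MULTI_ALTERNATE:  P0 S0 P1 S1 P2 P3
 *   SI_PC_MULTI_TAIL:       P0 P1 P2 P3 S0 S1
 *
 * This mirrors the register sequences emitted by si_pc_emit_select() below.
 */
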
struct si_pc_block_base {
	const char *name;
	unsigned num_counters;
	unsigned flags;

	unsigned select_or;
	unsigned select0;
	unsigned counter0_lo;
	unsigned *select;
	unsigned *counters;
	unsigned num_multi;
	unsigned num_prelude;
	unsigned layout;
};

struct si_pc_block {
	struct si_pc_block_base *b;
	unsigned selectors;
	unsigned instances;
};

/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
 * performance counter group IDs.
 */
static const char * const si_pc_shader_type_suffixes[] = {
	"", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS"
};

static const unsigned si_pc_shader_type_bits[] = {
	0x7f,
	S_036780_ES_EN(1),
	S_036780_GS_EN(1),
	S_036780_VS_EN(1),
	S_036780_PS_EN(1),
	S_036780_LS_EN(1),
	S_036780_HS_EN(1),
	S_036780_CS_EN(1),
};

static struct si_pc_block_base cik_CB = {
	.name = "CB",
	.num_counters = 4,
	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_037000_CB_PERFCOUNTER_FILTER,
	.counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
	.num_multi = 1,
	.num_prelude = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static unsigned cik_CPC_select[] = {
	R_036024_CPC_PERFCOUNTER0_SELECT,
	R_036010_CPC_PERFCOUNTER0_SELECT1,
	R_03600C_CPC_PERFCOUNTER1_SELECT,
};
static struct si_pc_block_base cik_CPC = {
	.name = "CPC",
	.num_counters = 2,

	.select = cik_CPC_select,
	.counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPF = {
	.name = "CPF",
	.num_counters = 2,

	.select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPG = {
	.name = "CPG",
	.num_counters = 2,

	.select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_DB = {
	.name = "DB",
	.num_counters = 4,
	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_037100_DB_PERFCOUNTER0_SELECT,
	.counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
	.num_multi = 3, /* really only 2, but there's a gap between registers */
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_GDS = {
	.name = "GDS",
	.num_counters = 4,

	.select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_TAIL,
};

static unsigned cik_GRBM_counters[] = {
	R_034100_GRBM_PERFCOUNTER0_LO,
	R_03410C_GRBM_PERFCOUNTER1_LO,
};
static struct si_pc_block_base cik_GRBM = {
	.name = "GRBM",
	.num_counters = 2,

	.select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
	.counters = cik_GRBM_counters,
};

static struct si_pc_block_base cik_GRBMSE = {
	.name = "GRBMSE",
	.num_counters = 4,

	.select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
	.counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
};

static struct si_pc_block_base cik_IA = {
	.name = "IA",
	.num_counters = 4,

	.select0 = R_036210_IA_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_PA_SC = {
	.name = "PA_SC",
	.num_counters = 8,
	.flags = R600_PC_BLOCK_SE,

	.select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_PA_SU = {
	.name = "PA_SU",
	.num_counters = 4,
	.flags = R600_PC_BLOCK_SE,

	.select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_SPI = {
	.name = "SPI",
	.num_counters = 6,
	.flags = R600_PC_BLOCK_SE,

	.select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
	.num_multi = 4,
	.layout = SI_PC_MULTI_BLOCK,
};

static struct si_pc_block_base cik_SQ = {
	.name = "SQ",
	.num_counters = 16,
	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_SHADER,

	.select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
	.select_or = S_036700_SQC_BANK_MASK(15) |
		     S_036700_SQC_CLIENT_MASK(15) |
		     S_036700_SIMD_MASK(15),
	.counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
};

static struct si_pc_block_base cik_SX = {
	.name = "SX",
	.num_counters = 4,
	.flags = R600_PC_BLOCK_SE,

	.select0 = R_036900_SX_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_TA = {
	.name = "TA",
	.num_counters = 2,
	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,

	.select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TD = {
	.name = "TD",
	.num_counters = 2,
	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,

	.select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCA = {
	.name = "TCA",
	.num_counters = 4,
	.flags = R600_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCC = {
	.name = "TCC",
	.num_counters = 4,
	.flags = R600_PC_BLOCK_INSTANCE_GROUPS,

	.select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCP = {
	.name = "TCP",
	.num_counters = 4,
	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,

	.select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
	.num_multi = 2,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_VGT = {
	.name = "VGT",
	.num_counters = 4,
	.flags = R600_PC_BLOCK_SE,

	.select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
	.num_multi = 1,
	.layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_WD = {
	.name = "WD",
	.num_counters = 4,

	.select0 = R_036200_WD_PERFCOUNTER0_SELECT,
	.counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
};

/* Both the number of instances and the number of selectors vary between chips
 * of the same class. We only differentiate by class here and simply expose the
 * maximum number over all chips in a class.
 *
 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
 * blindly once it believes it has identified the hardware, so the order of
 * blocks here matters.
 */
static struct si_pc_block groups_CIK[] = {
	{ &cik_CB, 226, 4 },
	{ &cik_CPF, 17 },
	{ &cik_DB, 257, 4 },
	{ &cik_GRBM, 34 },
	{ &cik_GRBMSE, 15 },
	{ &cik_PA_SU, 153 },
	{ &cik_PA_SC, 395 },
	{ &cik_SPI, 186 },
	{ &cik_SQ, 252 },
	{ &cik_SX, 32 },
	{ &cik_TA, 111, 11 },
	{ &cik_TCA, 39, 2 },
	{ &cik_TCC, 160, 16 },
	{ &cik_TD, 55, 11 },
	{ &cik_TCP, 154, 11 },
	{ &cik_GDS, 121 },
	{ &cik_VGT, 140 },
	{ &cik_IA, 22 },
	{ &cik_WD, 22 },
	{ &cik_CPG, 46 },
	{ &cik_CPC, 22 },
};

static struct si_pc_block groups_VI[] = {
	{ &cik_CB, 396, 4 },
	{ &cik_CPF, 19 },
	{ &cik_DB, 257, 4 },
	{ &cik_GRBM, 34 },
	{ &cik_GRBMSE, 15 },
	{ &cik_PA_SU, 153 },
	{ &cik_PA_SC, 397 },
	{ &cik_SPI, 197 },
	{ &cik_SQ, 273 },
	{ &cik_SX, 34 },
	{ &cik_TA, 119, 16 },
	{ &cik_TCA, 35, 2 },
	{ &cik_TCC, 192, 16 },
	{ &cik_TD, 55, 16 },
	{ &cik_TCP, 180, 16 },
	{ &cik_GDS, 121 },
	{ &cik_VGT, 147 },
	{ &cik_IA, 24 },
	{ &cik_WD, 37 },
	{ &cik_CPG, 48 },
	{ &cik_CPC, 24 },
};

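/* Illustrative note: each entry above names the block description, the number
 * of selectable events, and (where it matters) the number of instances; e.g.
 * { &cik_TCC, 192, 16 } exposes 192 TCC events across 16 instances on VI.
 *
 * A minimal sketch of how such a table could be searched by block name; the
 * helper itself is hypothetical and not referenced by the driver:
 */
static inline struct si_pc_block *
si_pc_find_block_by_name(struct si_pc_block *blocks, unsigned num_blocks,
			 const char *name)
{
	unsigned i;

	for (i = 0; i < num_blocks; ++i) {
		/* Block names ("CB", "SQ", ...) come from si_pc_block_base. */
		if (!strcmp(blocks[i].b->name, name))
			return &blocks[i];
	}
	return NULL;
}
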
static void si_pc_get_size(struct r600_perfcounter_block *group,
			   unsigned count, unsigned *selectors,
			   unsigned *num_select_dw, unsigned *num_read_dw)
{
	struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
	struct si_pc_block_base *regs = sigroup->b;
	unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;

	if (layout_multi == SI_PC_MULTI_BLOCK) {
		if (count < regs->num_multi)
			*num_select_dw = 2 * (count + 2) + regs->num_prelude;
		else
			*num_select_dw = 2 + count + regs->num_multi + regs->num_prelude;
	} else if (layout_multi == SI_PC_MULTI_TAIL) {
		*num_select_dw = 4 + count + MIN2(count, regs->num_multi) + regs->num_prelude;
	} else if (layout_multi == SI_PC_MULTI_CUSTOM) {
		assert(regs->num_prelude == 0);
		*num_select_dw = 3 * (count + MIN2(count, regs->num_multi));
	} else {
		assert(layout_multi == SI_PC_MULTI_ALTERNATE);

		*num_select_dw = 2 + count + MIN2(count, regs->num_multi) + regs->num_prelude;
	}

	*num_read_dw = 6 * count;
}

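/* Worked example (illustrative): for cik_TCC (SI_PC_MULTI_ALTERNATE,
 * num_multi = 2, no prelude) with count = 3 enabled counters,
 * *num_select_dw = 2 + 3 + MIN2(3, 2) + 0 = 7: one SET_UCONFIG_REG packet
 * header (2 dwords) plus five selector dwords. The constants 2, 3 and 4 in
 * the other cases likewise account for SET_UCONFIG_REG packet headers.
 * *num_read_dw = 6 * 3 = 18, since si_pc_emit_read() emits one 6-dword
 * COPY_DATA packet per counter.
 */
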
static void si_pc_emit_instance(struct r600_common_context *ctx,
				int se, int instance)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;
	unsigned value = S_030800_SH_BROADCAST_WRITES(1);

	if (se >= 0) {
		value |= S_030800_SE_INDEX(se);
	} else {
		value |= S_030800_SE_BROADCAST_WRITES(1);
	}

	if (instance >= 0) {
		value |= S_030800_INSTANCE_INDEX(instance);
	} else {
		value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
	}

	radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
}

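/* Usage note (illustrative): si_pc_emit_instance(ctx, 1, 3) points
 * GRBM_GFX_INDEX at instance 3 of shader engine 1, so that subsequent
 * selector writes and counter reads target only that instance, while
 * si_pc_emit_instance(ctx, -1, -1) restores broadcast to all SEs and
 * instances. SH_BROADCAST_WRITES is always kept set.
 */
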
static void si_pc_emit_shaders(struct r600_common_context *ctx,
			       unsigned shaders)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;

	radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
	radeon_emit(cs, shaders & 0x7f);
	radeon_emit(cs, 0xffffffff);
}

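/* Usage note (illustrative): the mask comes from si_pc_shader_type_bits above;
 * si_pc_emit_shaders(ctx, si_pc_shader_type_bits[0]) (= 0x7f) counts work from
 * all shader stages, while e.g. S_036780_PS_EN(1) restricts the SQ counters to
 * pixel-shader work.
 */
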
static void si_pc_emit_select(struct r600_common_context *ctx,
			      struct r600_perfcounter_block *group,
			      unsigned count, unsigned *selectors)
{
	struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
	struct si_pc_block_base *regs = sigroup->b;
	struct radeon_winsys_cs *cs = ctx->gfx.cs;
	unsigned idx;
	unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
	unsigned dw;

	assert(count <= regs->num_counters);

	if (layout_multi == SI_PC_MULTI_BLOCK) {
		assert(!(regs->layout & SI_PC_REG_REVERSE));

		dw = count + regs->num_prelude;
		if (count >= regs->num_multi)
			dw += regs->num_multi;
		radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
		for (idx = 0; idx < regs->num_prelude; ++idx)
			radeon_emit(cs, 0);
		for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
			radeon_emit(cs, selectors[idx] | regs->select_or);

		if (count < regs->num_multi) {
			unsigned select1 =
				regs->select0 + 4 * regs->num_multi;
			radeon_set_uconfig_reg_seq(cs, select1, count);
		}

		for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
			radeon_emit(cs, 0);

		if (count > regs->num_multi) {
			for (idx = regs->num_multi; idx < count; ++idx)
				radeon_emit(cs, selectors[idx] | regs->select_or);
		}
	} else if (layout_multi == SI_PC_MULTI_TAIL) {
		unsigned select1, select1_count;

		assert(!(regs->layout & SI_PC_REG_REVERSE));

		radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
		for (idx = 0; idx < regs->num_prelude; ++idx)
			radeon_emit(cs, 0);
		for (idx = 0; idx < count; ++idx)
			radeon_emit(cs, selectors[idx] | regs->select_or);

		select1 = regs->select0 + 4 * regs->num_counters;
		select1_count = MIN2(count, regs->num_multi);
		radeon_set_uconfig_reg_seq(cs, select1, select1_count);
		for (idx = 0; idx < select1_count; ++idx)
			radeon_emit(cs, 0);
	} else if (layout_multi == SI_PC_MULTI_CUSTOM) {
		unsigned *reg = regs->select;
		for (idx = 0; idx < count; ++idx) {
			radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
			if (idx < regs->num_multi)
				radeon_set_uconfig_reg(cs, *reg++, 0);
		}
	} else {
		assert(layout_multi == SI_PC_MULTI_ALTERNATE);

		unsigned reg_base = regs->select0;
		unsigned reg_count = count + MIN2(count, regs->num_multi);
		reg_count += regs->num_prelude;

		if (!(regs->layout & SI_PC_REG_REVERSE)) {
			radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

			for (idx = 0; idx < regs->num_prelude; ++idx)
				radeon_emit(cs, 0);
			for (idx = 0; idx < count; ++idx) {
				radeon_emit(cs, selectors[idx] | regs->select_or);
				if (idx < regs->num_multi)
					radeon_emit(cs, 0);
			}
		} else {
			reg_base -= (reg_count - 1) * 4;
			radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

			for (idx = count; idx > 0; --idx) {
				if (idx <= regs->num_multi)
					radeon_emit(cs, 0);
				radeon_emit(cs, selectors[idx - 1] | regs->select_or);
			}
			for (idx = 0; idx < regs->num_prelude; ++idx)
				radeon_emit(cs, 0);
		}
	}
}

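/* Worked example (illustrative): for an SI_PC_MULTI_ALTERNATE block without
 * prelude and with num_multi = 1, such as cik_TA, selecting count = 2 events
 * {a, b} emits a single SET_UCONFIG_REG sequence starting at select0 with
 * three values: a | select_or, 0 (the secondary selector of counter 0), and
 * b | select_or; 2 + 3 = 5 dwords in total, matching si_pc_get_size().
 */
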
static void si_pc_emit_start(struct r600_common_context *ctx,
			     struct r600_resource *buffer, uint64_t va)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;

	radeon_add_to_buffer_list(ctx, &ctx->gfx, buffer,
				  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);

	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
			COPY_DATA_DST_SEL(COPY_DATA_MEM));
	radeon_emit(cs, 1); /* immediate */
	radeon_emit(cs, 0); /* unused */
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);

	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
			       S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_START) | EVENT_INDEX(0));
	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
			       S_036020_PERFMON_STATE(V_036020_START_COUNTING));
}

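/* Note on the start/stop handshake (illustrative): si_pc_emit_start() writes
 * the immediate value 1 to the fence dword at va. The EOP event(s) in
 * si_pc_emit_stop() later overwrite that dword with 0 once the measured work
 * has drained, and the WAIT_REG_MEM packet stalls the CP until that 0 is
 * visible, so the PERFCOUNTER_SAMPLE/STOP events and the final
 * CP_PERFMON_CNTL write only execute after counting has actually finished.
 */
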
/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again here. */
static void si_pc_emit_stop(struct r600_common_context *ctx,
			    struct r600_resource *buffer, uint64_t va)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;

	if (ctx->screen->chip_class == CIK) {
		/* Workaround for cache flush problems: send two EOP events. */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) |
				EVENT_INDEX(5));
		radeon_emit(cs, va);
		radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
		radeon_emit(cs, 0); /* immediate data */
		radeon_emit(cs, 0); /* unused */
	}

	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) |
			EVENT_INDEX(5));
	radeon_emit(cs, va);
	radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
	radeon_emit(cs, 0); /* immediate data */
	radeon_emit(cs, 0); /* unused */

	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
	radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit(cs, 0); /* reference value */
	radeon_emit(cs, 0xffffffff); /* mask */
	radeon_emit(cs, 4); /* poll interval */

	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_STOP) | EVENT_INDEX(0));
	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
			       S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
			       S_036020_PERFMON_SAMPLE_ENABLE(1));
}

static void si_pc_emit_read(struct r600_common_context *ctx,
			    struct r600_perfcounter_block *group,
			    unsigned count, unsigned *selectors,
			    struct r600_resource *buffer, uint64_t va)
{
	struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
	struct si_pc_block_base *regs = sigroup->b;
	struct radeon_winsys_cs *cs = ctx->gfx.cs;
	unsigned idx;
	unsigned reg = regs->counter0_lo;
	unsigned reg_delta = 8;

	if (regs->layout & SI_PC_REG_REVERSE)
		reg_delta = -reg_delta;

	for (idx = 0; idx < count; ++idx) {
		if (regs->counters)
			reg = regs->counters[idx];

		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
				COPY_DATA_DST_SEL(COPY_DATA_MEM));
		radeon_emit(cs, reg >> 2);
		radeon_emit(cs, 0); /* unused */
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		va += 4;
		reg += reg_delta;
	}
}

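/* Result layout note (illustrative): each enabled counter is read with one
 * COPY_DATA packet that copies the low 32 bits of the counter register
 * (counter0_lo plus 8 bytes per counter, i.e. skipping the _HI halves) into
 * the buffer, so consecutive counters of a block land in consecutive dwords
 * starting at va. Accumulation across instances and shader engines is left to
 * the higher-level query code.
 */
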
static void si_pc_cleanup(struct r600_common_screen *rscreen)
{
	r600_perfcounters_do_destroy(rscreen->perfcounters);
	rscreen->perfcounters = NULL;
}

void si_init_perfcounters(struct si_screen *screen)
{
	struct r600_perfcounters *pc;
	struct si_pc_block *blocks;
	unsigned num_blocks;
	unsigned i;

	switch (screen->b.chip_class) {
	case CIK:
		blocks = groups_CIK;
		num_blocks = ARRAY_SIZE(groups_CIK);
		break;
	case VI:
		blocks = groups_VI;
		num_blocks = ARRAY_SIZE(groups_VI);
		break;
	case SI:
	default:
		return; /* not implemented */
	}

	if (screen->b.info.max_sh_per_se != 1) {
		/* This should not happen on non-SI chips. */
		fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
			"supported (inaccurate performance counters)\n",
			screen->b.info.max_sh_per_se);
	}

	pc = CALLOC_STRUCT(r600_perfcounters);
	if (!pc)
		return;

	pc->num_start_cs_dwords = 14;
	pc->num_stop_cs_dwords = 20;
	pc->num_instance_cs_dwords = 3;
	pc->num_shaders_cs_dwords = 4;

	if (screen->b.chip_class == CIK) {
		pc->num_stop_cs_dwords += 6;
	}

	pc->num_shader_types = ARRAY_SIZE(si_pc_shader_type_bits);
	pc->shader_type_suffixes = si_pc_shader_type_suffixes;
	pc->shader_type_bits = si_pc_shader_type_bits;

	pc->get_size = si_pc_get_size;
	pc->emit_instance = si_pc_emit_instance;
	pc->emit_shaders = si_pc_emit_shaders;
	pc->emit_select = si_pc_emit_select;
	pc->emit_start = si_pc_emit_start;
	pc->emit_stop = si_pc_emit_stop;
	pc->emit_read = si_pc_emit_read;
	pc->cleanup = si_pc_cleanup;

	if (!r600_perfcounters_init(pc, num_blocks))
		goto error;

	for (i = 0; i < num_blocks; ++i) {
		struct si_pc_block *block = &blocks[i];
		unsigned instances = block->instances;

		if (!strcmp(block->b->name, "IA")) {
			if (screen->b.info.max_se > 2)
				instances = 2;
		}

		r600_perfcounters_add_block(&screen->b, pc,
					    block->b->name,
					    block->b->flags,
					    block->b->num_counters,
					    block->selectors,
					    instances,
					    block);
	}

	screen->b.perfcounters = pc;
	return;

error:
	r600_perfcounters_do_destroy(pc);
}