radeonsi: implement AMD_performance_monitor for CIK+
[mesa.git] / src / gallium / drivers / radeonsi / si_perfcounter.c
1 /*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Nicolai Hähnle <nicolai.haehnle@amd.com>
25 *
26 */
27
28 #include "radeon/r600_cs.h"
29 #include "radeon/r600_query.h"
30 #include "radeon/r600_pipe_common.h"
31 #include "util/u_memory.h"
32
33 #include "si_pipe.h"
34 #include "sid.h"
35
/* Describes how the selector registers of a block are arranged.
 *
 * The low two bits (SI_PC_MULTI_MASK) choose one of four arrangements of the
 * secondary selectors; SI_PC_REG_REVERSE is an independent flag bit that can
 * be ORed on top.
 */
enum si_pc_reg_layout {
	/* For the counters that have secondary selectors: their primary
	 * selector dwords come first, followed by all of their secondary
	 * selector dwords as a single block.
	 */
	SI_PC_MULTI_BLOCK = 0x0,

	/* Every secondary selector dword comes immediately after its
	 * corresponding primary.
	 */
	SI_PC_MULTI_ALTERNATE = 0x1,

	/* The primary selector dwords of all counters come first; the
	 * secondary selector dwords follow them as a single block.
	 */
	SI_PC_MULTI_TAIL = 0x2,

	/* Selector registers follow no fixed pattern. */
	SI_PC_MULTI_CUSTOM = 0x3,

	/* Mask that extracts the SI_PC_MULTI_* arrangement. */
	SI_PC_MULTI_MASK = 0x3,

	/* Register addresses decrease instead of increase. */
	SI_PC_REG_REVERSE = 1 << 2,
};
60
/* Static register description of one performance counter block. */
struct si_pc_block_base {
	/* Block name handed to r600_perfcounters_add_block(). */
	const char *name;
	/* Upper bound on the number of counters selected at once. */
	unsigned num_counters;
	/* R600_PC_BLOCK_* flags handed to r600_perfcounters_add_block(). */
	unsigned flags;

	/* Bits ORed into every primary selector value that is emitted. */
	unsigned select_or;
	/* Base selector register for all layouts except SI_PC_MULTI_CUSTOM. */
	unsigned select0;
	/* First counter register to read; later counters are 8 bytes apart
	 * unless 'counters' is set.
	 */
	unsigned counter0_lo;
	/* Explicit selector register list, used by SI_PC_MULTI_CUSTOM. */
	unsigned *select;
	/* Optional explicit list of counter registers, one per counter. */
	unsigned *counters;
	/* Number of leading counters that also get a secondary selector. */
	unsigned num_multi;
	/* Number of extra zero dwords written next to the selectors. */
	unsigned num_prelude;
	/* Combination of enum si_pc_reg_layout values. */
	unsigned layout;
};
75
/* One entry of a per-chip-class block table.
 *
 * The member order matters to any table that fills this struct with
 * positional initializers.
 */
struct si_pc_block {
	/* Shared register description of the block. */
	struct si_pc_block_base *b;
	/* Selector count handed to r600_perfcounters_add_block(). */
	unsigned selectors;
	/* Instance count handed to r600_perfcounters_add_block(). */
	unsigned instances;
};
81
82
83 static struct si_pc_block_base cik_CB = {
84 .name = "CB",
85 .num_counters = 4,
86 .flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS,
87
88 .select0 = R_037000_CB_PERFCOUNTER_FILTER,
89 .counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
90 .num_multi = 1,
91 .num_prelude = 1,
92 .layout = SI_PC_MULTI_ALTERNATE,
93 };
94
95 static unsigned cik_CPC_select[] = {
96 R_036024_CPC_PERFCOUNTER0_SELECT,
97 R_036010_CPC_PERFCOUNTER0_SELECT1,
98 R_03600C_CPC_PERFCOUNTER1_SELECT,
99 };
100 static struct si_pc_block_base cik_CPC = {
101 .name = "CPC",
102 .num_counters = 2,
103
104 .select = cik_CPC_select,
105 .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
106 .num_multi = 1,
107 .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
108 };
109
110 static struct si_pc_block_base cik_CPF = {
111 .name = "CPF",
112 .num_counters = 2,
113
114 .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
115 .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
116 .num_multi = 1,
117 .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
118 };
119
120 static struct si_pc_block_base cik_CPG = {
121 .name = "CPG",
122 .num_counters = 2,
123
124 .select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
125 .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
126 .num_multi = 1,
127 .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
128 };
129
130 static struct si_pc_block_base cik_DB = {
131 .name = "DB",
132 .num_counters = 4,
133 .flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS,
134
135 .select0 = R_037100_DB_PERFCOUNTER0_SELECT,
136 .counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
137 .num_multi = 3, // really only 2, but there's a gap between registers
138 .layout = SI_PC_MULTI_ALTERNATE,
139 };
140
141 static struct si_pc_block_base cik_GDS = {
142 .name = "GDS",
143 .num_counters = 4,
144
145 .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
146 .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
147 .num_multi = 1,
148 .layout = SI_PC_MULTI_TAIL,
149 };
150
151 static unsigned cik_GRBM_counters[] = {
152 R_034100_GRBM_PERFCOUNTER0_LO,
153 R_03410C_GRBM_PERFCOUNTER1_LO,
154 };
155 static struct si_pc_block_base cik_GRBM = {
156 .name = "GRBM",
157 .num_counters = 2,
158
159 .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
160 .counters = cik_GRBM_counters,
161 };
162
163 static struct si_pc_block_base cik_GRBMSE = {
164 .name = "GRBMSE",
165 .num_counters = 4,
166
167 .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
168 .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
169 };
170
171 static struct si_pc_block_base cik_IA = {
172 .name = "IA",
173 .num_counters = 4,
174
175 .select0 = R_036210_IA_PERFCOUNTER0_SELECT,
176 .counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
177 .num_multi = 1,
178 .layout = SI_PC_MULTI_TAIL,
179 };
180
181 static struct si_pc_block_base cik_PA_SC = {
182 .name = "PA_SC",
183 .num_counters = 8,
184 .flags = R600_PC_BLOCK_SE,
185
186 .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
187 .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
188 .num_multi = 1,
189 .layout = SI_PC_MULTI_ALTERNATE,
190 };
191
192 static struct si_pc_block_base cik_PA_SU = {
193 .name = "PA_SU",
194 .num_counters = 4,
195 .flags = R600_PC_BLOCK_SE,
196
197 .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
198 .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
199 .num_multi = 2,
200 .layout = SI_PC_MULTI_ALTERNATE,
201 };
202
203 static struct si_pc_block_base cik_SPI = {
204 .name = "SPI",
205 .num_counters = 6,
206 .flags = R600_PC_BLOCK_SE,
207
208 .select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
209 .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
210 .num_multi = 4,
211 .layout = SI_PC_MULTI_BLOCK,
212 };
213
214 static struct si_pc_block_base cik_SQ = {
215 .name = "SQ",
216 .num_counters = 16,
217 .flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_SHADER,
218
219 .select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
220 .select_or = S_036700_SQC_BANK_MASK(15) |
221 S_036700_SQC_CLIENT_MASK(15) |
222 S_036700_SIMD_MASK(15),
223 .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
224 };
225
226 static struct si_pc_block_base cik_SX = {
227 .name = "SX",
228 .num_counters = 4,
229 .flags = R600_PC_BLOCK_SE,
230
231 .select0 = R_036900_SX_PERFCOUNTER0_SELECT,
232 .counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
233 .num_multi = 2,
234 .layout = SI_PC_MULTI_TAIL,
235 };
236
237 static struct si_pc_block_base cik_TA = {
238 .name = "TA",
239 .num_counters = 2,
240 .flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,
241
242 .select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
243 .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
244 .num_multi = 1,
245 .layout = SI_PC_MULTI_ALTERNATE,
246 };
247
248 static struct si_pc_block_base cik_TD = {
249 .name = "TD",
250 .num_counters = 2,
251 .flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,
252
253 .select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
254 .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
255 .num_multi = 1,
256 .layout = SI_PC_MULTI_ALTERNATE,
257 };
258
259 static struct si_pc_block_base cik_TCA = {
260 .name = "TCA",
261 .num_counters = 4,
262 .flags = R600_PC_BLOCK_INSTANCE_GROUPS,
263
264 .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
265 .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
266 .num_multi = 2,
267 .layout = SI_PC_MULTI_ALTERNATE,
268 };
269
270 static struct si_pc_block_base cik_TCC = {
271 .name = "TCC",
272 .num_counters = 4,
273 .flags = R600_PC_BLOCK_INSTANCE_GROUPS,
274
275 .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
276 .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
277 .num_multi = 2,
278 .layout = SI_PC_MULTI_ALTERNATE,
279 };
280
281 static struct si_pc_block_base cik_TCP = {
282 .name = "TCP",
283 .num_counters = 4,
284 .flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,
285
286 .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
287 .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
288 .num_multi = 2,
289 .layout = SI_PC_MULTI_ALTERNATE,
290 };
291
292 static struct si_pc_block_base cik_VGT = {
293 .name = "VGT",
294 .num_counters = 4,
295 .flags = R600_PC_BLOCK_SE,
296
297 .select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
298 .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
299 .num_multi = 1,
300 .layout = SI_PC_MULTI_TAIL,
301 };
302
303 static struct si_pc_block_base cik_WD = {
304 .name = "WD",
305 .num_counters = 4,
306
307 .select0 = R_036200_WD_PERFCOUNTER0_SELECT,
308 .counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
309 };
310
311 /* Both the number of instances and selectors varies between chips of the same
312 * class. We only differentiate by class here and simply expose the maximum
313 * number over all chips in a class.
314 */
315 static struct si_pc_block groups_CIK[] = {
316 { &cik_CB, 226, 4 },
317 { &cik_CPC, 22 },
318 { &cik_CPF, 17 },
319 { &cik_CPG, 46 },
320 { &cik_DB, 257, 4 },
321 { &cik_GDS, 121 },
322 { &cik_GRBM, 34 },
323 { &cik_GRBMSE, 15 },
324 { &cik_IA, 22 },
325 { &cik_PA_SC, 395 },
326 { &cik_PA_SU, 153 },
327 { &cik_SPI, 186 },
328 { &cik_SQ, 252 },
329 { &cik_SX, 32 },
330 { &cik_TA, 111, 11 },
331 { &cik_TCA, 39, 2 },
332 { &cik_TCC, 160, 16 },
333 { &cik_TCP, 154, 11 },
334 { &cik_TD, 55, 11 },
335 { &cik_VGT, 140 },
336 { &cik_WD, 22 },
337 };
338
339 static struct si_pc_block groups_VI[] = {
340 { &cik_CB, 396, 4 },
341 { &cik_CPC, 24 },
342 { &cik_CPF, 19 },
343 { &cik_CPG, 48 },
344 { &cik_DB, 257, 4 },
345 { &cik_GDS, 121 },
346 { &cik_GRBM, 34 },
347 { &cik_GRBMSE, 15 },
348 { &cik_IA, 24 },
349 { &cik_PA_SC, 397 },
350 { &cik_PA_SU, 153 },
351 { &cik_SPI, 197 },
352 { &cik_SQ, 273 },
353 { &cik_SX, 34 },
354 { &cik_TA, 119, 16 },
355 { &cik_TCA, 35, 2 },
356 { &cik_TCC, 192, 16 },
357 { &cik_TCP, 180, 16 },
358 { &cik_TD, 55, 16 },
359 { &cik_VGT, 147 },
360 { &cik_WD, 37 },
361 };
362
363 static void si_pc_get_size(struct r600_perfcounter_block *group,
364 unsigned count, unsigned *selectors,
365 unsigned *num_select_dw, unsigned *num_read_dw)
366 {
367 struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
368 struct si_pc_block_base *regs = sigroup->b;
369 unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
370
371 if (layout_multi == SI_PC_MULTI_BLOCK) {
372 if (count < regs->num_multi)
373 *num_select_dw = 2 * (count + 2) + regs->num_prelude;
374 else
375 *num_select_dw = 2 + count + regs->num_multi + regs->num_prelude;
376 } else if (layout_multi == SI_PC_MULTI_TAIL) {
377 *num_select_dw = 4 + count + MIN2(count, regs->num_multi) + regs->num_prelude;
378 } else if (layout_multi == SI_PC_MULTI_CUSTOM) {
379 assert(regs->num_prelude == 0);
380 *num_select_dw = 3 * (count + MIN2(count, regs->num_multi));
381 } else {
382 assert(layout_multi == SI_PC_MULTI_ALTERNATE);
383
384 *num_select_dw = 2 + count + MIN2(count, regs->num_multi) + regs->num_prelude;
385 }
386
387 *num_read_dw = 6 * count;
388 }
389
390 static void si_pc_emit_instance(struct r600_common_context *ctx,
391 int se, int instance)
392 {
393 struct radeon_winsys_cs *cs = ctx->gfx.cs;
394 unsigned value = S_030800_SH_BROADCAST_WRITES(1);
395
396 if (se >= 0) {
397 value |= S_030800_SE_INDEX(se);
398 } else {
399 value |= S_030800_SE_BROADCAST_WRITES(1);
400 }
401
402 if (instance >= 0) {
403 value |= S_030800_INSTANCE_INDEX(instance);
404 } else {
405 value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
406 }
407
408 radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
409 }
410
411 static void si_pc_emit_shaders(struct r600_common_context *ctx,
412 unsigned shaders)
413 {
414 struct radeon_winsys_cs *cs = ctx->gfx.cs;
415
416 radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
417 radeon_emit(cs, shaders & 0x7f);
418 radeon_emit(cs, 0xffffffff);
419 }
420
421 static void si_pc_emit_select(struct r600_common_context *ctx,
422 struct r600_perfcounter_block *group,
423 unsigned count, unsigned *selectors)
424 {
425 struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
426 struct si_pc_block_base *regs = sigroup->b;
427 struct radeon_winsys_cs *cs = ctx->gfx.cs;
428 unsigned idx;
429 unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
430 unsigned dw;
431
432 assert(count <= regs->num_counters);
433
434 if (layout_multi == SI_PC_MULTI_BLOCK) {
435 assert(!(regs->layout & SI_PC_REG_REVERSE));
436
437 dw = count + regs->num_prelude;
438 if (count >= regs->num_multi)
439 count += regs->num_multi;
440 radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
441 for (idx = 0; idx < regs->num_prelude; ++idx)
442 radeon_emit(cs, 0);
443 for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
444 radeon_emit(cs, selectors[idx] | regs->select_or);
445
446 if (count < regs->num_multi) {
447 unsigned select1 =
448 regs->select0 + 4 * regs->num_multi;
449 radeon_set_uconfig_reg_seq(cs, select1, count);
450 }
451
452 for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
453 radeon_emit(cs, 0);
454
455 if (count > regs->num_multi) {
456 for (idx = regs->num_multi; idx < count; ++idx)
457 radeon_emit(cs, selectors[idx] | regs->select_or);
458 }
459 } else if (layout_multi == SI_PC_MULTI_TAIL) {
460 unsigned select1, select1_count;
461
462 assert(!(regs->layout & SI_PC_REG_REVERSE));
463
464 radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
465 for (idx = 0; idx < regs->num_prelude; ++idx)
466 radeon_emit(cs, 0);
467 for (idx = 0; idx < count; ++idx)
468 radeon_emit(cs, selectors[idx] | regs->select_or);
469
470 select1 = regs->select0 + 4 * regs->num_counters;
471 select1_count = MIN2(count, regs->num_multi);
472 radeon_set_uconfig_reg_seq(cs, select1, select1_count);
473 for (idx = 0; idx < select1_count; ++idx)
474 radeon_emit(cs, 0);
475 } else if (layout_multi == SI_PC_MULTI_CUSTOM) {
476 unsigned *reg = regs->select;
477 for (idx = 0; idx < count; ++idx) {
478 radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
479 if (idx < regs->num_multi)
480 radeon_set_uconfig_reg(cs, *reg++, 0);
481 }
482 } else {
483 assert(layout_multi == SI_PC_MULTI_ALTERNATE);
484
485 unsigned reg_base = regs->select0;
486 unsigned reg_count = count + MIN2(count, regs->num_multi);
487 reg_count += regs->num_prelude;
488
489 if (!(regs->layout & SI_PC_REG_REVERSE)) {
490 radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
491
492 for (idx = 0; idx < regs->num_prelude; ++idx)
493 radeon_emit(cs, 0);
494 for (idx = 0; idx < count; ++idx) {
495 radeon_emit(cs, selectors[idx] | regs->select_or);
496 if (idx < regs->num_multi)
497 radeon_emit(cs, 0);
498 }
499 } else {
500 reg_base -= (reg_count - 1) * 4;
501 radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
502
503 for (idx = count; idx > 0; --idx) {
504 if (idx <= regs->num_multi)
505 radeon_emit(cs, 0);
506 radeon_emit(cs, selectors[idx - 1] | regs->select_or);
507 }
508 for (idx = 0; idx < regs->num_prelude; ++idx)
509 radeon_emit(cs, 0);
510 }
511 }
512 }
513
514 static void si_pc_emit_start(struct r600_common_context *ctx,
515 struct r600_resource *buffer, uint64_t va)
516 {
517 struct radeon_winsys_cs *cs = ctx->gfx.cs;
518
519 radeon_add_to_buffer_list(ctx, &ctx->gfx, buffer,
520 RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
521
522 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
523 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
524 COPY_DATA_DST_SEL(COPY_DATA_MEM));
525 radeon_emit(cs, 1); /* immediate */
526 radeon_emit(cs, 0); /* unused */
527 radeon_emit(cs, va);
528 radeon_emit(cs, va >> 32);
529
530 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
531 S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
532 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
533 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_START) | EVENT_INDEX(0));
534 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
535 S_036020_PERFMON_STATE(V_036020_START_COUNTING));
536 }
537
538 /* Note: The buffer was already added in si_pc_emit_start, so we don't have to
539 * do it again in here. */
540 static void si_pc_emit_stop(struct r600_common_context *ctx,
541 struct r600_resource *buffer, uint64_t va)
542 {
543 struct radeon_winsys_cs *cs = ctx->gfx.cs;
544
545 if (ctx->screen->chip_class == CIK) {
546 /* Workaround for cache flush problems: send two EOP events. */
547 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
548 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) |
549 EVENT_INDEX(5));
550 radeon_emit(cs, va);
551 radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
552 radeon_emit(cs, 0); /* immediate data */
553 radeon_emit(cs, 0); /* unused */
554 }
555
556 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
557 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) |
558 EVENT_INDEX(5));
559 radeon_emit(cs, va);
560 radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
561 radeon_emit(cs, 0); /* immediate data */
562 radeon_emit(cs, 0); /* unused */
563
564 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
565 radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
566 radeon_emit(cs, va);
567 radeon_emit(cs, va >> 32);
568 radeon_emit(cs, 0); /* reference value */
569 radeon_emit(cs, 0xffffffff); /* mask */
570 radeon_emit(cs, 4); /* poll interval */
571
572 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
573 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
574 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
575 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_STOP) | EVENT_INDEX(0));
576 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
577 S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
578 S_036020_PERFMON_SAMPLE_ENABLE(1));
579 }
580
581 static void si_pc_emit_read(struct r600_common_context *ctx,
582 struct r600_perfcounter_block *group,
583 unsigned count, unsigned *selectors,
584 struct r600_resource *buffer, uint64_t va)
585 {
586 struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
587 struct si_pc_block_base *regs = sigroup->b;
588 struct radeon_winsys_cs *cs = ctx->gfx.cs;
589 unsigned idx;
590 unsigned reg = regs->counter0_lo;
591 unsigned reg_delta = 8;
592
593 if (regs->layout & SI_PC_REG_REVERSE)
594 reg_delta = -reg_delta;
595
596 for (idx = 0; idx < count; ++idx) {
597 if (regs->counters)
598 reg = regs->counters[idx];
599
600 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
601 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
602 COPY_DATA_DST_SEL(COPY_DATA_MEM));
603 radeon_emit(cs, reg >> 2);
604 radeon_emit(cs, 0); /* unused */
605 radeon_emit(cs, va);
606 radeon_emit(cs, va >> 32);
607 va += 4;
608 reg += reg_delta;
609 }
610 }
611
612 static void si_pc_cleanup(struct r600_common_screen *rscreen)
613 {
614 r600_perfcounters_do_destroy(rscreen->perfcounters);
615 rscreen->perfcounters = NULL;
616 }
617
618 void si_init_perfcounters(struct si_screen *screen)
619 {
620 struct r600_perfcounters *pc;
621 struct si_pc_block *blocks;
622 unsigned num_blocks;
623 unsigned i;
624
625 switch (screen->b.chip_class) {
626 case CIK:
627 blocks = groups_CIK;
628 num_blocks = ARRAY_SIZE(groups_CIK);
629 break;
630 case VI:
631 blocks = groups_VI;
632 num_blocks = ARRAY_SIZE(groups_VI);
633 break;
634 case SI:
635 default:
636 return; /* not implemented */
637 }
638
639 if (screen->b.info.max_sh_per_se != 1) {
640 /* This should not happen on non-SI chips. */
641 fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
642 "supported (inaccurate performance counters)\n",
643 screen->b.info.max_sh_per_se);
644 }
645
646 pc = CALLOC_STRUCT(r600_perfcounters);
647 if (!pc)
648 return;
649
650 pc->num_start_cs_dwords = 14;
651 pc->num_stop_cs_dwords = 20;
652 pc->num_instance_cs_dwords = 3;
653 pc->num_shaders_cs_dwords = 4;
654
655 if (screen->b.chip_class == CIK) {
656 pc->num_stop_cs_dwords += 6;
657 }
658
659 pc->get_size = si_pc_get_size;
660 pc->emit_instance = si_pc_emit_instance;
661 pc->emit_shaders = si_pc_emit_shaders;
662 pc->emit_select = si_pc_emit_select;
663 pc->emit_start = si_pc_emit_start;
664 pc->emit_stop = si_pc_emit_stop;
665 pc->emit_read = si_pc_emit_read;
666 pc->cleanup = si_pc_cleanup;
667
668 if (!r600_perfcounters_init(pc, num_blocks))
669 goto error;
670
671 for (i = 0; i < num_blocks; ++i) {
672 struct si_pc_block *block = &blocks[i];
673 unsigned instances = block->instances;
674
675 if (!strcmp(block->b->name, "IA")) {
676 if (screen->b.info.max_se > 2)
677 instances = 2;
678 }
679
680 if (!r600_perfcounters_add_block(&screen->b, pc,
681 block->b->name,
682 block->b->flags,
683 block->b->num_counters,
684 block->selectors,
685 instances,
686 block))
687 goto error;
688 }
689
690 screen->b.perfcounters = pc;
691 return;
692
693 error:
694 r600_perfcounters_do_destroy(pc);
695 }