radeonsi: re-order the SQ_xx performance counter blocks
[mesa.git] / src / gallium / drivers / radeon / r600_perfcounter.c
1 /*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Nicolai Hähnle <nicolai.haehnle@amd.com>
25 *
26 */
27
28 #include "util/u_memory.h"
29 #include "r600_query.h"
30 #include "r600_pipe_common.h"
31 #include "r600d_common.h"
32
33 /* Max counters per HW block */
34 #define R600_QUERY_MAX_COUNTERS 16
35
36 static struct r600_perfcounter_block *
37 lookup_counter(struct r600_perfcounters *pc, unsigned index,
38 unsigned *base_gid, unsigned *sub_index)
39 {
40 struct r600_perfcounter_block *block = pc->blocks;
41 unsigned bid;
42
43 *base_gid = 0;
44 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
45 unsigned total = block->num_groups * block->num_selectors;
46
47 if (index < total) {
48 *sub_index = index;
49 return block;
50 }
51
52 index -= total;
53 *base_gid += block->num_groups;
54 }
55
56 return NULL;
57 }
58
59 static struct r600_perfcounter_block *
60 lookup_group(struct r600_perfcounters *pc, unsigned *index)
61 {
62 unsigned bid;
63 struct r600_perfcounter_block *block = pc->blocks;
64
65 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
66 if (*index < block->num_groups)
67 return block;
68 *index -= block->num_groups;
69 }
70
71 return NULL;
72 }
73
/* Per-(block, sub-group) state of a batch performance-counter query.
 * Groups form a singly-linked list headed at r600_query_pc::groups. */
struct r600_pc_group {
	struct r600_pc_group *next;
	struct r600_perfcounter_block *block;
	unsigned sub_gid; /* only used during init */
	unsigned result_base; /* only used during init */
	int se;		/* shader engine index, or -1 to sample all SEs */
	int instance;	/* block instance, or -1 to sample all instances */
	unsigned num_counters;	/* number of entries used in selectors[] */
	unsigned selectors[R600_QUERY_MAX_COUNTERS];
};
84
/* Maps one user-requested counter onto the query's result buffer: the
 * result is the sum of `dwords` values starting at dword `base`, spaced
 * `stride` dwords apart (see r600_pc_query_add_result). */
struct r600_pc_counter {
	unsigned base;
	unsigned dwords;
	unsigned stride;
};
90
91 #define R600_PC_SHADERS_WINDOWING (1 << 31)
92
93 struct r600_query_pc {
94 struct r600_query_hw b;
95
96 unsigned shaders;
97 unsigned num_counters;
98 struct r600_pc_counter *counters;
99 struct r600_pc_group *groups;
100 };
101
102 static void r600_pc_query_destroy(struct r600_common_context *ctx,
103 struct r600_query *rquery)
104 {
105 struct r600_query_pc *query = (struct r600_query_pc *)rquery;
106
107 while (query->groups) {
108 struct r600_pc_group *group = query->groups;
109 query->groups = group->next;
110 FREE(group);
111 }
112
113 FREE(query->counters);
114
115 r600_query_hw_destroy(ctx, rquery);
116 }
117
/* Emit the command-stream packets that program the selected counters and
 * start sampling (r600_query_hw_ops::emit_start hook). */
static void r600_pc_query_emit_start(struct r600_common_context *ctx,
				     struct r600_query_hw *hwquery,
				     struct r600_resource *buffer, uint64_t va)
{
	struct r600_perfcounters *pc = ctx->screen->perfcounters;
	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
	struct r600_pc_group *group;
	int current_se = -1;
	int current_instance = -1;

	if (query->shaders)
		pc->emit_shaders(ctx, query->shaders);

	for (group = query->groups; group; group = group->next) {
		struct r600_perfcounter_block *block = group->block;

		/* Only re-emit the SE/instance selection when it changes. */
		if (group->se != current_se || group->instance != current_instance) {
			current_se = group->se;
			current_instance = group->instance;
			pc->emit_instance(ctx, group->se, group->instance);
		}

		pc->emit_select(ctx, block, group->num_counters, group->selectors);
	}

	/* Reset to the default (-1, -1) selection before starting. */
	if (current_se != -1 || current_instance != -1)
		pc->emit_instance(ctx, -1, -1);

	pc->emit_start(ctx, buffer, va);
}
148
/* Emit the command-stream packets that stop the counters and read results
 * back into `buffer` (r600_query_hw_ops::emit_stop hook).
 *
 * For groups that sample all SEs and/or all instances (se/instance < 0),
 * one read is emitted per (SE, instance) combination, advancing `va` by
 * one dword per counter for each read.
 */
static void r600_pc_query_emit_stop(struct r600_common_context *ctx,
				    struct r600_query_hw *hwquery,
				    struct r600_resource *buffer, uint64_t va)
{
	struct r600_perfcounters *pc = ctx->screen->perfcounters;
	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
	struct r600_pc_group *group;

	pc->emit_stop(ctx, buffer, va);

	for (group = query->groups; group; group = group->next) {
		struct r600_perfcounter_block *block = group->block;
		unsigned se = group->se >= 0 ? group->se : 0;
		unsigned se_end = se + 1;

		/* se < 0 on an SE-grouped block means: read every SE. */
		if ((block->flags & R600_PC_BLOCK_SE) && (group->se < 0))
			se_end = ctx->screen->info.max_se;

		do {
			unsigned instance = group->instance >= 0 ? group->instance : 0;

			do {
				pc->emit_instance(ctx, se, instance);
				pc->emit_read(ctx, block,
					      group->num_counters, group->selectors,
					      buffer, va);
				/* 4 bytes (one dword) per counter read. */
				va += 4 * group->num_counters;
			} while (group->instance < 0 && ++instance < block->num_instances);
		} while (++se < se_end);
	}

	pc->emit_instance(ctx, -1, -1);
}
182
183 static void r600_pc_query_clear_result(struct r600_query_hw *hwquery,
184 union pipe_query_result *result)
185 {
186 struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
187
188 memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
189 }
190
191 static void r600_pc_query_add_result(struct r600_common_context *ctx,
192 struct r600_query_hw *hwquery,
193 void *buffer,
194 union pipe_query_result *result)
195 {
196 struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
197 uint32_t *results = buffer;
198 unsigned i, j;
199
200 for (i = 0; i < query->num_counters; ++i) {
201 struct r600_pc_counter *counter = &query->counters[i];
202
203 for (j = 0; j < counter->dwords; ++j) {
204 uint32_t value = results[counter->base + j * counter->stride];
205 result->batch[i].u32 += value;
206 }
207 }
208 }
209
/* Query vtable: batch perfcounter queries reuse the generic hardware
 * query begin/end/get_result paths, with a custom destroy. */
static struct r600_query_ops batch_query_ops = {
	.destroy = r600_pc_query_destroy,
	.begin = r600_query_hw_begin,
	.end = r600_query_hw_end,
	.get_result = r600_query_hw_get_result
};
216
/* Hardware-query hooks implementing command-stream emission and result
 * accumulation for batch perfcounter queries. */
static struct r600_query_hw_ops batch_query_hw_ops = {
	.emit_start = r600_pc_query_emit_start,
	.emit_stop = r600_pc_query_emit_stop,
	.clear_result = r600_pc_query_clear_result,
	.add_result = r600_pc_query_add_result,
};
223
/* Find or create the r600_pc_group for (block, sub_gid) in `query`.
 *
 * sub_gid encodes the (shader type, SE, instance) sub-group selection in
 * mixed radix; it is decomposed here according to the block's grouping
 * flags. Returns NULL on allocation failure or when the requested shader
 * group conflicts with one already selected for this query.
 */
static struct r600_pc_group *get_group_state(struct r600_common_screen *screen,
					     struct r600_query_pc *query,
					     struct r600_perfcounter_block *block,
					     unsigned sub_gid)
{
	struct r600_pc_group *group = query->groups;

	/* Reuse an existing group state if one matches. */
	while (group) {
		if (group->block == block && group->sub_gid == sub_gid)
			return group;
		group = group->next;
	}

	group = CALLOC_STRUCT(r600_pc_group);
	if (!group)
		return NULL;

	group->block = block;
	group->sub_gid = sub_gid;

	if (block->flags & R600_PC_BLOCK_SHADER) {
		/* The most significant digit of sub_gid selects the shader
		 * type; the remainder selects SE/instance below. */
		unsigned sub_gids = block->num_instances;
		unsigned shader_id;
		unsigned shaders;
		unsigned query_shaders;

		if (block->flags & R600_PC_BLOCK_SE_GROUPS)
			sub_gids = sub_gids * screen->info.max_se;
		shader_id = sub_gid / sub_gids;
		sub_gid = sub_gid % sub_gids;

		shaders = screen->perfcounters->shader_type_bits[shader_id];

		/* All shader-grouped blocks in one query must agree on the
		 * shader mask (ignoring the windowing sentinel bit). */
		query_shaders = query->shaders & ~R600_PC_SHADERS_WINDOWING;
		if (query_shaders && query_shaders != shaders) {
			fprintf(stderr, "r600_perfcounter: incompatible shader groups\n");
			FREE(group);
			return NULL;
		}
		query->shaders = shaders;
	}

	if (block->flags & R600_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
		// A non-zero value in query->shaders ensures that the shader
		// masking is reset unless the user explicitly requests one.
		query->shaders = R600_PC_SHADERS_WINDOWING;
	}

	if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
		group->se = sub_gid / block->num_instances;
		sub_gid = sub_gid % block->num_instances;
	} else {
		group->se = -1; /* -1 = sample all shader engines */
	}

	if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
		group->instance = sub_gid;
	} else {
		group->instance = -1; /* -1 = sample all instances */
	}

	/* Prepend to the query's group list. */
	group->next = query->groups;
	query->groups = group;

	return group;
}
290
291 struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
292 unsigned num_queries,
293 unsigned *query_types)
294 {
295 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
296 struct r600_common_screen *screen = rctx->screen;
297 struct r600_perfcounters *pc = screen->perfcounters;
298 struct r600_perfcounter_block *block;
299 struct r600_pc_group *group;
300 struct r600_query_pc *query;
301 unsigned base_gid, sub_gid, sub_index;
302 unsigned i, j;
303
304 if (!pc)
305 return NULL;
306
307 query = CALLOC_STRUCT(r600_query_pc);
308 if (!query)
309 return NULL;
310
311 query->b.b.ops = &batch_query_ops;
312 query->b.ops = &batch_query_hw_ops;
313 query->b.flags = R600_QUERY_HW_FLAG_TIMER;
314
315 query->num_counters = num_queries;
316
317 /* Collect selectors per group */
318 for (i = 0; i < num_queries; ++i) {
319 unsigned sub_gid;
320
321 if (query_types[i] < R600_QUERY_FIRST_PERFCOUNTER)
322 goto error;
323
324 block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER,
325 &base_gid, &sub_index);
326 if (!block)
327 goto error;
328
329 sub_gid = sub_index / block->num_selectors;
330 sub_index = sub_index % block->num_selectors;
331
332 group = get_group_state(screen, query, block, sub_gid);
333 if (!group)
334 goto error;
335
336 if (group->num_counters >= block->num_counters) {
337 fprintf(stderr,
338 "perfcounter group %s: too many selected\n",
339 block->basename);
340 goto error;
341 }
342 group->selectors[group->num_counters] = sub_index;
343 ++group->num_counters;
344 }
345
346 /* Compute result bases and CS size per group */
347 query->b.num_cs_dw_begin = pc->num_start_cs_dwords;
348 query->b.num_cs_dw_end = pc->num_stop_cs_dwords;
349
350 query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */
351 query->b.num_cs_dw_end += pc->num_instance_cs_dwords;
352
353 i = 0;
354 for (group = query->groups; group; group = group->next) {
355 struct r600_perfcounter_block *block = group->block;
356 unsigned select_dw, read_dw;
357 unsigned instances = 1;
358
359 if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
360 instances = rctx->screen->info.max_se;
361 if (group->instance < 0)
362 instances *= block->num_instances;
363
364 group->result_base = i;
365 query->b.result_size += 4 * instances * group->num_counters;
366 i += instances * group->num_counters;
367
368 pc->get_size(block, group->num_counters, group->selectors,
369 &select_dw, &read_dw);
370 query->b.num_cs_dw_begin += select_dw;
371 query->b.num_cs_dw_end += instances * read_dw;
372 query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */
373 query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords;
374 }
375
376 if (query->shaders) {
377 if (query->shaders == R600_PC_SHADERS_WINDOWING)
378 query->shaders = 0xffffffff;
379 query->b.num_cs_dw_begin += pc->num_shaders_cs_dwords;
380 }
381
382 /* Map user-supplied query array to result indices */
383 query->counters = CALLOC(num_queries, sizeof(*query->counters));
384 for (i = 0; i < num_queries; ++i) {
385 struct r600_pc_counter *counter = &query->counters[i];
386 struct r600_perfcounter_block *block;
387
388 block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER,
389 &base_gid, &sub_index);
390
391 sub_gid = sub_index / block->num_selectors;
392 sub_index = sub_index % block->num_selectors;
393
394 group = get_group_state(screen, query, block, sub_gid);
395 assert(group != NULL);
396
397 for (j = 0; j < group->num_counters; ++j) {
398 if (group->selectors[j] == sub_index)
399 break;
400 }
401
402 counter->base = group->result_base + j;
403 counter->stride = group->num_counters;
404
405 counter->dwords = 1;
406 if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
407 counter->dwords = screen->info.max_se;
408 if (group->instance < 0)
409 counter->dwords *= block->num_instances;
410 }
411
412 if (!r600_query_hw_init(rctx, &query->b))
413 goto error;
414
415 return (struct pipe_query *)query;
416
417 error:
418 r600_pc_query_destroy(rctx, &query->b.b);
419 return NULL;
420 }
421
422 static boolean r600_init_block_names(struct r600_common_screen *screen,
423 struct r600_perfcounter_block *block)
424 {
425 unsigned i, j, k;
426 unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
427 unsigned namelen;
428 char *groupname;
429 char *p;
430
431 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
432 groups_instance = block->num_instances;
433 if (block->flags & R600_PC_BLOCK_SE_GROUPS)
434 groups_se = screen->info.max_se;
435 if (block->flags & R600_PC_BLOCK_SHADER)
436 groups_shader = screen->perfcounters->num_shader_types;
437
438 namelen = strlen(block->basename);
439 block->group_name_stride = namelen + 1;
440 if (block->flags & R600_PC_BLOCK_SHADER)
441 block->group_name_stride += 3;
442 if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
443 assert(groups_se <= 10);
444 block->group_name_stride += 1;
445
446 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
447 block->group_name_stride += 1;
448 }
449 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
450 assert(groups_instance <= 100);
451 block->group_name_stride += 2;
452 }
453
454 block->group_names = MALLOC(block->num_groups * block->group_name_stride);
455 if (!block->group_names)
456 return FALSE;
457
458 groupname = block->group_names;
459 for (i = 0; i < groups_shader; ++i) {
460 const char *shader_suffix = screen->perfcounters->shader_type_suffixes[i];
461 unsigned shaderlen = strlen(shader_suffix);
462 for (j = 0; j < groups_se; ++j) {
463 for (k = 0; k < groups_instance; ++k) {
464 strcpy(groupname, block->basename);
465 p = groupname + namelen;
466
467 if (block->flags & R600_PC_BLOCK_SHADER) {
468 strcpy(p, shader_suffix);
469 p += shaderlen;
470 }
471
472 if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
473 p += sprintf(p, "%d", j);
474 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
475 *p++ = '_';
476 }
477
478 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
479 p += sprintf(p, "%d", k);
480
481 groupname += block->group_name_stride;
482 }
483 }
484 }
485
486 assert(block->num_selectors <= 1000);
487 block->selector_name_stride = block->group_name_stride + 4;
488 block->selector_names = MALLOC(block->num_groups * block->num_selectors *
489 block->selector_name_stride);
490 if (!block->selector_names)
491 return FALSE;
492
493 groupname = block->group_names;
494 p = block->selector_names;
495 for (i = 0; i < block->num_groups; ++i) {
496 for (j = 0; j < block->num_selectors; ++j) {
497 sprintf(p, "%s_%03d", groupname, j);
498 p += block->selector_name_stride;
499 }
500 groupname += block->group_name_stride;
501 }
502
503 return TRUE;
504 }
505
506 int r600_get_perfcounter_info(struct r600_common_screen *screen,
507 unsigned index,
508 struct pipe_driver_query_info *info)
509 {
510 struct r600_perfcounters *pc = screen->perfcounters;
511 struct r600_perfcounter_block *block;
512 unsigned base_gid, sub;
513
514 if (!pc)
515 return 0;
516
517 if (!info) {
518 unsigned bid, num_queries = 0;
519
520 for (bid = 0; bid < pc->num_blocks; ++bid) {
521 num_queries += pc->blocks[bid].num_selectors *
522 pc->blocks[bid].num_groups;
523 }
524
525 return num_queries;
526 }
527
528 block = lookup_counter(pc, index, &base_gid, &sub);
529 if (!block)
530 return 0;
531
532 if (!block->selector_names) {
533 if (!r600_init_block_names(screen, block))
534 return 0;
535 }
536 info->name = block->selector_names + sub * block->selector_name_stride;
537 info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index;
538 info->max_value.u64 = 0;
539 info->type = PIPE_DRIVER_QUERY_TYPE_UINT;
540 info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
541 info->group_id = base_gid + sub / block->num_selectors;
542 info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
543 if (sub > 0 && sub + 1 < block->num_selectors * block->num_groups)
544 info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
545 return 1;
546 }
547
548 int r600_get_perfcounter_group_info(struct r600_common_screen *screen,
549 unsigned index,
550 struct pipe_driver_query_group_info *info)
551 {
552 struct r600_perfcounters *pc = screen->perfcounters;
553 struct r600_perfcounter_block *block;
554
555 if (!pc)
556 return 0;
557
558 if (!info)
559 return pc->num_groups;
560
561 block = lookup_group(pc, &index);
562 if (!block)
563 return 0;
564
565 if (!block->group_names) {
566 if (!r600_init_block_names(screen, block))
567 return 0;
568 }
569 info->name = block->group_names + index * block->group_name_stride;
570 info->num_queries = block->num_selectors;
571 info->max_active_queries = block->num_counters;
572 return 1;
573 }
574
575 void r600_perfcounters_destroy(struct r600_common_screen *rscreen)
576 {
577 if (rscreen->perfcounters)
578 rscreen->perfcounters->cleanup(rscreen);
579 }
580
581 boolean r600_perfcounters_init(struct r600_perfcounters *pc,
582 unsigned num_blocks)
583 {
584 pc->blocks = CALLOC(num_blocks, sizeof(struct r600_perfcounter_block));
585 if (!pc->blocks)
586 return FALSE;
587
588 pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", FALSE);
589 pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", FALSE);
590
591 return TRUE;
592 }
593
594 void r600_perfcounters_add_block(struct r600_common_screen *rscreen,
595 struct r600_perfcounters *pc,
596 const char *name, unsigned flags,
597 unsigned counters, unsigned selectors,
598 unsigned instances, void *data)
599 {
600 struct r600_perfcounter_block *block = &pc->blocks[pc->num_blocks];
601
602 assert(counters <= R600_QUERY_MAX_COUNTERS);
603
604 block->basename = name;
605 block->flags = flags;
606 block->num_counters = counters;
607 block->num_selectors = selectors;
608 block->num_instances = MAX2(instances, 1);
609 block->data = data;
610
611 if (pc->separate_se && (block->flags & R600_PC_BLOCK_SE))
612 block->flags |= R600_PC_BLOCK_SE_GROUPS;
613 if (pc->separate_instance && block->num_instances > 1)
614 block->flags |= R600_PC_BLOCK_INSTANCE_GROUPS;
615
616 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
617 block->num_groups = block->num_instances;
618 } else {
619 block->num_groups = 1;
620 }
621
622 if (block->flags & R600_PC_BLOCK_SE_GROUPS)
623 block->num_groups *= rscreen->info.max_se;
624 if (block->flags & R600_PC_BLOCK_SHADER)
625 block->num_groups *= pc->num_shader_types;
626
627 ++pc->num_blocks;
628 pc->num_groups += block->num_groups;
629 }
630
631 void r600_perfcounters_do_destroy(struct r600_perfcounters *pc)
632 {
633 unsigned i;
634
635 for (i = 0; i < pc->num_blocks; ++i) {
636 FREE(pc->blocks[i].group_names);
637 FREE(pc->blocks[i].selector_names);
638 }
639 FREE(pc->blocks);
640 FREE(pc);
641 }