anv: Implement VK_KHR_performance_query
[mesa.git] src/intel/perf/gen_perf.c
/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <dirent.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

#ifndef HAVE_DIRENT_D_TYPE
#include <limits.h> // PATH_MAX
#endif

#include <drm-uapi/i915_drm.h>

#include "common/gen_gem.h"

#include "dev/gen_debug.h"
#include "dev/gen_device_info.h"

#include "perf/gen_perf.h"
#include "perf/gen_perf_regs.h"
#include "perf/gen_perf_mdapi.h"
#include "perf/gen_perf_metrics.h"
#include "perf/gen_perf_private.h"

#include "util/bitscan.h"
#include "util/macros.h"
#include "util/mesa-sha1.h"
#include "util/u_math.h"

#define FILE_DEBUG_FLAG DEBUG_PERFMON

#define OA_REPORT_INVALID_CTX_ID (0xffffffff)

static bool
is_dir_or_link(const struct dirent *entry, const char *parent_dir)
{
#ifdef HAVE_DIRENT_D_TYPE
   return entry->d_type == DT_DIR || entry->d_type == DT_LNK;
#else
   struct stat st;
   char path[PATH_MAX + 1];
   snprintf(path, sizeof(path), "%s/%s", parent_dir, entry->d_name);
   if (lstat(path, &st) != 0)
      return false;
   return S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode);
#endif
}

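/* Locate the sysfs directory for the DRM device backing @fd. The path is
 * derived from the device's major:minor numbers, i.e.
 * /sys/dev/char/<maj>:<min>/device/drm/card<N>, and is cached in
 * perf->sysfs_dev_dir for later metric-set and frequency file lookups.
 */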
static bool
get_sysfs_dev_dir(struct gen_perf_config *perf, int fd)
{
   struct stat sb;
   int min, maj;
   DIR *drmdir;
   struct dirent *drm_entry;
   int len;

   perf->sysfs_dev_dir[0] = '\0';

   if (fstat(fd, &sb)) {
      DBG("Failed to stat DRM fd\n");
      return false;
   }

   maj = major(sb.st_rdev);
   min = minor(sb.st_rdev);

   if (!S_ISCHR(sb.st_mode)) {
      DBG("DRM fd is not a character device as expected\n");
      return false;
   }

   len = snprintf(perf->sysfs_dev_dir,
                  sizeof(perf->sysfs_dev_dir),
                  "/sys/dev/char/%d:%d/device/drm", maj, min);
   if (len < 0 || len >= sizeof(perf->sysfs_dev_dir)) {
      DBG("Failed to concatenate sysfs path to drm device\n");
      return false;
   }

   drmdir = opendir(perf->sysfs_dev_dir);
   if (!drmdir) {
      DBG("Failed to open %s: %m\n", perf->sysfs_dev_dir);
      return false;
   }

   while ((drm_entry = readdir(drmdir))) {
      if (is_dir_or_link(drm_entry, perf->sysfs_dev_dir) &&
          strncmp(drm_entry->d_name, "card", 4) == 0)
      {
         len = snprintf(perf->sysfs_dev_dir,
                        sizeof(perf->sysfs_dev_dir),
                        "/sys/dev/char/%d:%d/device/drm/%s",
                        maj, min, drm_entry->d_name);
         closedir(drmdir);
         if (len < 0 || len >= sizeof(perf->sysfs_dev_dir))
            return false;
         else
            return true;
      }
   }

   closedir(drmdir);

   DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
       maj, min);

   return false;
}

static bool
read_file_uint64(const char *file, uint64_t *val)
{
   char buf[32];
   int fd, n;

   fd = open(file, O_RDONLY);
   if (fd < 0)
      return false;
   while ((n = read(fd, buf, sizeof (buf) - 1)) < 0 &&
          errno == EINTR);
   close(fd);
   if (n < 0)
      return false;

   buf[n] = '\0';
   *val = strtoull(buf, NULL, 0);

   return true;
}

static bool
read_sysfs_drm_device_file_uint64(struct gen_perf_config *perf,
                                  const char *file,
                                  uint64_t *value)
{
   char buf[512];
   int len;

   len = snprintf(buf, sizeof(buf), "%s/%s", perf->sysfs_dev_dir, file);
   if (len < 0 || len >= sizeof(buf)) {
      DBG("Failed to concatenate sys filename to read u64 from\n");
      return false;
   }

   return read_file_uint64(buf, value);
}

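/* Add a copy of a Mesa-known metric set to the perf config's query list,
 * recording the kernel-assigned metric set id and the OA report format to
 * request for this hardware generation.
 */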
static void
register_oa_config(struct gen_perf_config *perf,
                   const struct gen_device_info *devinfo,
                   const struct gen_perf_query_info *query,
                   uint64_t config_id)
{
   struct gen_perf_query_info *registered_query =
      gen_perf_append_query_info(perf, 0);

   *registered_query = *query;
   registered_query->oa_format = devinfo->gen >= 8 ?
      I915_OA_FORMAT_A32u40_A4u32_B8_C8 : I915_OA_FORMAT_A45_B8_C8;
   registered_query->oa_metrics_set_id = config_id;
   DBG("metric set registered: id = %" PRIu64", guid = %s\n",
       registered_query->oa_metrics_set_id, query->guid);
}

static void
enumerate_sysfs_metrics(struct gen_perf_config *perf,
                        const struct gen_device_info *devinfo)
{
   DIR *metricsdir = NULL;
   struct dirent *metric_entry;
   char buf[256];
   int len;

   len = snprintf(buf, sizeof(buf), "%s/metrics", perf->sysfs_dev_dir);
   if (len < 0 || len >= sizeof(buf)) {
      DBG("Failed to concatenate path to sysfs metrics/ directory\n");
      return;
   }

   metricsdir = opendir(buf);
   if (!metricsdir) {
      DBG("Failed to open %s: %m\n", buf);
      return;
   }

   while ((metric_entry = readdir(metricsdir))) {
      struct hash_entry *entry;
      if (!is_dir_or_link(metric_entry, buf) ||
          metric_entry->d_name[0] == '.')
         continue;

      DBG("metric set: %s\n", metric_entry->d_name);
      entry = _mesa_hash_table_search(perf->oa_metrics_table,
                                      metric_entry->d_name);
      if (entry) {
         uint64_t id;
         if (!gen_perf_load_metric_id(perf, metric_entry->d_name, &id)) {
            DBG("Failed to read metric set id from %s: %m", buf);
            continue;
         }

         register_oa_config(perf, devinfo,
                            (const struct gen_perf_query_info *)entry->data, id);
      } else
         DBG("metric set not known by mesa (skipping)\n");
   }

   closedir(metricsdir);
}

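/* Probe for DRM_IOCTL_I915_PERF_REMOVE_CONFIG support by trying to remove a
 * config id that cannot exist: a kernel implementing the ioctl rejects the
 * bogus id with ENOENT, so ENOENT here means dynamic OA configs can be added
 * and removed at runtime.
 */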
static bool
kernel_has_dynamic_config_support(struct gen_perf_config *perf, int fd)
{
   uint64_t invalid_config_id = UINT64_MAX;

   return gen_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG,
                    &invalid_config_id) < 0 && errno == ENOENT;
}

static int
i915_query_items(struct gen_perf_config *perf, int fd,
                 struct drm_i915_query_item *items, uint32_t n_items)
{
   struct drm_i915_query q = {
      .num_items = n_items,
      .items_ptr = to_user_pointer(items),
   };
   return gen_ioctl(fd, DRM_IOCTL_I915_QUERY, &q);
}

static bool
i915_query_perf_config_supported(struct gen_perf_config *perf, int fd)
{
   struct drm_i915_query_item item = {
      .query_id = DRM_I915_QUERY_PERF_CONFIG,
      .flags = DRM_I915_QUERY_PERF_CONFIG_LIST,
   };

   return i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0;
}

static bool
i915_query_perf_config_data(struct gen_perf_config *perf,
                            int fd, const char *guid,
                            struct drm_i915_perf_oa_config *config)
{
   struct {
      struct drm_i915_query_perf_config query;
      struct drm_i915_perf_oa_config config;
   } item_data;
   struct drm_i915_query_item item = {
      .query_id = DRM_I915_QUERY_PERF_CONFIG,
      .flags = DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID,
      .data_ptr = to_user_pointer(&item_data),
      .length = sizeof(item_data),
   };

   memset(&item_data, 0, sizeof(item_data));
   memcpy(item_data.query.uuid, guid, sizeof(item_data.query.uuid));
   memcpy(&item_data.config, config, sizeof(item_data.config));

   if (!(i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0))
      return false;

   memcpy(config, &item_data.config, sizeof(item_data.config));

   return true;
}

bool
gen_perf_load_metric_id(struct gen_perf_config *perf_cfg,
                        const char *guid,
                        uint64_t *metric_id)
{
   char config_path[280];

   snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id",
            perf_cfg->sysfs_dev_dir, guid);

   /* Don't recreate already loaded configs. */
   return read_file_uint64(config_path, metric_id);
}

static uint64_t
i915_add_config(struct gen_perf_config *perf, int fd,
                const struct gen_perf_registers *config,
                const char *guid)
{
   struct drm_i915_perf_oa_config i915_config = { 0, };

   memcpy(i915_config.uuid, guid, sizeof(i915_config.uuid));

   i915_config.n_mux_regs = config->n_mux_regs;
   i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs);

   i915_config.n_boolean_regs = config->n_b_counter_regs;
   i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs);

   i915_config.n_flex_regs = config->n_flex_regs;
   i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs);

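   /* On success the ioctl returns the (positive) id of the new metric set;
    * collapse any error to 0 so callers only have to check for a non-zero
    * id.
    */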
   int ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &i915_config);
   return ret > 0 ? ret : 0;
}

static void
init_oa_configs(struct gen_perf_config *perf, int fd,
                const struct gen_device_info *devinfo)
{
   hash_table_foreach(perf->oa_metrics_table, entry) {
      const struct gen_perf_query_info *query = entry->data;
      uint64_t config_id;

      if (gen_perf_load_metric_id(perf, query->guid, &config_id)) {
         DBG("metric set: %s (already loaded)\n", query->guid);
         register_oa_config(perf, devinfo, query, config_id);
         continue;
      }

      uint64_t ret = i915_add_config(perf, fd, &query->config, query->guid);
      if (ret == 0) {
         DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
             query->name, query->guid, strerror(errno));
         continue;
      }

      register_oa_config(perf, devinfo, query, ret);
      DBG("metric set: %s (added)\n", query->guid);
   }
}

static void
compute_topology_builtins(struct gen_perf_config *perf,
                          const struct gen_device_info *devinfo)
{
   perf->sys_vars.slice_mask = devinfo->slice_masks;
   perf->sys_vars.n_eu_slices = devinfo->num_slices;

   for (int i = 0; i < sizeof(devinfo->subslice_masks); i++) {
      perf->sys_vars.n_eu_sub_slices +=
         __builtin_popcount(devinfo->subslice_masks[i]);
   }

   for (int i = 0; i < sizeof(devinfo->eu_masks); i++)
      perf->sys_vars.n_eus += __builtin_popcount(devinfo->eu_masks[i]);

   perf->sys_vars.eu_threads_count = devinfo->num_thread_per_eu;

   /* The subslice mask builtin contains bits for all slices. Prior to Gen11
    * it used groups of 3 bits per slice; on Gen11 it is 8 bits per slice.
    *
    * Ideally equations would be updated to have a slice/subslice query
    * function/operator.
    */
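   /* For example, on a hypothetical 2-slice pre-Gen11 part with all 3
    * subslices of each slice available, the builtin would be 0b111111
    * (slice 1 subslices in bits 5:3, slice 0 subslices in bits 2:0).
    */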
   perf->sys_vars.subslice_mask = 0;

   int bits_per_subslice = devinfo->gen == 11 ? 8 : 3;

   for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) {
      for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) {
         if (gen_device_info_subslice_available(devinfo, s, ss))
            perf->sys_vars.subslice_mask |= 1ULL << (s * bits_per_subslice + ss);
      }
   }
}

static bool
init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *devinfo)
{
   uint64_t min_freq_mhz = 0, max_freq_mhz = 0;

   if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz))
      return false;

   if (!read_sysfs_drm_device_file_uint64(perf, "gt_max_freq_mhz", &max_freq_mhz))
      return false;

   memset(&perf->sys_vars, 0, sizeof(perf->sys_vars));
   perf->sys_vars.gt_min_freq = min_freq_mhz * 1000000;
   perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000;
   perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
   perf->sys_vars.revision = devinfo->revision;
   compute_topology_builtins(perf, devinfo);

   return true;
}

typedef void (*perf_register_oa_queries_t)(struct gen_perf_config *);

static perf_register_oa_queries_t
get_register_queries_function(const struct gen_device_info *devinfo)
{
   if (devinfo->is_haswell)
      return gen_oa_register_queries_hsw;
   if (devinfo->is_cherryview)
      return gen_oa_register_queries_chv;
   if (devinfo->is_broadwell)
      return gen_oa_register_queries_bdw;
   if (devinfo->is_broxton)
      return gen_oa_register_queries_bxt;
   if (devinfo->is_skylake) {
      if (devinfo->gt == 2)
         return gen_oa_register_queries_sklgt2;
      if (devinfo->gt == 3)
         return gen_oa_register_queries_sklgt3;
      if (devinfo->gt == 4)
         return gen_oa_register_queries_sklgt4;
   }
   if (devinfo->is_kabylake) {
      if (devinfo->gt == 2)
         return gen_oa_register_queries_kblgt2;
      if (devinfo->gt == 3)
         return gen_oa_register_queries_kblgt3;
   }
   if (devinfo->is_geminilake)
      return gen_oa_register_queries_glk;
   if (devinfo->is_coffeelake) {
      if (devinfo->gt == 2)
         return gen_oa_register_queries_cflgt2;
      if (devinfo->gt == 3)
         return gen_oa_register_queries_cflgt3;
   }
   if (devinfo->is_cannonlake)
      return gen_oa_register_queries_cnl;
   if (devinfo->gen == 11) {
      if (devinfo->is_elkhartlake)
         return gen_oa_register_queries_lkf;
      return gen_oa_register_queries_icl;
   }
   if (devinfo->gen == 12)
      return gen_oa_register_queries_tgl;

   return NULL;
}

static void
load_pipeline_statistic_metrics(struct gen_perf_config *perf_cfg,
                                const struct gen_device_info *devinfo)
{
   struct gen_perf_query_info *query =
      gen_perf_append_query_info(perf_cfg, MAX_STAT_COUNTERS);

   query->kind = GEN_PERF_QUERY_TYPE_PIPELINE;
   query->name = "Pipeline Statistics Registers";

   gen_perf_query_add_basic_stat_reg(query, IA_VERTICES_COUNT,
                                     "N vertices submitted");
   gen_perf_query_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
                                     "N primitives submitted");
   gen_perf_query_add_basic_stat_reg(query, VS_INVOCATION_COUNT,
                                     "N vertex shader invocations");

   if (devinfo->gen == 6) {
      gen_perf_query_add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
                                  "SO_PRIM_STORAGE_NEEDED",
                                  "N geometry shader stream-out primitives (total)");
      gen_perf_query_add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1,
                                  "SO_NUM_PRIMS_WRITTEN",
                                  "N geometry shader stream-out primitives (written)");
   } else {
      gen_perf_query_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
                                  "SO_PRIM_STORAGE_NEEDED (Stream 0)",
                                  "N stream-out (stream 0) primitives (total)");
      gen_perf_query_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
                                  "SO_PRIM_STORAGE_NEEDED (Stream 1)",
                                  "N stream-out (stream 1) primitives (total)");
      gen_perf_query_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
                                  "SO_PRIM_STORAGE_NEEDED (Stream 2)",
                                  "N stream-out (stream 2) primitives (total)");
      gen_perf_query_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
                                  "SO_PRIM_STORAGE_NEEDED (Stream 3)",
                                  "N stream-out (stream 3) primitives (total)");
      gen_perf_query_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
                                  "SO_NUM_PRIMS_WRITTEN (Stream 0)",
                                  "N stream-out (stream 0) primitives (written)");
      gen_perf_query_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
                                  "SO_NUM_PRIMS_WRITTEN (Stream 1)",
                                  "N stream-out (stream 1) primitives (written)");
      gen_perf_query_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
                                  "SO_NUM_PRIMS_WRITTEN (Stream 2)",
                                  "N stream-out (stream 2) primitives (written)");
      gen_perf_query_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
                                  "SO_NUM_PRIMS_WRITTEN (Stream 3)",
                                  "N stream-out (stream 3) primitives (written)");
   }

   gen_perf_query_add_basic_stat_reg(query, HS_INVOCATION_COUNT,
                                     "N TCS shader invocations");
   gen_perf_query_add_basic_stat_reg(query, DS_INVOCATION_COUNT,
                                     "N TES shader invocations");

   gen_perf_query_add_basic_stat_reg(query, GS_INVOCATION_COUNT,
                                     "N geometry shader invocations");
   gen_perf_query_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
                                     "N geometry shader primitives emitted");

   gen_perf_query_add_basic_stat_reg(query, CL_INVOCATION_COUNT,
                                     "N primitives entering clipping");
   gen_perf_query_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
                                     "N primitives leaving clipping");

   if (devinfo->is_haswell || devinfo->gen == 8) {
      gen_perf_query_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
                                  "N fragment shader invocations",
                                  "N fragment shader invocations");
   } else {
      gen_perf_query_add_basic_stat_reg(query, PS_INVOCATION_COUNT,
                                        "N fragment shader invocations");
   }

   gen_perf_query_add_basic_stat_reg(query, PS_DEPTH_COUNT,
                                     "N z-pass fragments");

   if (devinfo->gen >= 7) {
      gen_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT,
                                        "N compute shader invocations");
   }

   query->data_size = sizeof(uint64_t) * query->n_counters;
}

static int
i915_perf_version(int drm_fd)
{
   int tmp;
   drm_i915_getparam_t gp = {
      .param = I915_PARAM_PERF_REVISION,
      .value = &tmp,
   };

   int ret = gen_ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp);

   /* Return 0 if this getparam is not supported; the first supported version
    * is 1.
    */
   return ret < 0 ? 0 : tmp;
}

static void
i915_get_sseu(int drm_fd, struct drm_i915_gem_context_param_sseu *sseu)
{
   struct drm_i915_gem_context_param arg = {
      .param = I915_CONTEXT_PARAM_SSEU,
      .size = sizeof(*sseu),
      .value = to_user_pointer(sseu)
   };

   gen_ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg);
}

static int
compare_counters(const void *_c1, const void *_c2)
{
   const struct gen_perf_query_counter * const *c1 = _c1, * const *c2 = _c2;
   return strcmp((*c1)->symbol_name, (*c2)->symbol_name);
}

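/* Build a deduplicated list of all counters exposed by the registered
 * queries, keyed by symbol name. Each unique counter gets a query_mask with
 * one bit per query that can provide it; the final array is sorted by symbol
 * name so the counters can be exposed in a stable order and later used to
 * compute multi-counter pass schedules.
 */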
static void
build_unique_counter_list(struct gen_perf_config *perf)
{
   assert(perf->n_queries < 64);

   struct hash_table *counters_table =
      _mesa_hash_table_create(perf,
                              _mesa_hash_string,
                              _mesa_key_string_equal);
   struct hash_entry *entry;
   for (int q = 0; q < perf->n_queries; q++) {
      struct gen_perf_query_info *query = &perf->queries[q];

      for (int c = 0; c < query->n_counters; c++) {
         struct gen_perf_query_counter *counter, *unique_counter;

         counter = &query->counters[c];
         entry = _mesa_hash_table_search(counters_table, counter->symbol_name);

         if (entry) {
            unique_counter = entry->data;
            unique_counter->query_mask |= BITFIELD64_BIT(q);
            continue;
         }

         unique_counter = counter;
         unique_counter->query_mask = BITFIELD64_BIT(q);

         _mesa_hash_table_insert(counters_table, unique_counter->symbol_name, unique_counter);
      }
   }

   perf->n_counters = _mesa_hash_table_num_entries(counters_table);
   perf->counters = ralloc_array(perf, struct gen_perf_query_counter *,
                                 perf->n_counters);

   int c = 0;
   hash_table_foreach(counters_table, entry) {
      struct gen_perf_query_counter *counter = entry->data;
      perf->counters[c++] = counter;
   }

   _mesa_hash_table_destroy(counters_table, NULL);

   qsort(perf->counters, perf->n_counters, sizeof(perf->counters[0]),
         compare_counters);
}

static bool
load_oa_metrics(struct gen_perf_config *perf, int fd,
                const struct gen_device_info *devinfo)
{
   perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo);
   bool i915_perf_oa_available = false;
   struct stat sb;

   perf->i915_query_supported = i915_query_perf_config_supported(perf, fd);
   perf->i915_perf_version = i915_perf_version(fd);

   /* Record the default SSEU configuration. */
   i915_get_sseu(fd, &perf->sseu);

   /* The existence of this sysctl parameter implies the kernel supports
    * the i915 perf interface.
    */
   if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {

      /* If _paranoid == 1 then on Gen8+ we won't be able to access OA
       * metrics unless running as root.
       */
      if (devinfo->is_haswell)
         i915_perf_oa_available = true;
      else {
         uint64_t paranoid = 1;

         read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", &paranoid);

         if (paranoid == 0 || geteuid() == 0)
            i915_perf_oa_available = true;
      }
   }

   perf->platform_supported = oa_register != NULL;

   if (!i915_perf_oa_available ||
       !oa_register ||
       !get_sysfs_dev_dir(perf, fd) ||
       !init_oa_sys_vars(perf, devinfo))
      return false;

   perf->oa_metrics_table =
      _mesa_hash_table_create(perf, _mesa_hash_string,
                              _mesa_key_string_equal);

   /* Index all the metric sets mesa knows about before looking to see what
    * the kernel is advertising.
    */
   oa_register(perf);

   if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) &&
       kernel_has_dynamic_config_support(perf, fd))
      init_oa_configs(perf, fd, devinfo);
   else
      enumerate_sysfs_metrics(perf, devinfo);

   build_unique_counter_list(perf);

   return true;
}

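/* Read back the register programming of an OA config from the kernel using
 * the i915 query uAPI. This is a two step process: the first
 * i915_query_perf_config_data() call fills drm_i915_perf_oa_config with the
 * number of registers of each type, the arrays are then allocated and the
 * same query is issued again with the array pointers set so the kernel can
 * copy the (offset, value) pairs out.
 */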
struct gen_perf_registers *
gen_perf_load_configuration(struct gen_perf_config *perf_cfg, int fd, const char *guid)
{
   if (!perf_cfg->i915_query_supported)
      return NULL;

   struct drm_i915_perf_oa_config i915_config = { 0, };
   if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config))
      return NULL;

   struct gen_perf_registers *config = rzalloc(NULL, struct gen_perf_registers);
   config->n_flex_regs = i915_config.n_flex_regs;
   config->flex_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_flex_regs);
   config->n_mux_regs = i915_config.n_mux_regs;
   config->mux_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_mux_regs);
   config->n_b_counter_regs = i915_config.n_boolean_regs;
   config->b_counter_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_b_counter_regs);

   /*
    * struct gen_perf_query_register_prog maps exactly to the tuple of
    * (register offset, register value) returned by the i915.
    */
   i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs);
   i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs);
   i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs);
   if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) {
      ralloc_free(config);
      return NULL;
   }

   return config;
}

uint64_t
gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd,
                             const struct gen_perf_registers *config,
                             const char *guid)
{
   if (guid)
      return i915_add_config(perf_cfg, fd, config, guid);

   struct mesa_sha1 sha1_ctx;
   _mesa_sha1_init(&sha1_ctx);

   if (config->flex_regs) {
      _mesa_sha1_update(&sha1_ctx, config->flex_regs,
                        sizeof(config->flex_regs[0]) *
                        config->n_flex_regs);
   }
   if (config->mux_regs) {
      _mesa_sha1_update(&sha1_ctx, config->mux_regs,
                        sizeof(config->mux_regs[0]) *
                        config->n_mux_regs);
   }
   if (config->b_counter_regs) {
      _mesa_sha1_update(&sha1_ctx, config->b_counter_regs,
                        sizeof(config->b_counter_regs[0]) *
                        config->n_b_counter_regs);
   }

   uint8_t hash[20];
   _mesa_sha1_final(&sha1_ctx, hash);

   char formatted_hash[41];
   _mesa_sha1_format(formatted_hash, hash);

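   /* Derive a stable GUID from the SHA1 of the register programming by
    * slicing the 40 character hex digest into the usual 8-4-4-4-12 layout,
    * e.g. "da39a3ee5e6b4b0d3255bfef95601890afd80709" becomes
    * "da39a3ee-5e6b-4b0d-3255-bfef95601890".
    */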
   char generated_guid[37];
   snprintf(generated_guid, sizeof(generated_guid),
            "%.8s-%.4s-%.4s-%.4s-%.12s",
            &formatted_hash[0], &formatted_hash[8],
            &formatted_hash[8 + 4], &formatted_hash[8 + 4 + 4],
            &formatted_hash[8 + 4 + 4 + 4]);

   /* Check if already present. */
   uint64_t id;
   if (gen_perf_load_metric_id(perf_cfg, generated_guid, &id))
      return id;

   return i915_add_config(perf_cfg, fd, config, generated_guid);
}

static uint64_t
get_passes_mask(struct gen_perf_config *perf,
                const uint32_t *counter_indices,
                uint32_t counter_indices_count)
{
   uint64_t queries_mask = 0;

   assert(perf->n_queries < 64);

   /* Compute the number of passes by going through all counters N times (with
    * N the number of queries) to make sure we select the most constraining
    * counters first and look at the more flexible ones (that could be
    * obtained from multiple queries) later. That way we minimize the number
    * of passes required.
    */
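   /* Concretely: on the first iteration (q == 0) only counters available from
    * a single query force that query into the mask; later iterations consider
    * counters available from 2, 3, ... queries and only add a query (the
    * lowest-indexed one in the counter's mask) if none of the queries able to
    * provide the counter has been selected yet.
    */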
   for (uint32_t q = 0; q < perf->n_queries; q++) {
      for (uint32_t i = 0; i < counter_indices_count; i++) {
         assert(counter_indices[i] < perf->n_counters);

         uint32_t idx = counter_indices[i];
         if (__builtin_popcountll(perf->counters[idx]->query_mask) != (q + 1))
            continue;

         if (queries_mask & perf->counters[idx]->query_mask)
            continue;

         queries_mask |= BITFIELD64_BIT(ffsll(perf->counters[idx]->query_mask) - 1);
      }
   }

   return queries_mask;
}

uint32_t
gen_perf_get_n_passes(struct gen_perf_config *perf,
                      const uint32_t *counter_indices,
                      uint32_t counter_indices_count,
                      struct gen_perf_query_info **pass_queries)
{
   uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count);

   if (pass_queries) {
      uint32_t pass = 0;
      for (uint32_t q = 0; q < perf->n_queries; q++) {
         if ((1ULL << q) & queries_mask)
            pass_queries[pass++] = &perf->queries[q];
      }
   }

   return __builtin_popcountll(queries_mask);
}

void
gen_perf_get_counters_passes(struct gen_perf_config *perf,
                             const uint32_t *counter_indices,
                             uint32_t counter_indices_count,
                             struct gen_perf_counter_pass *counter_pass)
{
   uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count);
   uint32_t n_passes = __builtin_popcountll(queries_mask);

   for (uint32_t i = 0; i < counter_indices_count; i++) {
      assert(counter_indices[i] < perf->n_counters);

      uint32_t idx = counter_indices[i];
      counter_pass[i].counter = perf->counters[idx];

      uint32_t query_idx = ffsll(perf->counters[idx]->query_mask & queries_mask) - 1;
      counter_pass[i].query = &perf->queries[query_idx];

      uint32_t clear_bits = 63 - query_idx;
      counter_pass[i].pass = __builtin_popcountll((queries_mask << clear_bits) >> clear_bits) - 1;
      assert(counter_pass[i].pass < n_passes);
   }
}

/* Accumulate 32-bit OA counters */
static inline void
accumulate_uint32(const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   *accumulator += (uint32_t)(*report1 - *report0);
}

/* Accumulate 40-bit OA counters */
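/* In the A32u40_A4u32_B8_C8 report format the low 32 bits of the 40-bit A
 * counters live in dwords 4..35 and the high 8 bits are packed one byte per
 * counter starting at byte offset 160 (report + 40 dwords). A counter can
 * wrap between two reports, in which case the delta is computed modulo 2^40:
 * e.g. value0 = 0xffffffffff and value1 = 0x5 gives a delta of 6.
 */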
static inline void
accumulate_uint40(int a_index,
                  const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
   const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
   uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
   uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
   uint64_t value0 = report0[a_index + 4] | high0;
   uint64_t value1 = report1[a_index + 4] | high1;
   uint64_t delta;

   if (value0 > value1)
      delta = (1ULL << 40) + value1 - value0;
   else
      delta = value1 - value0;

   *accumulator += delta;
}

static void
gen8_read_report_clock_ratios(const uint32_t *report,
                              uint64_t *slice_freq_hz,
                              uint64_t *unslice_freq_hz)
{
   /* The lower 16 bits of the RPT_ID field of the OA reports contain a
    * snapshot of the bits coming from the RP_FREQ_NORMAL register and are
    * divided this way:
    *
    * RPT_ID[31:25]: RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency)
    * RPT_ID[10:9]:  RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency)
    * RPT_ID[8:0]:   RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency)
    *
    * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    *
    * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    */

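   /* Each squashed ratio is a multiple of 16.666MHz as used below, e.g. a
    * ratio of 30 reads back as roughly 500MHz.
    */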
   uint32_t unslice_freq = report[0] & 0x1ff;
   uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
   uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
   uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);

   *slice_freq_hz = slice_freq * 16666667ULL;
   *unslice_freq_hz = unslice_freq * 16666667ULL;
}

void
gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
                                       const struct gen_device_info *devinfo,
                                       const uint32_t *start,
                                       const uint32_t *end)
{
   /* Slice/Unslice frequency is only available in the OA reports when the
    * "Disable OA reports due to clock ratio change" field in
    * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this
    * global register (see drivers/gpu/drm/i915/i915_perf.c).
    *
    * Documentation says this should be available on Gen9+ but experimentation
    * shows that Gen8 reports similar values, so we enable it there too.
    */
   if (devinfo->gen < 8)
      return;

   gen8_read_report_clock_ratios(start,
                                 &result->slice_frequency[0],
                                 &result->unslice_frequency[0]);
   gen8_read_report_clock_ratios(end,
                                 &result->slice_frequency[1],
                                 &result->unslice_frequency[1]);
}

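/* Accumulate the deltas between two OA report snapshots into the result's
 * accumulator array, using the counter offsets recorded in the query. For the
 * A32u40_A4u32_B8_C8 layout used on Gen8+, dword 1 holds the timestamp,
 * dword 2 the context id, dword 3 the GPU clock, dwords 4..35 the low bits of
 * the 40-bit A counters (high bytes at dwords 40..47), dwords 36..39 the
 * 32-bit A counters, and dwords 48..63 the B and C counters.
 */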
void
gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
                                 const struct gen_perf_query_info *query,
                                 const uint32_t *start,
                                 const uint32_t *end)
{
   int i;

   if (result->hw_id == OA_REPORT_INVALID_CTX_ID &&
       start[2] != OA_REPORT_INVALID_CTX_ID)
      result->hw_id = start[2];
   if (result->reports_accumulated == 0)
      result->begin_timestamp = start[1];
   result->reports_accumulated++;

   switch (query->oa_format) {
   case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
      accumulate_uint32(start + 1, end + 1,
                        result->accumulator + query->gpu_time_offset); /* timestamp */
      accumulate_uint32(start + 3, end + 3,
                        result->accumulator + query->gpu_clock_offset); /* clock */

      /* 32x 40bit A counters... */
      for (i = 0; i < 32; i++) {
         accumulate_uint40(i, start, end,
                           result->accumulator + query->a_offset + i);
      }

      /* 4x 32bit A counters... */
      for (i = 0; i < 4; i++) {
         accumulate_uint32(start + 36 + i, end + 36 + i,
                           result->accumulator + query->a_offset + 32 + i);
      }

      /* 8x 32bit B counters */
      for (i = 0; i < 8; i++) {
         accumulate_uint32(start + 48 + i, end + 48 + i,
                           result->accumulator + query->b_offset + i);
      }

      /* 8x 32bit C counters... */
      for (i = 0; i < 8; i++) {
         accumulate_uint32(start + 56 + i, end + 56 + i,
                           result->accumulator + query->c_offset + i);
      }
      break;

   case I915_OA_FORMAT_A45_B8_C8:
      accumulate_uint32(start + 1, end + 1, result->accumulator); /* timestamp */

      for (i = 0; i < 61; i++) {
         accumulate_uint32(start + 3 + i, end + 3 + i,
                           result->accumulator + query->a_offset + i);
      }
      break;

   default:
      unreachable("Can't accumulate OA counters in unknown format");
   }
}

void
gen_perf_query_result_clear(struct gen_perf_query_result *result)
{
   memset(result, 0, sizeof(*result));
   result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */
}

void
gen_perf_init_metrics(struct gen_perf_config *perf_cfg,
                      const struct gen_device_info *devinfo,
                      int drm_fd,
                      bool include_pipeline_statistics)
{
   if (include_pipeline_statistics) {
      load_pipeline_statistic_metrics(perf_cfg, devinfo);
      gen_perf_register_mdapi_statistic_query(perf_cfg, devinfo);
   }
   if (load_oa_metrics(perf_cfg, drm_fd, devinfo))
      gen_perf_register_mdapi_oa_query(perf_cfg, devinfo);
}