intel/perf: break GL query stuff away
src/intel/perf/gen_perf.c
/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <dirent.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

#ifndef HAVE_DIRENT_D_TYPE
#include <limits.h> // PATH_MAX
#endif

#include <drm-uapi/i915_drm.h>

#include "common/gen_gem.h"

#include "dev/gen_debug.h"
#include "dev/gen_device_info.h"

#include "perf/gen_perf.h"
#include "perf/gen_perf_regs.h"
#include "perf/gen_perf_mdapi.h"
#include "perf/gen_perf_metrics.h"

#include "util/bitscan.h"
#include "util/mesa-sha1.h"
#include "util/u_math.h"

#define FILE_DEBUG_FLAG DEBUG_PERFMON

#define OA_REPORT_INVALID_CTX_ID (0xffffffff)

static inline uint64_t to_user_pointer(void *ptr)
{
   return (uintptr_t) ptr;
}

static bool
is_dir_or_link(const struct dirent *entry, const char *parent_dir)
{
#ifdef HAVE_DIRENT_D_TYPE
   return entry->d_type == DT_DIR || entry->d_type == DT_LNK;
#else
   struct stat st;
   char path[PATH_MAX + 1];
   snprintf(path, sizeof(path), "%s/%s", parent_dir, entry->d_name);
   /* If lstat() fails, treat the entry as neither a directory nor a link
    * rather than reading an uninitialized struct stat.
    */
   if (lstat(path, &st) != 0)
      return false;
   return S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode);
#endif
}

static bool
get_sysfs_dev_dir(struct gen_perf_config *perf, int fd)
{
   struct stat sb;
   int min, maj;
   DIR *drmdir;
   struct dirent *drm_entry;
   int len;

   perf->sysfs_dev_dir[0] = '\0';

   if (fstat(fd, &sb)) {
      DBG("Failed to stat DRM fd\n");
      return false;
   }

   maj = major(sb.st_rdev);
   min = minor(sb.st_rdev);

   if (!S_ISCHR(sb.st_mode)) {
      DBG("DRM fd is not a character device as expected\n");
      return false;
   }

   len = snprintf(perf->sysfs_dev_dir,
                  sizeof(perf->sysfs_dev_dir),
                  "/sys/dev/char/%d:%d/device/drm", maj, min);
   if (len < 0 || len >= sizeof(perf->sysfs_dev_dir)) {
      DBG("Failed to concatenate sysfs path to drm device\n");
      return false;
   }

   drmdir = opendir(perf->sysfs_dev_dir);
   if (!drmdir) {
      DBG("Failed to open %s: %m\n", perf->sysfs_dev_dir);
      return false;
   }

   while ((drm_entry = readdir(drmdir))) {
      if (is_dir_or_link(drm_entry, perf->sysfs_dev_dir) &&
          strncmp(drm_entry->d_name, "card", 4) == 0)
      {
         len = snprintf(perf->sysfs_dev_dir,
                        sizeof(perf->sysfs_dev_dir),
                        "/sys/dev/char/%d:%d/device/drm/%s",
                        maj, min, drm_entry->d_name);
         closedir(drmdir);
         if (len < 0 || len >= sizeof(perf->sysfs_dev_dir))
            return false;
         else
            return true;
      }
   }

   closedir(drmdir);

   DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
       maj, min);

   return false;
}
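
/* For reference, on a typical Linux system the loop above resolves to a
 * path like the following (illustrative minor number; DRM character
 * devices use major 226):
 *
 *    /sys/dev/char/226:0/device/drm/card0
 *
 * The metrics/ directory and the gt_min_freq_mhz/gt_max_freq_mhz files
 * read below all live under this directory.
 */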

static bool
read_file_uint64(const char *file, uint64_t *val)
{
   char buf[32];
   int fd, n;

   fd = open(file, O_RDONLY);
   if (fd < 0)
      return false;
   while ((n = read(fd, buf, sizeof(buf) - 1)) < 0 &&
          errno == EINTR);
   close(fd);
   if (n < 0)
      return false;

   buf[n] = '\0';
   *val = strtoull(buf, NULL, 0);

   return true;
}
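
/* Minimal usage sketch (illustrative only; the path is one of the files
 * actually consumed below, and error handling is reduced to the boolean
 * return):
 *
 *    uint64_t paranoid;
 *    if (read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid",
 *                         &paranoid))
 *       printf("perf_stream_paranoid = %" PRIu64 "\n", paranoid);
 */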

static bool
read_sysfs_drm_device_file_uint64(struct gen_perf_config *perf,
                                  const char *file,
                                  uint64_t *value)
{
   char buf[512];
   int len;

   len = snprintf(buf, sizeof(buf), "%s/%s", perf->sysfs_dev_dir, file);
   if (len < 0 || len >= sizeof(buf)) {
      DBG("Failed to concatenate sys filename to read u64 from\n");
      return false;
   }

   return read_file_uint64(buf, value);
}

static inline struct gen_perf_query_info *
append_query_info(struct gen_perf_config *perf, int max_counters)
{
   struct gen_perf_query_info *query;

   perf->queries = reralloc(perf, perf->queries,
                            struct gen_perf_query_info,
                            ++perf->n_queries);
   query = &perf->queries[perf->n_queries - 1];
   memset(query, 0, sizeof(*query));

   if (max_counters > 0) {
      query->max_counters = max_counters;
      query->counters =
         rzalloc_array(perf, struct gen_perf_query_counter, max_counters);
   }

   return query;
}

static void
register_oa_config(struct gen_perf_config *perf,
                   const struct gen_perf_query_info *query,
                   uint64_t config_id)
{
   struct gen_perf_query_info *registered_query = append_query_info(perf, 0);

   *registered_query = *query;
   registered_query->oa_metrics_set_id = config_id;
   DBG("metric set registered: id = %" PRIu64", guid = %s\n",
       registered_query->oa_metrics_set_id, query->guid);
}

static void
enumerate_sysfs_metrics(struct gen_perf_config *perf)
{
   DIR *metricsdir = NULL;
   struct dirent *metric_entry;
   char buf[256];
   int len;

   len = snprintf(buf, sizeof(buf), "%s/metrics", perf->sysfs_dev_dir);
   if (len < 0 || len >= sizeof(buf)) {
      DBG("Failed to concatenate path to sysfs metrics/ directory\n");
      return;
   }

   metricsdir = opendir(buf);
   if (!metricsdir) {
      DBG("Failed to open %s: %m\n", buf);
      return;
   }

   while ((metric_entry = readdir(metricsdir))) {
      struct hash_entry *entry;
      if (!is_dir_or_link(metric_entry, buf) ||
          metric_entry->d_name[0] == '.')
         continue;

      DBG("metric set: %s\n", metric_entry->d_name);
      entry = _mesa_hash_table_search(perf->oa_metrics_table,
                                      metric_entry->d_name);
      if (entry) {
         uint64_t id;
         if (!gen_perf_load_metric_id(perf, metric_entry->d_name, &id)) {
            DBG("Failed to read metric set id from %s: %m\n", buf);
            continue;
         }

         register_oa_config(perf, (const struct gen_perf_query_info *)entry->data, id);
      } else
         DBG("metric set not known by mesa (skipping)\n");
   }

   closedir(metricsdir);
}

static bool
kernel_has_dynamic_config_support(struct gen_perf_config *perf, int fd)
{
   uint64_t invalid_config_id = UINT64_MAX;

   return gen_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG,
                    &invalid_config_id) < 0 && errno == ENOENT;
}
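
/* Note on the probe above: it never removes anything. An ID of UINT64_MAX
 * cannot name an existing config, so a kernel that implements
 * DRM_IOCTL_I915_PERF_REMOVE_CONFIG fails the ioctl with ENOENT, while a
 * kernel without dynamic config support rejects it with a different errno,
 * which is enough to tell the two cases apart.
 */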

static int
i915_query_items(struct gen_perf_config *perf, int fd,
                 struct drm_i915_query_item *items, uint32_t n_items)
{
   struct drm_i915_query q = {
      .num_items = n_items,
      .items_ptr = to_user_pointer(items),
   };
   return gen_ioctl(fd, DRM_IOCTL_I915_QUERY, &q);
}

static bool
i915_query_perf_config_supported(struct gen_perf_config *perf, int fd)
{
   struct drm_i915_query_item item = {
      .query_id = DRM_I915_QUERY_PERF_CONFIG,
      .flags = DRM_I915_QUERY_PERF_CONFIG_LIST,
   };

   return i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0;
}

static bool
i915_query_perf_config_data(struct gen_perf_config *perf,
                            int fd, const char *guid,
                            struct drm_i915_perf_oa_config *config)
{
   struct {
      struct drm_i915_query_perf_config query;
      struct drm_i915_perf_oa_config config;
   } item_data;
   struct drm_i915_query_item item = {
      .query_id = DRM_I915_QUERY_PERF_CONFIG,
      .flags = DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID,
      .data_ptr = to_user_pointer(&item_data),
      .length = sizeof(item_data),
   };

   memset(&item_data, 0, sizeof(item_data));
   memcpy(item_data.query.uuid, guid, sizeof(item_data.query.uuid));
   memcpy(&item_data.config, config, sizeof(item_data.config));

   if (!(i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0))
      return false;

   memcpy(config, &item_data.config, sizeof(item_data.config));

   return true;
}

bool
gen_perf_load_metric_id(struct gen_perf_config *perf_cfg,
                        const char *guid,
                        uint64_t *metric_id)
{
   char config_path[280];

   snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id",
            perf_cfg->sysfs_dev_dir, guid);

   /* Don't recreate already loaded configs. */
   return read_file_uint64(config_path, metric_id);
}

static uint64_t
i915_add_config(struct gen_perf_config *perf, int fd,
                const struct gen_perf_registers *config,
                const char *guid)
{
   struct drm_i915_perf_oa_config i915_config = { 0, };

   memcpy(i915_config.uuid, guid, sizeof(i915_config.uuid));

   i915_config.n_mux_regs = config->n_mux_regs;
   i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs);

   i915_config.n_boolean_regs = config->n_b_counter_regs;
   i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs);

   i915_config.n_flex_regs = config->n_flex_regs;
   i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs);

   int ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &i915_config);
   return ret > 0 ? ret : 0;
}

static void
init_oa_configs(struct gen_perf_config *perf, int fd)
{
   hash_table_foreach(perf->oa_metrics_table, entry) {
      const struct gen_perf_query_info *query = entry->data;
      uint64_t config_id;

      if (gen_perf_load_metric_id(perf, query->guid, &config_id)) {
         DBG("metric set: %s (already loaded)\n", query->guid);
         register_oa_config(perf, query, config_id);
         continue;
      }

      /* i915_add_config() normalizes ioctl failure to 0 (on success the
       * ioctl returns the positive id of the new config), so failure is
       * ret == 0, never a negative value.
       */
      uint64_t ret = i915_add_config(perf, fd, &query->config, query->guid);
      if (ret == 0) {
         DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
             query->name, query->guid, strerror(errno));
         continue;
      }

      register_oa_config(perf, query, ret);
      DBG("metric set: %s (added)\n", query->guid);
   }
}

static void
compute_topology_builtins(struct gen_perf_config *perf,
                          const struct gen_device_info *devinfo)
{
   perf->sys_vars.slice_mask = devinfo->slice_masks;
   perf->sys_vars.n_eu_slices = devinfo->num_slices;

   /* Iterate over the whole subslice_masks array, not just its first
    * element (sizeof(devinfo->subslice_masks[i]) would always be 1).
    */
   for (int i = 0; i < sizeof(devinfo->subslice_masks); i++) {
      perf->sys_vars.n_eu_sub_slices +=
         __builtin_popcount(devinfo->subslice_masks[i]);
   }

   for (int i = 0; i < sizeof(devinfo->eu_masks); i++)
      perf->sys_vars.n_eus += __builtin_popcount(devinfo->eu_masks[i]);

   perf->sys_vars.eu_threads_count = devinfo->num_thread_per_eu;

   /* The subslice mask builtin contains bits for all slices. Prior to Gen11
    * it had groups of 3 bits for each slice, on Gen11 it's 8 bits for each
    * slice.
    *
    * Ideally equations would be updated to have a slice/subslice query
    * function/operator.
    */
   perf->sys_vars.subslice_mask = 0;

   int bits_per_subslice = devinfo->gen == 11 ? 8 : 3;

   for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) {
      for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) {
         if (gen_device_info_subslice_available(devinfo, s, ss))
            perf->sys_vars.subslice_mask |= 1ULL << (s * bits_per_subslice + ss);
      }
   }
}
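
/* Worked example of the packing above, assuming a hypothetical pre-Gen11
 * part (3 bits per slice) with two slices, each exposing subslices 0 and 1:
 *
 *    slice 0: bits 0-1 -> 0x03
 *    slice 1: bits 3-4 -> 0x18
 *
 *    subslice_mask = 0x1b
 */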

static bool
init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *devinfo)
{
   uint64_t min_freq_mhz = 0, max_freq_mhz = 0;

   if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz))
      return false;

   if (!read_sysfs_drm_device_file_uint64(perf, "gt_max_freq_mhz", &max_freq_mhz))
      return false;

   memset(&perf->sys_vars, 0, sizeof(perf->sys_vars));
   perf->sys_vars.gt_min_freq = min_freq_mhz * 1000000;
   perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000;
   perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
   perf->sys_vars.revision = devinfo->revision;
   compute_topology_builtins(perf, devinfo);

   return true;
}

typedef void (*perf_register_oa_queries_t)(struct gen_perf_config *);

static perf_register_oa_queries_t
get_register_queries_function(const struct gen_device_info *devinfo)
{
   if (devinfo->is_haswell)
      return gen_oa_register_queries_hsw;
   if (devinfo->is_cherryview)
      return gen_oa_register_queries_chv;
   if (devinfo->is_broadwell)
      return gen_oa_register_queries_bdw;
   if (devinfo->is_broxton)
      return gen_oa_register_queries_bxt;
   if (devinfo->is_skylake) {
      if (devinfo->gt == 2)
         return gen_oa_register_queries_sklgt2;
      if (devinfo->gt == 3)
         return gen_oa_register_queries_sklgt3;
      if (devinfo->gt == 4)
         return gen_oa_register_queries_sklgt4;
   }
   if (devinfo->is_kabylake) {
      if (devinfo->gt == 2)
         return gen_oa_register_queries_kblgt2;
      if (devinfo->gt == 3)
         return gen_oa_register_queries_kblgt3;
   }
   if (devinfo->is_geminilake)
      return gen_oa_register_queries_glk;
   if (devinfo->is_coffeelake) {
      if (devinfo->gt == 2)
         return gen_oa_register_queries_cflgt2;
      if (devinfo->gt == 3)
         return gen_oa_register_queries_cflgt3;
   }
   if (devinfo->is_cannonlake)
      return gen_oa_register_queries_cnl;
   if (devinfo->gen == 11) {
      if (devinfo->is_elkhartlake)
         return gen_oa_register_queries_lkf;
      return gen_oa_register_queries_icl;
   }
   if (devinfo->gen == 12)
      return gen_oa_register_queries_tgl;

   return NULL;
}

static inline void
add_stat_reg(struct gen_perf_query_info *query, uint32_t reg,
             uint32_t numerator, uint32_t denominator,
             const char *name, const char *description)
{
   struct gen_perf_query_counter *counter;

   assert(query->n_counters < query->max_counters);

   counter = &query->counters[query->n_counters];
   counter->name = name;
   counter->desc = description;
   counter->type = GEN_PERF_COUNTER_TYPE_RAW;
   counter->data_type = GEN_PERF_COUNTER_DATA_TYPE_UINT64;
   counter->offset = sizeof(uint64_t) * query->n_counters;
   counter->pipeline_stat.reg = reg;
   counter->pipeline_stat.numerator = numerator;
   counter->pipeline_stat.denominator = denominator;

   query->n_counters++;
}

static inline void
add_basic_stat_reg(struct gen_perf_query_info *query,
                   uint32_t reg, const char *name)
{
   add_stat_reg(query, reg, 1, 1, name, name);
}

static void
load_pipeline_statistic_metrics(struct gen_perf_config *perf_cfg,
                                const struct gen_device_info *devinfo)
{
   struct gen_perf_query_info *query =
      append_query_info(perf_cfg, MAX_STAT_COUNTERS);

   query->kind = GEN_PERF_QUERY_TYPE_PIPELINE;
   query->name = "Pipeline Statistics Registers";

   add_basic_stat_reg(query, IA_VERTICES_COUNT,
                      "N vertices submitted");
   add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
                      "N primitives submitted");
   add_basic_stat_reg(query, VS_INVOCATION_COUNT,
                      "N vertex shader invocations");

   if (devinfo->gen == 6) {
      add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
                   "SO_PRIM_STORAGE_NEEDED",
                   "N geometry shader stream-out primitives (total)");
      add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1,
                   "SO_NUM_PRIMS_WRITTEN",
                   "N geometry shader stream-out primitives (written)");
   } else {
      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
                   "SO_PRIM_STORAGE_NEEDED (Stream 0)",
                   "N stream-out (stream 0) primitives (total)");
      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
                   "SO_PRIM_STORAGE_NEEDED (Stream 1)",
                   "N stream-out (stream 1) primitives (total)");
      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
                   "SO_PRIM_STORAGE_NEEDED (Stream 2)",
                   "N stream-out (stream 2) primitives (total)");
      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
                   "SO_PRIM_STORAGE_NEEDED (Stream 3)",
                   "N stream-out (stream 3) primitives (total)");
      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
                   "SO_NUM_PRIMS_WRITTEN (Stream 0)",
                   "N stream-out (stream 0) primitives (written)");
      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
                   "SO_NUM_PRIMS_WRITTEN (Stream 1)",
                   "N stream-out (stream 1) primitives (written)");
      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
                   "SO_NUM_PRIMS_WRITTEN (Stream 2)",
                   "N stream-out (stream 2) primitives (written)");
      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
                   "SO_NUM_PRIMS_WRITTEN (Stream 3)",
                   "N stream-out (stream 3) primitives (written)");
   }

   add_basic_stat_reg(query, HS_INVOCATION_COUNT,
                      "N TCS shader invocations");
   add_basic_stat_reg(query, DS_INVOCATION_COUNT,
                      "N TES shader invocations");

   add_basic_stat_reg(query, GS_INVOCATION_COUNT,
                      "N geometry shader invocations");
   add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
                      "N geometry shader primitives emitted");

   add_basic_stat_reg(query, CL_INVOCATION_COUNT,
                      "N primitives entering clipping");
   add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
                      "N primitives leaving clipping");

   if (devinfo->is_haswell || devinfo->gen == 8) {
      add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
                   "N fragment shader invocations",
                   "N fragment shader invocations");
   } else {
      add_basic_stat_reg(query, PS_INVOCATION_COUNT,
                         "N fragment shader invocations");
   }

   add_basic_stat_reg(query, PS_DEPTH_COUNT,
                      "N z-pass fragments");

   if (devinfo->gen >= 7) {
      add_basic_stat_reg(query, CS_INVOCATION_COUNT,
                         "N compute shader invocations");
   }

   query->data_size = sizeof(uint64_t) * query->n_counters;
}

static bool
load_oa_metrics(struct gen_perf_config *perf, int fd,
                const struct gen_device_info *devinfo)
{
   perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo);
   bool i915_perf_oa_available = false;
   struct stat sb;

   perf->i915_query_supported = i915_query_perf_config_supported(perf, fd);

   /* The existence of this sysctl parameter implies the kernel supports
    * the i915 perf interface.
    */
   if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {

      /* If _paranoid == 1 then on Gen8+ we won't be able to access OA
       * metrics unless running as root.
       */
      if (devinfo->is_haswell)
         i915_perf_oa_available = true;
      else {
         uint64_t paranoid = 1;

         read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", &paranoid);

         if (paranoid == 0 || geteuid() == 0)
            i915_perf_oa_available = true;
      }
   }

   if (!i915_perf_oa_available ||
       !oa_register ||
       !get_sysfs_dev_dir(perf, fd) ||
       !init_oa_sys_vars(perf, devinfo))
      return false;

   perf->oa_metrics_table =
      _mesa_hash_table_create(perf, _mesa_hash_string,
                              _mesa_key_string_equal);

   /* Index all the metric sets mesa knows about before looking to see what
    * the kernel is advertising.
    */
   oa_register(perf);

   if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) &&
       kernel_has_dynamic_config_support(perf, fd))
      init_oa_configs(perf, fd);
   else
      enumerate_sysfs_metrics(perf);

   return true;
}

struct gen_perf_registers *
gen_perf_load_configuration(struct gen_perf_config *perf_cfg, int fd, const char *guid)
{
   if (!perf_cfg->i915_query_supported)
      return NULL;

   struct drm_i915_perf_oa_config i915_config = { 0, };
   if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config))
      return NULL;

   struct gen_perf_registers *config = rzalloc(NULL, struct gen_perf_registers);
   config->n_flex_regs = i915_config.n_flex_regs;
   config->flex_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_flex_regs);
   config->n_mux_regs = i915_config.n_mux_regs;
   config->mux_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_mux_regs);
   config->n_b_counter_regs = i915_config.n_boolean_regs;
   config->b_counter_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_b_counter_regs);

   /*
    * struct gen_perf_query_register_prog maps exactly to the tuple of
    * (register offset, register value) returned by the i915.
    */
   i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs);
   i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs);
   i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs);
   if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) {
      ralloc_free(config);
      return NULL;
   }

   return config;
}
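
/* Sketch of the two-pass pattern used above (illustrative only): the first
 * i915_query_perf_config_data() call leaves the register pointers at zero
 * and only retrieves the n_*_regs counts; the second call, with the
 * pointers set, fills the freshly allocated arrays.
 *
 *    struct gen_perf_registers *regs =
 *       gen_perf_load_configuration(perf, drm_fd, guid);
 *    if (regs) {
 *       for (uint32_t i = 0; i < regs->n_mux_regs; i++)
 *          printf("0x%x = 0x%x\n", regs->mux_regs[i].reg,
 *                 regs->mux_regs[i].value);
 *       ralloc_free(regs);
 *    }
 */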

uint64_t
gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd,
                             const struct gen_perf_registers *config,
                             const char *guid)
{
   if (guid)
      return i915_add_config(perf_cfg, fd, config, guid);

   struct mesa_sha1 sha1_ctx;
   _mesa_sha1_init(&sha1_ctx);

   if (config->flex_regs) {
      _mesa_sha1_update(&sha1_ctx, config->flex_regs,
                        sizeof(config->flex_regs[0]) *
                        config->n_flex_regs);
   }
   if (config->mux_regs) {
      _mesa_sha1_update(&sha1_ctx, config->mux_regs,
                        sizeof(config->mux_regs[0]) *
                        config->n_mux_regs);
   }
   if (config->b_counter_regs) {
      _mesa_sha1_update(&sha1_ctx, config->b_counter_regs,
                        sizeof(config->b_counter_regs[0]) *
                        config->n_b_counter_regs);
   }

   uint8_t hash[20];
   _mesa_sha1_final(&sha1_ctx, hash);

   char formatted_hash[41];
   _mesa_sha1_format(formatted_hash, hash);

   char generated_guid[37];
   snprintf(generated_guid, sizeof(generated_guid),
            "%.8s-%.4s-%.4s-%.4s-%.12s",
            &formatted_hash[0], &formatted_hash[8],
            &formatted_hash[8 + 4], &formatted_hash[8 + 4 + 4],
            &formatted_hash[8 + 4 + 4 + 4]);

   /* Check if already present. */
   uint64_t id;
   if (gen_perf_load_metric_id(perf_cfg, generated_guid, &id))
      return id;

   return i915_add_config(perf_cfg, fd, config, generated_guid);
}
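
/* GUID derivation example: the SHA-1 of the register programming is
 * formatted as 40 hex characters, of which the first 32 are split
 * 8-4-4-4-12. Taking the well-known SHA-1 of empty input,
 *
 *    da39a3ee5e6b4b0d3255bfef95601890afd80709
 *
 * the generated GUID would be
 *
 *    da39a3ee-5e6b-4b0d-3255-bfef95601890
 *
 * so identical register sets always map to the same GUID and are only
 * added to the kernel once (see the gen_perf_load_metric_id() check
 * above).
 */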

/* Accumulate 32-bit OA counters */
static inline void
accumulate_uint32(const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   *accumulator += (uint32_t)(*report1 - *report0);
}

/* Accumulate 40-bit OA counters */
static inline void
accumulate_uint40(int a_index,
                  const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
   const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
   uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
   uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
   uint64_t value0 = report0[a_index + 4] | high0;
   uint64_t value1 = report1[a_index + 4] | high1;
   uint64_t delta;

   if (value0 > value1)
      delta = (1ULL << 40) + value1 - value0;
   else
      delta = value1 - value0;

   *accumulator += delta;
}
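
/* Wraparound example for the 40-bit path: with value0 = 0xffffffff00
 * sampled at the start of a query and value1 = 0x0000000100 at the end,
 * the counter wrapped, so
 *
 *    delta = (1ULL << 40) + 0x0000000100 - 0xffffffff00 = 0x200
 *
 * i.e. 512 increments instead of a huge bogus delta.
 */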

static void
gen8_read_report_clock_ratios(const uint32_t *report,
                              uint64_t *slice_freq_hz,
                              uint64_t *unslice_freq_hz)
{
   /* The RPT_ID field (dword 0) of the OA reports contains a snapshot of
    * the bits coming from the RP_FREQ_NORMAL register and is divided this
    * way:
    *
    * RPT_ID[31:25]: RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency)
    * RPT_ID[10:9]:  RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency)
    * RPT_ID[8:0]:   RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency)
    *
    * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    *
    * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    */

   uint32_t unslice_freq = report[0] & 0x1ff;
   uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
   uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
   uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);

   *slice_freq_hz = slice_freq * 16666667ULL;
   *unslice_freq_hz = unslice_freq * 16666667ULL;
}
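
/* Decoding example with a hypothetical report[0] = 0x3c00001e:
 *
 *    unslice_freq = 0x1e = 30  ->  30 * 16666667 ~= 500 MHz
 *    slice_freq   = 0x1e = 30  ->  30 * 16666667 ~= 500 MHz
 *
 * Each ratio step is ~16.67 MHz because the requests are expressed in
 * 33.33 MHz 2xclk units, as noted above.
 */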

void
gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
                                       const struct gen_device_info *devinfo,
                                       const uint32_t *start,
                                       const uint32_t *end)
{
   /* Slice/Unslice frequency is only available in the OA reports when the
    * "Disable OA reports due to clock ratio change" field in
    * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this
    * global register (see drivers/gpu/drm/i915/i915_perf.c)
    *
    * Documentation says this should be available on Gen9+ but experimentation
    * shows that Gen8 reports similar values, so we enable it there too.
    */
   if (devinfo->gen < 8)
      return;

   gen8_read_report_clock_ratios(start,
                                 &result->slice_frequency[0],
                                 &result->unslice_frequency[0]);
   gen8_read_report_clock_ratios(end,
                                 &result->slice_frequency[1],
                                 &result->unslice_frequency[1]);
}

void
gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
                                 const struct gen_perf_query_info *query,
                                 const uint32_t *start,
                                 const uint32_t *end)
{
   int i, idx = 0;

   if (result->hw_id == OA_REPORT_INVALID_CTX_ID &&
       start[2] != OA_REPORT_INVALID_CTX_ID)
      result->hw_id = start[2];
   if (result->reports_accumulated == 0)
      result->begin_timestamp = start[1];
   result->reports_accumulated++;

   switch (query->oa_format) {
   case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
      accumulate_uint32(start + 1, end + 1, result->accumulator + idx++); /* timestamp */
      accumulate_uint32(start + 3, end + 3, result->accumulator + idx++); /* clock */

      /* 32x 40bit A counters... */
      for (i = 0; i < 32; i++)
         accumulate_uint40(i, start, end, result->accumulator + idx++);

      /* 4x 32bit A counters... */
      for (i = 0; i < 4; i++)
         accumulate_uint32(start + 36 + i, end + 36 + i, result->accumulator + idx++);

      /* 8x 32bit B counters + 8x 32bit C counters... */
      for (i = 0; i < 16; i++)
         accumulate_uint32(start + 48 + i, end + 48 + i, result->accumulator + idx++);
      break;

   case I915_OA_FORMAT_A45_B8_C8:
      accumulate_uint32(start + 1, end + 1, result->accumulator); /* timestamp */

      for (i = 0; i < 61; i++)
         accumulate_uint32(start + 3 + i, end + 3 + i, result->accumulator + 1 + i);
      break;

   default:
      unreachable("Can't accumulate OA counters in unknown format");
   }
}

void
gen_perf_query_result_clear(struct gen_perf_query_result *result)
{
   memset(result, 0, sizeof(*result));
   result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */
}

static void
register_mdapi_statistic_query(struct gen_perf_config *perf_cfg,
                               const struct gen_device_info *devinfo)
{
   if (!(devinfo->gen >= 7 && devinfo->gen <= 11))
      return;

   struct gen_perf_query_info *query =
      append_query_info(perf_cfg, MAX_STAT_COUNTERS);

   query->kind = GEN_PERF_QUERY_TYPE_PIPELINE;
   query->name = "Intel_Raw_Pipeline_Statistics_Query";

   /* The order has to match mdapi_pipeline_metrics. */
   add_basic_stat_reg(query, IA_VERTICES_COUNT,
                      "N vertices submitted");
   add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
                      "N primitives submitted");
   add_basic_stat_reg(query, VS_INVOCATION_COUNT,
                      "N vertex shader invocations");
   add_basic_stat_reg(query, GS_INVOCATION_COUNT,
                      "N geometry shader invocations");
   add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
                      "N geometry shader primitives emitted");
   add_basic_stat_reg(query, CL_INVOCATION_COUNT,
                      "N primitives entering clipping");
   add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
                      "N primitives leaving clipping");
   if (devinfo->is_haswell || devinfo->gen == 8) {
      add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
                   "N fragment shader invocations",
                   "N fragment shader invocations");
   } else {
      add_basic_stat_reg(query, PS_INVOCATION_COUNT,
                         "N fragment shader invocations");
   }
   add_basic_stat_reg(query, HS_INVOCATION_COUNT,
                      "N TCS shader invocations");
   add_basic_stat_reg(query, DS_INVOCATION_COUNT,
                      "N TES shader invocations");
   if (devinfo->gen >= 7) {
      add_basic_stat_reg(query, CS_INVOCATION_COUNT,
                         "N compute shader invocations");
   }

   if (devinfo->gen >= 10) {
      /* Reuse existing CS invocation register until we can expose this new
       * one.
       */
      add_basic_stat_reg(query, CS_INVOCATION_COUNT,
                         "Reserved1");
   }

   query->data_size = sizeof(uint64_t) * query->n_counters;
}

static void
fill_mdapi_perf_query_counter(struct gen_perf_query_info *query,
                              const char *name,
                              uint32_t data_offset,
                              uint32_t data_size,
                              enum gen_perf_counter_data_type data_type)
{
   struct gen_perf_query_counter *counter = &query->counters[query->n_counters];

   /* The counter is written at index n_counters, so strictly fewer than
    * max_counters entries must have been added so far.
    */
   assert(query->n_counters < query->max_counters);

   counter->name = name;
   counter->desc = "Raw counter value";
   counter->type = GEN_PERF_COUNTER_TYPE_RAW;
   counter->data_type = data_type;
   counter->offset = data_offset;

   query->n_counters++;

   assert(counter->offset + gen_perf_query_counter_get_size(counter) <= query->data_size);
}

#define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \
   fill_mdapi_perf_query_counter(query, #field_name, \
                                 (uint8_t *) &struct_name.field_name - \
                                 (uint8_t *) &struct_name, \
                                 sizeof(struct_name.field_name), \
                                 GEN_PERF_COUNTER_DATA_TYPE_##type_name)
#define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \
   fill_mdapi_perf_query_counter(query, \
                                 ralloc_asprintf(ctx, "%s%i", #field_name, idx), \
                                 (uint8_t *) &struct_name.field_name[idx] - \
                                 (uint8_t *) &struct_name, \
                                 sizeof(struct_name.field_name[0]), \
                                 GEN_PERF_COUNTER_DATA_TYPE_##type_name)
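
/* For reference, MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime,
 * UINT64) expands (for a hypothetical local struct gen7_mdapi_metrics
 * metric_data) to:
 *
 *    fill_mdapi_perf_query_counter(query, "TotalTime",
 *                                  (uint8_t *) &metric_data.TotalTime -
 *                                  (uint8_t *) &metric_data,
 *                                  sizeof(metric_data.TotalTime),
 *                                  GEN_PERF_COUNTER_DATA_TYPE_UINT64);
 *
 * i.e. each counter's offset is the field's offset inside the MDAPI
 * structure, which lets MDAPI consumers read results straight out of the
 * query buffer.
 */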

static void
register_mdapi_oa_query(const struct gen_device_info *devinfo,
                        struct gen_perf_config *perf)
{
   struct gen_perf_query_info *query = NULL;

   /* MDAPI requires different structures for pretty much every generation
    * (right now we have definitions for gen 7 to 11).
    */
   if (!(devinfo->gen >= 7 && devinfo->gen <= 11))
      return;

   switch (devinfo->gen) {
   case 7: {
      query = append_query_info(perf, 1 + 45 + 16 + 7);
      query->oa_format = I915_OA_FORMAT_A45_B8_C8;

      struct gen7_mdapi_metrics metric_data;
      query->data_size = sizeof(metric_data);

      MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
      for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, ACounters, i, UINT64);
      }
      for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, NOACounters, i, UINT64);
      }
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
      break;
   }
   case 8: {
      query = append_query_info(perf, 2 + 36 + 16 + 16);
      query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;

      struct gen8_mdapi_metrics metric_data;
      query->data_size = sizeof(metric_data);

      MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
      for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, OaCntr, i, UINT64);
      }
      for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, NoaCntr, i, UINT64);
      }
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
      break;
   }
   case 9:
   case 10:
   case 11: {
      query = append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2);
      query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;

      struct gen9_mdapi_metrics metric_data;
      query->data_size = sizeof(metric_data);

      MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
      for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, OaCntr, i, UINT64);
      }
      for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, NoaCntr, i, UINT64);
      }
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
      for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, UserCntr, i, UINT64);
      }
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32);
      break;
   }
   default:
      unreachable("Unsupported gen");
      break;
   }

   query->kind = GEN_PERF_QUERY_TYPE_RAW;
   query->name = "Intel_Raw_Hardware_Counters_Set_0_Query";
   query->guid = GEN_PERF_QUERY_GUID_MDAPI;

   {
      /* Accumulation buffer offsets copied from an actual query... */
      const struct gen_perf_query_info *copy_query =
         &perf->queries[0];

      query->gpu_time_offset = copy_query->gpu_time_offset;
      query->gpu_clock_offset = copy_query->gpu_clock_offset;
      query->a_offset = copy_query->a_offset;
      query->b_offset = copy_query->b_offset;
      query->c_offset = copy_query->c_offset;
   }
}

void
gen_perf_init_metrics(struct gen_perf_config *perf_cfg,
                      const struct gen_device_info *devinfo,
                      int drm_fd)
{
   load_pipeline_statistic_metrics(perf_cfg, devinfo);
   register_mdapi_statistic_query(perf_cfg, devinfo);
   if (load_oa_metrics(perf_cfg, drm_fd, devinfo))
      register_mdapi_oa_query(devinfo, perf_cfg);
}
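
/* Typical driver-side initialization (sketch; gen_perf_new() is assumed to
 * be the allocator declared in gen_perf.h, and devinfo/drm_fd come from
 * the caller):
 *
 *    struct gen_perf_config *perf = gen_perf_new(mem_ctx);
 *    gen_perf_init_metrics(perf, devinfo, drm_fd);
 *
 * After this, perf->queries holds the pipeline statistics query, the MDAPI
 * statistics query and, when the i915 perf interface is usable, one entry
 * per registered OA metric set plus the raw MDAPI OA query.
 */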