intel/perf: make perf context private
[mesa.git] / src / intel / perf / gen_perf.c
1 /*
2 * Copyright © 2018 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <dirent.h>
25
26 #include <sys/types.h>
27 #include <sys/stat.h>
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <errno.h>
31
32 #include <drm-uapi/i915_drm.h>
33
34 #include "common/gen_gem.h"
35 #include "gen_perf.h"
36 #include "perf/gen_perf_mdapi.h"
37 #include "perf/gen_perf_metrics.h"
38
39 #include "dev/gen_debug.h"
40 #include "dev/gen_device_info.h"
41 #include "util/bitscan.h"
42 #include "util/u_math.h"
43
44 #define FILE_DEBUG_FLAG DEBUG_PERFMON
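/* Layout of the OA query buffer object, as used by gen_perf_begin_query() /
 * gen_perf_end_query() below: the begin MI_REPORT_PERF_COUNT report is
 * written at offset 0, the end report at MI_RPC_BO_END_OFFSET_BYTES, and the
 * begin/end RPSTAT frequency register snapshots at
 * MI_FREQ_START/END_OFFSET_BYTES.
 */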
45 #define MI_RPC_BO_SIZE 4096
46 #define MI_FREQ_START_OFFSET_BYTES (3072)
47 #define MI_RPC_BO_END_OFFSET_BYTES (MI_RPC_BO_SIZE / 2)
48 #define MI_FREQ_END_OFFSET_BYTES (3076)
49
50 #define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low))
51
52 #define GEN7_RPSTAT1 0xA01C
53 #define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7
54 #define GEN7_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(13, 7)
55 #define GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT 0
56 #define GEN7_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(6, 0)
57
58 #define GEN9_RPSTAT0 0xA01C
59 #define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT 23
60 #define GEN9_RPSTAT0_CURR_GT_FREQ_MASK INTEL_MASK(31, 23)
61 #define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT 0
62 #define GEN9_RPSTAT0_PREV_GT_FREQ_MASK INTEL_MASK(8, 0)
63
64 #define GEN6_SO_PRIM_STORAGE_NEEDED 0x2280
65 #define GEN7_SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
66 #define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288
67 #define GEN7_SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
68
69 #define MAP_READ (1 << 0)
70 #define MAP_WRITE (1 << 1)
71
72 /**
73 * Periodic OA samples are read() into these buffer structures via the
74 * i915 perf kernel interface and appended to the
75 * perf_ctx->sample_buffers linked list. When we process the
76 * results of an OA metrics query we need to consider all the periodic
77 * samples between the Begin and End MI_REPORT_PERF_COUNT command
78 * markers.
79 *
80 * 'Periodic' is a simplification as there are other automatic reports
81 * written by the hardware also buffered here.
82 *
83 * Considering three queries, A, B and C:
84 *
85 * Time ---->
86 * ________________A_________________
87 * | |
88 * | ________B_________ _____C___________
89 * | | | | | |
90 *
91 * And an illustration of sample buffers read over this time frame:
92 * [HEAD ][ ][ ][ ][ ][ ][ ][ ][TAIL ]
93 *
94 * These nodes may hold samples for query A:
95 * [ ][ ][ A ][ A ][ A ][ A ][ A ][ ][ ]
96 *
97 * These nodes may hold samples for query B:
98 * [ ][ ][ B ][ B ][ B ][ ][ ][ ][ ]
99 *
100 * These nodes may hold samples for query C:
101 * [ ][ ][ ][ ][ ][ C ][ C ][ C ][ ]
102 *
103 * The illustration assumes we have an even distribution of periodic
104 * samples so all nodes have the same size plotted against time:
105 *
106 * Note, to simplify code, the list is never empty.
107 *
108 * With overlapping queries we can see that periodic OA reports may
109 * relate to multiple queries and care needs to be taken to keep
110 * track of sample buffers until there are no queries that might
111 * depend on their contents.
112 *
113 * We use a node ref counting system where a reference ensures that a
114 * node and all following nodes can't be freed/recycled until the
115 * reference drops to zero.
116 *
117 * E.g. with a ref of one here:
118 * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
119 *
120 * These nodes could be freed or recycled ("reaped"):
121 * [ 0 ][ 0 ]
122 *
123 * These must be preserved until the leading ref drops to zero:
124 * [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
125 *
126 * When a query starts we take a reference on the current tail of
127 * the list, knowing that no already-buffered samples can possibly
128 * relate to the newly-started query. A pointer to this node is
129 * also saved in the query object's ->oa.samples_head.
130 *
131 * E.g. starting query A while there are two nodes in .sample_buffers:
132 * ________________A________
133 * |
134 *
135 * [ 0 ][ 1 ]
136 * ^_______ Add a reference and store pointer to node in
137 * A->oa.samples_head
138 *
139 * Moving forward to when the B query starts with no new buffer nodes:
140 * (for reference, i915 perf reads() are only done when queries finish)
141 * ________________A_______
142 * | ________B___
143 * | |
144 *
145 * [ 0 ][ 2 ]
146 * ^_______ Add a reference and store pointer to
147 * node in B->oa.samples_head
148 *
149 * Once a query is finished, after an OA query has become 'Ready',
150 * once the End OA report has landed and after we have processed
151 * all the intermediate periodic samples then we drop the
152 * ->oa.samples_head reference we took at the start.
153 *
154 * So when the B query has finished we have:
155 * ________________A________
156 * | ______B___________
157 * | | |
158 * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ]
159 * ^_______ Drop B->oa.samples_head reference
160 *
161 * We still can't free these due to the A->oa.samples_head ref:
162 * [ 1 ][ 0 ][ 0 ][ 0 ]
163 *
164 * When the A query finishes: (note there's a new ref for C's samples_head)
165 * ________________A_________________
166 * | |
167 * | _____C_________
168 * | | |
169 * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ]
170 * ^_______ Drop A->oa.samples_head reference
171 *
172 * And we can now reap these nodes up to the C->oa.samples_head:
173 * [ X ][ X ][ X ][ X ]
174 * keeping -> [ 1 ][ 0 ][ 0 ]
175 *
176 * We reap old sample buffers each time we finish processing an OA
177 * query by iterating the sample_buffers list from the head until we
178 * find a referenced node and stop.
179 *
180 * Reaped buffers move to the perf_ctx->free_sample_buffers list and
181 * when we come to read() we first look to recycle a buffer from the
182 * free_sample_buffers list before allocating a new buffer.
183 */
184 struct oa_sample_buf {
185 struct exec_node link;
186 int refcount;
187 int len;
188 uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
189 uint32_t last_timestamp;
190 };
191
192 struct gen_perf_context {
193 struct gen_perf_config *perf;
194
195 void * ctx; /* driver context (eg, brw_context) */
196 void * bufmgr;
197 const struct gen_device_info *devinfo;
198
199 uint32_t hw_ctx;
200 int drm_fd;
201
202 /* The i915 perf stream we open to setup + enable the OA counters */
203 int oa_stream_fd;
204
205 /* An i915 perf stream fd gives exclusive access to the OA unit that will
206 * report counter snapshots for a specific counter set/profile in a
207 * specific layout/format so we can only start OA queries that are
208 * compatible with the currently open fd...
209 */
210 int current_oa_metrics_set_id;
211 int current_oa_format;
212
213 /* List of buffers containing OA reports */
214 struct exec_list sample_buffers;
215
216 /* Cached list of empty sample buffers */
217 struct exec_list free_sample_buffers;
218
219 int n_active_oa_queries;
220 int n_active_pipeline_stats_queries;
221
222 /* The number of queries depending on running OA counters which
223 * extends beyond gen_perf_end_query() since we need to wait until
224 * the last MI_RPC command has been parsed by the GPU.
225 *
226 * Accurate accounting is important here as emitting an
227 * MI_REPORT_PERF_COUNT command while the OA unit is disabled will
228 * effectively hang the GPU.
229 */
230 int n_oa_users;
231
232 /* To help catch a spurious problem with the hardware or perf
233 * forwarding samples, we emit each MI_REPORT_PERF_COUNT command
234 * with a unique ID that we can explicitly check for...
235 */
236 int next_query_start_report_id;
237
238 /**
239 * An array of queries whose results haven't yet been assembled
240 * based on the data in buffer objects.
241 *
242 * These may be active, or have already ended. However, the
243 * results have not been requested.
244 */
245 struct gen_perf_query_object **unaccumulated;
246 int unaccumulated_elements;
247 int unaccumulated_array_size;
248
249 /* The total number of query objects so we can relinquish
250 * our exclusive access to perf if the application deletes
251 * all of its objects. (NB: We only disable perf while
252 * there are no active queries)
253 */
254 int n_query_instances;
255 };
256
257 struct gen_perf_context *
258 gen_perf_new_context(void *parent)
259 {
260 struct gen_perf_context *ctx = rzalloc(parent, struct gen_perf_context);
261 if (!ctx)
262 fprintf(stderr, "%s: failed to alloc context\n", __func__);
263 return ctx;
264 }
265
266 struct gen_perf_config *
267 gen_perf_config(struct gen_perf_context *ctx)
268 {
269 return ctx->perf;
270 }
271
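/* Allocate a gen_perf_query_object for the query at @query_index in the
 * config's query list. The new object is accounted in n_query_instances.
 */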
272 struct gen_perf_query_object *
273 gen_perf_new_query(struct gen_perf_context *perf_ctx, unsigned query_index)
274 {
275 const struct gen_perf_query_info *query =
276 &perf_ctx->perf->queries[query_index];
277 struct gen_perf_query_object *obj =
278 calloc(1, sizeof(struct gen_perf_query_object));
279
280 if (!obj)
281 return NULL;
282
283 obj->queryinfo = query;
284
285 perf_ctx->n_query_instances++;
286 return obj;
287 }
288
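/* Return the number of currently active queries of the same kind as @query
 * (OA/RAW queries and pipeline statistics queries are never active at the
 * same time, as the assert below documents).
 */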
289 int
290 gen_perf_active_queries(struct gen_perf_context *perf_ctx,
291 const struct gen_perf_query_info *query)
292 {
293 assert(perf_ctx->n_active_oa_queries == 0 || perf_ctx->n_active_pipeline_stats_queries == 0);
294
295 switch (query->kind) {
296 case GEN_PERF_QUERY_TYPE_OA:
297 case GEN_PERF_QUERY_TYPE_RAW:
298 return perf_ctx->n_active_oa_queries;
299 break;
300
301 case GEN_PERF_QUERY_TYPE_PIPELINE:
302 return perf_ctx->n_active_pipeline_stats_queries;
303 break;
304
305 default:
306 unreachable("Unknown query type");
307 break;
308 }
309 }
310
311 static bool
312 get_sysfs_dev_dir(struct gen_perf_config *perf, int fd)
313 {
314 struct stat sb;
315 int min, maj;
316 DIR *drmdir;
317 struct dirent *drm_entry;
318 int len;
319
320 perf->sysfs_dev_dir[0] = '\0';
321
322 if (fstat(fd, &sb)) {
323 DBG("Failed to stat DRM fd\n");
324 return false;
325 }
326
327 maj = major(sb.st_rdev);
328 min = minor(sb.st_rdev);
329
330 if (!S_ISCHR(sb.st_mode)) {
331 DBG("DRM fd is not a character device as expected\n");
332 return false;
333 }
334
335 len = snprintf(perf->sysfs_dev_dir,
336 sizeof(perf->sysfs_dev_dir),
337 "/sys/dev/char/%d:%d/device/drm", maj, min);
338 if (len < 0 || len >= sizeof(perf->sysfs_dev_dir)) {
339 DBG("Failed to concatenate sysfs path to drm device\n");
340 return false;
341 }
342
343 drmdir = opendir(perf->sysfs_dev_dir);
344 if (!drmdir) {
345 DBG("Failed to open %s: %m\n", perf->sysfs_dev_dir);
346 return false;
347 }
348
349 while ((drm_entry = readdir(drmdir))) {
350 if ((drm_entry->d_type == DT_DIR ||
351 drm_entry->d_type == DT_LNK) &&
352 strncmp(drm_entry->d_name, "card", 4) == 0)
353 {
354 len = snprintf(perf->sysfs_dev_dir,
355 sizeof(perf->sysfs_dev_dir),
356 "/sys/dev/char/%d:%d/device/drm/%s",
357 maj, min, drm_entry->d_name);
358 closedir(drmdir);
359 if (len < 0 || len >= sizeof(perf->sysfs_dev_dir))
360 return false;
361 else
362 return true;
363 }
364 }
365
366 closedir(drmdir);
367
368 DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
369 maj, min);
370
371 return false;
372 }
373
374 static bool
375 read_file_uint64(const char *file, uint64_t *val)
376 {
377 char buf[32];
378 int fd, n;
379
380 fd = open(file, 0);
381 if (fd < 0)
382 return false;
383 while ((n = read(fd, buf, sizeof (buf) - 1)) < 0 &&
384 errno == EINTR);
385 close(fd);
386 if (n < 0)
387 return false;
388
389 buf[n] = '\0';
390 *val = strtoull(buf, NULL, 0);
391
392 return true;
393 }
394
395 static bool
396 read_sysfs_drm_device_file_uint64(struct gen_perf_config *perf,
397 const char *file,
398 uint64_t *value)
399 {
400 char buf[512];
401 int len;
402
403 len = snprintf(buf, sizeof(buf), "%s/%s", perf->sysfs_dev_dir, file);
404 if (len < 0 || len >= sizeof(buf)) {
405 DBG("Failed to concatenate sys filename to read u64 from\n");
406 return false;
407 }
408
409 return read_file_uint64(buf, value);
410 }
411
412 static inline struct gen_perf_query_info *
413 append_query_info(struct gen_perf_config *perf, int max_counters)
414 {
415 struct gen_perf_query_info *query;
416
417 perf->queries = reralloc(perf, perf->queries,
418 struct gen_perf_query_info,
419 ++perf->n_queries);
420 query = &perf->queries[perf->n_queries - 1];
421 memset(query, 0, sizeof(*query));
422
423 if (max_counters > 0) {
424 query->max_counters = max_counters;
425 query->counters =
426 rzalloc_array(perf, struct gen_perf_query_counter, max_counters);
427 }
428
429 return query;
430 }
431
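/* Append a copy of @query to the list of exposed queries, recording the
 * metric set ID the kernel assigned to its configuration.
 */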
432 static void
433 register_oa_config(struct gen_perf_config *perf,
434 const struct gen_perf_query_info *query,
435 uint64_t config_id)
436 {
437 struct gen_perf_query_info *registered_query = append_query_info(perf, 0);
438
439 *registered_query = *query;
440 registered_query->oa_metrics_set_id = config_id;
441 DBG("metric set registered: id = %" PRIu64", guid = %s\n",
442 registered_query->oa_metrics_set_id, query->guid);
443 }
444
445 static void
446 enumerate_sysfs_metrics(struct gen_perf_config *perf)
447 {
448 DIR *metricsdir = NULL;
449 struct dirent *metric_entry;
450 char buf[256];
451 int len;
452
453 len = snprintf(buf, sizeof(buf), "%s/metrics", perf->sysfs_dev_dir);
454 if (len < 0 || len >= sizeof(buf)) {
455 DBG("Failed to concatenate path to sysfs metrics/ directory\n");
456 return;
457 }
458
459 metricsdir = opendir(buf);
460 if (!metricsdir) {
461 DBG("Failed to open %s: %m\n", buf);
462 return;
463 }
464
465 while ((metric_entry = readdir(metricsdir))) {
466 struct hash_entry *entry;
467
468 if ((metric_entry->d_type != DT_DIR &&
469 metric_entry->d_type != DT_LNK) ||
470 metric_entry->d_name[0] == '.')
471 continue;
472
473 DBG("metric set: %s\n", metric_entry->d_name);
474 entry = _mesa_hash_table_search(perf->oa_metrics_table,
475 metric_entry->d_name);
476 if (entry) {
477 uint64_t id;
478
479 len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id",
480 perf->sysfs_dev_dir, metric_entry->d_name);
481 if (len < 0 || len >= sizeof(buf)) {
482 DBG("Failed to concatenate path to sysfs metric id file\n");
483 continue;
484 }
485
486 if (!read_file_uint64(buf, &id)) {
487 DBG("Failed to read metric set id from %s: %m", buf);
488 continue;
489 }
490
491 register_oa_config(perf, (const struct gen_perf_query_info *)entry->data, id);
492 } else
493 DBG("metric set not known by mesa (skipping)\n");
494 }
495
496 closedir(metricsdir);
497 }
498
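/* Detect whether the kernel supports dynamically added OA configs by trying
 * to remove an invalid config ID: ENOENT means the ioctl exists and the
 * lookup failed as expected.
 */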
499 static bool
500 kernel_has_dynamic_config_support(struct gen_perf_config *perf, int fd)
501 {
502 uint64_t invalid_config_id = UINT64_MAX;
503
504 return gen_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG,
505 &invalid_config_id) < 0 && errno == ENOENT;
506 }
507
508 static bool
509 load_metric_id(struct gen_perf_config *perf, const char *guid,
510 uint64_t *metric_id)
511 {
512 char config_path[280];
513
514 snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id",
515 perf->sysfs_dev_dir, guid);
516
517 /* Don't recreate already loaded configs. */
518 return read_file_uint64(config_path, metric_id);
519 }
520
521 static void
522 init_oa_configs(struct gen_perf_config *perf, int fd)
523 {
524 hash_table_foreach(perf->oa_metrics_table, entry) {
525 const struct gen_perf_query_info *query = entry->data;
526 struct drm_i915_perf_oa_config config;
527 uint64_t config_id;
528 int ret;
529
530 if (load_metric_id(perf, query->guid, &config_id)) {
531 DBG("metric set: %s (already loaded)\n", query->guid);
532 register_oa_config(perf, query, config_id);
533 continue;
534 }
535
536 memset(&config, 0, sizeof(config));
537
538 memcpy(config.uuid, query->guid, sizeof(config.uuid));
539
540 config.n_mux_regs = query->n_mux_regs;
541 config.mux_regs_ptr = (uintptr_t) query->mux_regs;
542
543 config.n_boolean_regs = query->n_b_counter_regs;
544 config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs;
545
546 config.n_flex_regs = query->n_flex_regs;
547 config.flex_regs_ptr = (uintptr_t) query->flex_regs;
548
549 ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
550 if (ret < 0) {
551 DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
552 query->name, query->guid, strerror(errno));
553 continue;
554 }
555
556 register_oa_config(perf, query, ret);
557 DBG("metric set: %s (added)\n", query->guid);
558 }
559 }
560
561 static void
562 compute_topology_builtins(struct gen_perf_config *perf,
563 const struct gen_device_info *devinfo)
564 {
565 perf->sys_vars.slice_mask = devinfo->slice_masks;
566 perf->sys_vars.n_eu_slices = devinfo->num_slices;
567
568 for (int i = 0; i < sizeof(devinfo->subslice_masks); i++) {
569 perf->sys_vars.n_eu_sub_slices +=
570 __builtin_popcount(devinfo->subslice_masks[i]);
571 }
572
573 for (int i = 0; i < sizeof(devinfo->eu_masks); i++)
574 perf->sys_vars.n_eus += __builtin_popcount(devinfo->eu_masks[i]);
575
576 perf->sys_vars.eu_threads_count = devinfo->num_thread_per_eu;
577
578 /* The subslice mask builtin contains bits for all slices. Prior to Gen11
579 * it had groups of 3 bits per slice; on Gen11 it's 8 bits per
580 * slice.
581 *
582 * Ideally equations would be updated to have a slice/subslice query
583 * function/operator.
584 */
585 perf->sys_vars.subslice_mask = 0;
586
587 int bits_per_subslice = devinfo->gen == 11 ? 8 : 3;
588
589 for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) {
590 for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) {
591 if (gen_device_info_subslice_available(devinfo, s, ss))
592 perf->sys_vars.subslice_mask |= 1ULL << (s * bits_per_subslice + ss);
593 }
594 }
595 }
596
597 static bool
598 init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *devinfo)
599 {
600 uint64_t min_freq_mhz = 0, max_freq_mhz = 0;
601
602 if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz))
603 return false;
604
605 if (!read_sysfs_drm_device_file_uint64(perf, "gt_max_freq_mhz", &max_freq_mhz))
606 return false;
607
608 memset(&perf->sys_vars, 0, sizeof(perf->sys_vars));
609 perf->sys_vars.gt_min_freq = min_freq_mhz * 1000000;
610 perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000;
611 perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
612 perf->sys_vars.revision = devinfo->revision;
613 compute_topology_builtins(perf, devinfo);
614
615 return true;
616 }
617
618 typedef void (*perf_register_oa_queries_t)(struct gen_perf_config *);
619
620 static perf_register_oa_queries_t
621 get_register_queries_function(const struct gen_device_info *devinfo)
622 {
623 if (devinfo->is_haswell)
624 return gen_oa_register_queries_hsw;
625 if (devinfo->is_cherryview)
626 return gen_oa_register_queries_chv;
627 if (devinfo->is_broadwell)
628 return gen_oa_register_queries_bdw;
629 if (devinfo->is_broxton)
630 return gen_oa_register_queries_bxt;
631 if (devinfo->is_skylake) {
632 if (devinfo->gt == 2)
633 return gen_oa_register_queries_sklgt2;
634 if (devinfo->gt == 3)
635 return gen_oa_register_queries_sklgt3;
636 if (devinfo->gt == 4)
637 return gen_oa_register_queries_sklgt4;
638 }
639 if (devinfo->is_kabylake) {
640 if (devinfo->gt == 2)
641 return gen_oa_register_queries_kblgt2;
642 if (devinfo->gt == 3)
643 return gen_oa_register_queries_kblgt3;
644 }
645 if (devinfo->is_geminilake)
646 return gen_oa_register_queries_glk;
647 if (devinfo->is_coffeelake) {
648 if (devinfo->gt == 2)
649 return gen_oa_register_queries_cflgt2;
650 if (devinfo->gt == 3)
651 return gen_oa_register_queries_cflgt3;
652 }
653 if (devinfo->is_cannonlake)
654 return gen_oa_register_queries_cnl;
655 if (devinfo->gen == 11)
656 return gen_oa_register_queries_icl;
657
658 return NULL;
659 }
660
661 static inline void
662 add_stat_reg(struct gen_perf_query_info *query, uint32_t reg,
663 uint32_t numerator, uint32_t denominator,
664 const char *name, const char *description)
665 {
666 struct gen_perf_query_counter *counter;
667
668 assert(query->n_counters < query->max_counters);
669
670 counter = &query->counters[query->n_counters];
671 counter->name = name;
672 counter->desc = description;
673 counter->type = GEN_PERF_COUNTER_TYPE_RAW;
674 counter->data_type = GEN_PERF_COUNTER_DATA_TYPE_UINT64;
675 counter->offset = sizeof(uint64_t) * query->n_counters;
676 counter->pipeline_stat.reg = reg;
677 counter->pipeline_stat.numerator = numerator;
678 counter->pipeline_stat.denominator = denominator;
679
680 query->n_counters++;
681 }
682
683 static inline void
684 add_basic_stat_reg(struct gen_perf_query_info *query,
685 uint32_t reg, const char *name)
686 {
687 add_stat_reg(query, reg, 1, 1, name, name);
688 }
689
690 static void
691 load_pipeline_statistic_metrics(struct gen_perf_config *perf_cfg,
692 const struct gen_device_info *devinfo)
693 {
694 struct gen_perf_query_info *query =
695 append_query_info(perf_cfg, MAX_STAT_COUNTERS);
696
697 query->kind = GEN_PERF_QUERY_TYPE_PIPELINE;
698 query->name = "Pipeline Statistics Registers";
699
700 add_basic_stat_reg(query, IA_VERTICES_COUNT,
701 "N vertices submitted");
702 add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
703 "N primitives submitted");
704 add_basic_stat_reg(query, VS_INVOCATION_COUNT,
705 "N vertex shader invocations");
706
707 if (devinfo->gen == 6) {
708 add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
709 "SO_PRIM_STORAGE_NEEDED",
710 "N geometry shader stream-out primitives (total)");
711 add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1,
712 "SO_NUM_PRIMS_WRITTEN",
713 "N geometry shader stream-out primitives (written)");
714 } else {
715 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
716 "SO_PRIM_STORAGE_NEEDED (Stream 0)",
717 "N stream-out (stream 0) primitives (total)");
718 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
719 "SO_PRIM_STORAGE_NEEDED (Stream 1)",
720 "N stream-out (stream 1) primitives (total)");
721 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
722 "SO_PRIM_STORAGE_NEEDED (Stream 2)",
723 "N stream-out (stream 2) primitives (total)");
724 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
725 "SO_PRIM_STORAGE_NEEDED (Stream 3)",
726 "N stream-out (stream 3) primitives (total)");
727 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
728 "SO_NUM_PRIMS_WRITTEN (Stream 0)",
729 "N stream-out (stream 0) primitives (written)");
730 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
731 "SO_NUM_PRIMS_WRITTEN (Stream 1)",
732 "N stream-out (stream 1) primitives (written)");
733 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
734 "SO_NUM_PRIMS_WRITTEN (Stream 2)",
735 "N stream-out (stream 2) primitives (written)");
736 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
737 "SO_NUM_PRIMS_WRITTEN (Stream 3)",
738 "N stream-out (stream 3) primitives (written)");
739 }
740
741 add_basic_stat_reg(query, HS_INVOCATION_COUNT,
742 "N TCS shader invocations");
743 add_basic_stat_reg(query, DS_INVOCATION_COUNT,
744 "N TES shader invocations");
745
746 add_basic_stat_reg(query, GS_INVOCATION_COUNT,
747 "N geometry shader invocations");
748 add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
749 "N geometry shader primitives emitted");
750
751 add_basic_stat_reg(query, CL_INVOCATION_COUNT,
752 "N primitives entering clipping");
753 add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
754 "N primitives leaving clipping");
755
756 if (devinfo->is_haswell || devinfo->gen == 8) {
757 add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
758 "N fragment shader invocations",
759 "N fragment shader invocations");
760 } else {
761 add_basic_stat_reg(query, PS_INVOCATION_COUNT,
762 "N fragment shader invocations");
763 }
764
765 add_basic_stat_reg(query, PS_DEPTH_COUNT,
766 "N z-pass fragments");
767
768 if (devinfo->gen >= 7) {
769 add_basic_stat_reg(query, CS_INVOCATION_COUNT,
770 "N compute shader invocations");
771 }
772
773 query->data_size = sizeof(uint64_t) * query->n_counters;
774 }
775
776 static bool
777 load_oa_metrics(struct gen_perf_config *perf, int fd,
778 const struct gen_device_info *devinfo)
779 {
780 perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo);
781 bool i915_perf_oa_available = false;
782 struct stat sb;
783
784 /* The existence of this sysctl parameter implies the kernel supports
785 * the i915 perf interface.
786 */
787 if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {
788
789 /* If _paranoid == 1 then on Gen8+ we won't be able to access OA
790 * metrics unless running as root.
791 */
792 if (devinfo->is_haswell)
793 i915_perf_oa_available = true;
794 else {
795 uint64_t paranoid = 1;
796
797 read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", &paranoid);
798
799 if (paranoid == 0 || geteuid() == 0)
800 i915_perf_oa_available = true;
801 }
802 }
803
804 if (!i915_perf_oa_available ||
805 !oa_register ||
806 !get_sysfs_dev_dir(perf, fd) ||
807 !init_oa_sys_vars(perf, devinfo))
808 return false;
809
810 perf->oa_metrics_table =
811 _mesa_hash_table_create(perf, _mesa_key_hash_string,
812 _mesa_key_string_equal);
813
814 /* Index all the metric sets mesa knows about before looking to see what
815 * the kernel is advertising.
816 */
817 oa_register(perf);
818
819 if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) &&
820 kernel_has_dynamic_config_support(perf, fd))
821 init_oa_configs(perf, fd);
822 else
823 enumerate_sysfs_metrics(perf);
824
825 return true;
826 }
827
828 /* Accumulate 32-bit OA counters */
829 static inline void
830 accumulate_uint32(const uint32_t *report0,
831 const uint32_t *report1,
832 uint64_t *accumulator)
833 {
834 *accumulator += (uint32_t)(*report1 - *report0);
835 }
836
837 /* Accumulate 40-bit OA counters */
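/* In the A32u40 report format the low 32 bits of each A counter live in
 * dwords 4..35 and the high 8 bits are packed as bytes starting at dword 40;
 * reconstruct both halves and handle wrap-around at 2^40.
 */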
838 static inline void
839 accumulate_uint40(int a_index,
840 const uint32_t *report0,
841 const uint32_t *report1,
842 uint64_t *accumulator)
843 {
844 const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
845 const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
846 uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
847 uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
848 uint64_t value0 = report0[a_index + 4] | high0;
849 uint64_t value1 = report1[a_index + 4] | high1;
850 uint64_t delta;
851
852 if (value0 > value1)
853 delta = (1ULL << 40) + value1 - value0;
854 else
855 delta = value1 - value0;
856
857 *accumulator += delta;
858 }
859
860 static void
861 gen8_read_report_clock_ratios(const uint32_t *report,
862 uint64_t *slice_freq_hz,
863 uint64_t *unslice_freq_hz)
864 {
865 /* The lower 16 bits of the RPT_ID field of the OA reports contain a
866 * snapshot of the bits coming from the RP_FREQ_NORMAL register and are
867 * divided this way:
868 *
869 * RPT_ID[31:25]: RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency)
870 * RPT_ID[10:9]: RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency)
871 * RPT_ID[8:0]: RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency)
872 *
873 * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request
874 * Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
875 *
876 * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request
877 * Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
878 */
879
880 uint32_t unslice_freq = report[0] & 0x1ff;
881 uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
882 uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
883 uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);
884
885 *slice_freq_hz = slice_freq * 16666667ULL;
886 *unslice_freq_hz = unslice_freq * 16666667ULL;
887 }
888
889 static void
890 query_result_read_frequencies(struct gen_perf_query_result *result,
891 const struct gen_device_info *devinfo,
892 const uint32_t *start,
893 const uint32_t *end)
894 {
895 /* Slice/Unslice frequency is only available in the OA reports when the
896 * "Disable OA reports due to clock ratio change" field in
897 * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this
898 * global register (see drivers/gpu/drm/i915/i915_perf.c)
899 *
900 * Documentation says this should be available on Gen9+ but experimentation
901 * shows that Gen8 reports similar values, so we enable it there too.
902 */
903 if (devinfo->gen < 8)
904 return;
905
906 gen8_read_report_clock_ratios(start,
907 &result->slice_frequency[0],
908 &result->unslice_frequency[0]);
909 gen8_read_report_clock_ratios(end,
910 &result->slice_frequency[1],
911 &result->unslice_frequency[1]);
912 }
913
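/* Accumulate the deltas between two OA reports into result->accumulator.
 * The accumulator layout follows the report format: timestamp, (clock,)
 * A counters, then B/C counters.
 */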
914 static void
915 query_result_accumulate(struct gen_perf_query_result *result,
916 const struct gen_perf_query_info *query,
917 const uint32_t *start,
918 const uint32_t *end)
919 {
920 int i, idx = 0;
921
922 result->hw_id = start[2];
923 result->reports_accumulated++;
924
925 switch (query->oa_format) {
926 case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
927 accumulate_uint32(start + 1, end + 1, result->accumulator + idx++); /* timestamp */
928 accumulate_uint32(start + 3, end + 3, result->accumulator + idx++); /* clock */
929
930 /* 32x 40bit A counters... */
931 for (i = 0; i < 32; i++)
932 accumulate_uint40(i, start, end, result->accumulator + idx++);
933
934 /* 4x 32bit A counters... */
935 for (i = 0; i < 4; i++)
936 accumulate_uint32(start + 36 + i, end + 36 + i, result->accumulator + idx++);
937
938 /* 8x 32bit B counters + 8x 32bit C counters... */
939 for (i = 0; i < 16; i++)
940 accumulate_uint32(start + 48 + i, end + 48 + i, result->accumulator + idx++);
941 break;
942
943 case I915_OA_FORMAT_A45_B8_C8:
944 accumulate_uint32(start + 1, end + 1, result->accumulator); /* timestamp */
945
946 for (i = 0; i < 61; i++)
947 accumulate_uint32(start + 3 + i, end + 3 + i, result->accumulator + 1 + i);
948 break;
949
950 default:
951 unreachable("Can't accumulate OA counters in unknown format");
952 }
953
954 }
955
956 static void
957 query_result_clear(struct gen_perf_query_result *result)
958 {
959 memset(result, 0, sizeof(*result));
960 result->hw_id = 0xffffffff; /* invalid */
961 }
962
963 static void
964 register_mdapi_statistic_query(struct gen_perf_config *perf_cfg,
965 const struct gen_device_info *devinfo)
966 {
967 if (!(devinfo->gen >= 7 && devinfo->gen <= 11))
968 return;
969
970 struct gen_perf_query_info *query =
971 append_query_info(perf_cfg, MAX_STAT_COUNTERS);
972
973 query->kind = GEN_PERF_QUERY_TYPE_PIPELINE;
974 query->name = "Intel_Raw_Pipeline_Statistics_Query";
975
976 /* The order has to match mdapi_pipeline_metrics. */
977 add_basic_stat_reg(query, IA_VERTICES_COUNT,
978 "N vertices submitted");
979 add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
980 "N primitives submitted");
981 add_basic_stat_reg(query, VS_INVOCATION_COUNT,
982 "N vertex shader invocations");
983 add_basic_stat_reg(query, GS_INVOCATION_COUNT,
984 "N geometry shader invocations");
985 add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
986 "N geometry shader primitives emitted");
987 add_basic_stat_reg(query, CL_INVOCATION_COUNT,
988 "N primitives entering clipping");
989 add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
990 "N primitives leaving clipping");
991 if (devinfo->is_haswell || devinfo->gen == 8) {
992 add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
993 "N fragment shader invocations",
994 "N fragment shader invocations");
995 } else {
996 add_basic_stat_reg(query, PS_INVOCATION_COUNT,
997 "N fragment shader invocations");
998 }
999 add_basic_stat_reg(query, HS_INVOCATION_COUNT,
1000 "N TCS shader invocations");
1001 add_basic_stat_reg(query, DS_INVOCATION_COUNT,
1002 "N TES shader invocations");
1003 if (devinfo->gen >= 7) {
1004 add_basic_stat_reg(query, CS_INVOCATION_COUNT,
1005 "N compute shader invocations");
1006 }
1007
1008 if (devinfo->gen >= 10) {
1009 /* Reuse existing CS invocation register until we can expose this new
1010 * one.
1011 */
1012 add_basic_stat_reg(query, CS_INVOCATION_COUNT,
1013 "Reserved1");
1014 }
1015
1016 query->data_size = sizeof(uint64_t) * query->n_counters;
1017 }
1018
1019 static void
1020 fill_mdapi_perf_query_counter(struct gen_perf_query_info *query,
1021 const char *name,
1022 uint32_t data_offset,
1023 uint32_t data_size,
1024 enum gen_perf_counter_data_type data_type)
1025 {
1026 struct gen_perf_query_counter *counter = &query->counters[query->n_counters];
1027
1028 assert(query->n_counters < query->max_counters);
1029
1030 counter->name = name;
1031 counter->desc = "Raw counter value";
1032 counter->type = GEN_PERF_COUNTER_TYPE_RAW;
1033 counter->data_type = data_type;
1034 counter->offset = data_offset;
1035
1036 query->n_counters++;
1037
1038 assert(counter->offset + gen_perf_query_counter_get_size(counter) <= query->data_size);
1039 }
1040
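/* Helpers wrapping fill_mdapi_perf_query_counter(): the counter offset and
 * size are derived from a field (or array element) of the MDAPI metrics
 * struct, so the query layout matches the structure MDAPI expects.
 */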
1041 #define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \
1042 fill_mdapi_perf_query_counter(query, #field_name, \
1043 (uint8_t *) &struct_name.field_name - \
1044 (uint8_t *) &struct_name, \
1045 sizeof(struct_name.field_name), \
1046 GEN_PERF_COUNTER_DATA_TYPE_##type_name)
1047 #define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \
1048 fill_mdapi_perf_query_counter(query, \
1049 ralloc_asprintf(ctx, "%s%i", #field_name, idx), \
1050 (uint8_t *) &struct_name.field_name[idx] - \
1051 (uint8_t *) &struct_name, \
1052 sizeof(struct_name.field_name[0]), \
1053 GEN_PERF_COUNTER_DATA_TYPE_##type_name)
1054
1055 static void
1056 register_mdapi_oa_query(const struct gen_device_info *devinfo,
1057 struct gen_perf_config *perf)
1058 {
1059 struct gen_perf_query_info *query = NULL;
1060
1061 /* MDAPI requires different structures for pretty much every generation
1062 * (right now we have definitions for gen 7 to 11).
1063 */
1064 if (!(devinfo->gen >= 7 && devinfo->gen <= 11))
1065 return;
1066
1067 switch (devinfo->gen) {
1068 case 7: {
1069 query = append_query_info(perf, 1 + 45 + 16 + 7);
1070 query->oa_format = I915_OA_FORMAT_A45_B8_C8;
1071
1072 struct gen7_mdapi_metrics metric_data;
1073 query->data_size = sizeof(metric_data);
1074
1075 MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
1076 for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) {
1077 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1078 metric_data, ACounters, i, UINT64);
1079 }
1080 for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) {
1081 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1082 metric_data, NOACounters, i, UINT64);
1083 }
1084 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
1085 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
1086 MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
1087 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
1088 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
1089 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
1090 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
1091 break;
1092 }
1093 case 8: {
1094 query = append_query_info(perf, 2 + 36 + 16 + 16);
1095 query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;
1096
1097 struct gen8_mdapi_metrics metric_data;
1098 query->data_size = sizeof(metric_data);
1099
1100 MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
1101 MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
1102 for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
1103 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1104 metric_data, OaCntr, i, UINT64);
1105 }
1106 for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
1107 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1108 metric_data, NoaCntr, i, UINT64);
1109 }
1110 MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
1111 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
1112 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
1113 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
1114 MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
1115 MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
1116 MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
1117 MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
1118 MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
1119 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
1120 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
1121 MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
1122 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
1123 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
1124 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
1125 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
1126 break;
1127 }
1128 case 9:
1129 case 10:
1130 case 11: {
1131 query = append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2);
1132 query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;
1133
1134 struct gen9_mdapi_metrics metric_data;
1135 query->data_size = sizeof(metric_data);
1136
1137 MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
1138 MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
1139 for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
1140 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1141 metric_data, OaCntr, i, UINT64);
1142 }
1143 for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
1144 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1145 metric_data, NoaCntr, i, UINT64);
1146 }
1147 MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
1148 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
1149 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
1150 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
1151 MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
1152 MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
1153 MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
1154 MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
1155 MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
1156 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
1157 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
1158 MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
1159 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
1160 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
1161 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
1162 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
1163 for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) {
1164 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1165 metric_data, UserCntr, i, UINT64);
1166 }
1167 MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32);
1168 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32);
1169 break;
1170 }
1171 default:
1172 unreachable("Unsupported gen");
1173 break;
1174 }
1175
1176 query->kind = GEN_PERF_QUERY_TYPE_RAW;
1177 query->name = "Intel_Raw_Hardware_Counters_Set_0_Query";
1178 query->guid = GEN_PERF_QUERY_GUID_MDAPI;
1179
1180 {
1181 /* Accumulation buffer offsets copied from an actual query... */
1182 const struct gen_perf_query_info *copy_query =
1183 &perf->queries[0];
1184
1185 query->gpu_time_offset = copy_query->gpu_time_offset;
1186 query->gpu_clock_offset = copy_query->gpu_clock_offset;
1187 query->a_offset = copy_query->a_offset;
1188 query->b_offset = copy_query->b_offset;
1189 query->c_offset = copy_query->c_offset;
1190 }
1191 }
1192
1193 static uint64_t
1194 get_metric_id(struct gen_perf_config *perf,
1195 const struct gen_perf_query_info *query)
1196 {
1197 /* These queries are known never to change; their config ID was loaded
1198 * when the query was first created. No need to look them up again.
1199 */
1200 if (query->kind == GEN_PERF_QUERY_TYPE_OA)
1201 return query->oa_metrics_set_id;
1202
1203 assert(query->kind == GEN_PERF_QUERY_TYPE_RAW);
1204
1205 /* Raw queries can be reprogrammed by an external application/library.
1206 * When a raw query is used for the first time its ID is set to a value !=
1207 * 0. When it stops being used the ID returns to 0. No need to reload the
1208 * ID when it's already loaded.
1209 */
1210 if (query->oa_metrics_set_id != 0) {
1211 DBG("Raw query '%s' guid=%s using cached ID: %"PRIu64"\n",
1212 query->name, query->guid, query->oa_metrics_set_id);
1213 return query->oa_metrics_set_id;
1214 }
1215
1216 struct gen_perf_query_info *raw_query = (struct gen_perf_query_info *)query;
1217 if (!load_metric_id(perf, query->guid,
1218 &raw_query->oa_metrics_set_id)) {
1219 DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid);
1220 raw_query->oa_metrics_set_id = 1ULL;
1221 } else {
1222 DBG("Raw query '%s'guid=%s loaded ID: %"PRIu64"\n",
1223 query->name, query->guid, query->oa_metrics_set_id);
1224 }
1225 return query->oa_metrics_set_id;
1226 }
1227
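/* Return a sample buffer to read OA reports into, recycling one from
 * free_sample_buffers when possible, otherwise allocating a new one against
 * the perf config.
 */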
1228 static struct oa_sample_buf *
1229 get_free_sample_buf(struct gen_perf_context *perf_ctx)
1230 {
1231 struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers);
1232 struct oa_sample_buf *buf;
1233
1234 if (node)
1235 buf = exec_node_data(struct oa_sample_buf, node, link);
1236 else {
1237 buf = ralloc_size(perf_ctx->perf, sizeof(*buf));
1238
1239 exec_node_init(&buf->link);
1240 buf->refcount = 0;
1241 buf->len = 0;
1242 }
1243
1244 return buf;
1245 }
1246
1247 static void
1248 reap_old_sample_buffers(struct gen_perf_context *perf_ctx)
1249 {
1250 struct exec_node *tail_node =
1251 exec_list_get_tail(&perf_ctx->sample_buffers);
1252 struct oa_sample_buf *tail_buf =
1253 exec_node_data(struct oa_sample_buf, tail_node, link);
1254
1255 /* Remove all old, unreferenced sample buffers walking forward from
1256 * the head of the list, except always leave at least one node in
1257 * the list so we always have a node to reference when we Begin
1258 * a new query.
1259 */
1260 foreach_list_typed_safe(struct oa_sample_buf, buf, link,
1261 &perf_ctx->sample_buffers)
1262 {
1263 if (buf->refcount == 0 && buf != tail_buf) {
1264 exec_node_remove(&buf->link);
1265 exec_list_push_head(&perf_ctx->free_sample_buffers, &buf->link);
1266 } else
1267 return;
1268 }
1269 }
1270
1271 static void
1272 free_sample_bufs(struct gen_perf_context *perf_ctx)
1273 {
1274 foreach_list_typed_safe(struct oa_sample_buf, buf, link,
1275 &perf_ctx->free_sample_buffers)
1276 ralloc_free(buf);
1277
1278 exec_list_make_empty(&perf_ctx->free_sample_buffers);
1279 }
1280
1281 /******************************************************************************/
1282
1283 /**
1284 * Emit MI_STORE_REGISTER_MEM commands to capture all of the
1285 * pipeline statistics for the performance query object.
1286 */
1287 static void
1288 snapshot_statistics_registers(void *context,
1289 struct gen_perf_config *perf,
1290 struct gen_perf_query_object *obj,
1291 uint32_t offset_in_bytes)
1292 {
1293 const struct gen_perf_query_info *query = obj->queryinfo;
1294 const int n_counters = query->n_counters;
1295
1296 for (int i = 0; i < n_counters; i++) {
1297 const struct gen_perf_query_counter *counter = &query->counters[i];
1298
1299 assert(counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64);
1300
1301 perf->vtbl.store_register_mem64(context, obj->pipeline_stats.bo,
1302 counter->pipeline_stat.reg,
1303 offset_in_bytes + i * sizeof(uint64_t));
1304 }
1305 }
1306
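/* Close the i915 perf stream (if open). For RAW queries also reset the
 * cached metric set ID so it is looked up again on next use.
 */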
1307 static void
1308 gen_perf_close(struct gen_perf_context *perfquery,
1309 const struct gen_perf_query_info *query)
1310 {
1311 if (perfquery->oa_stream_fd != -1) {
1312 close(perfquery->oa_stream_fd);
1313 perfquery->oa_stream_fd = -1;
1314 }
1315 if (query->kind == GEN_PERF_QUERY_TYPE_RAW) {
1316 struct gen_perf_query_info *raw_query =
1317 (struct gen_perf_query_info *) query;
1318 raw_query->oa_metrics_set_id = 0;
1319 }
1320 }
1321
1322 static bool
1323 gen_perf_open(struct gen_perf_context *perf_ctx,
1324 int metrics_set_id,
1325 int report_format,
1326 int period_exponent,
1327 int drm_fd,
1328 uint32_t ctx_id)
1329 {
1330 uint64_t properties[] = {
1331 /* Single context sampling */
1332 DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id,
1333
1334 /* Include OA reports in samples */
1335 DRM_I915_PERF_PROP_SAMPLE_OA, true,
1336
1337 /* OA unit configuration */
1338 DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
1339 DRM_I915_PERF_PROP_OA_FORMAT, report_format,
1340 DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent,
1341 };
1342 struct drm_i915_perf_open_param param = {
1343 .flags = I915_PERF_FLAG_FD_CLOEXEC |
1344 I915_PERF_FLAG_FD_NONBLOCK |
1345 I915_PERF_FLAG_DISABLED,
1346 .num_properties = ARRAY_SIZE(properties) / 2,
1347 .properties_ptr = (uintptr_t) properties,
1348 };
1349 int fd = gen_ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
1350 if (fd == -1) {
1351 DBG("Error opening gen perf OA stream: %m\n");
1352 return false;
1353 }
1354
1355 perf_ctx->oa_stream_fd = fd;
1356
1357 perf_ctx->current_oa_metrics_set_id = metrics_set_id;
1358 perf_ctx->current_oa_format = report_format;
1359
1360 return true;
1361 }
1362
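/* Each active OA query holds a reference on the i915 perf stream; enable
 * the stream when the first user appears.
 */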
1363 static bool
1364 inc_n_users(struct gen_perf_context *perf_ctx)
1365 {
1366 if (perf_ctx->n_oa_users == 0 &&
1367 gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0)
1368 {
1369 return false;
1370 }
1371 ++perf_ctx->n_oa_users;
1372
1373 return true;
1374 }
1375
1376 static void
1377 dec_n_users(struct gen_perf_context *perf_ctx)
1378 {
1379 /* Disabling the i915 perf stream will effectively disable the OA
1380 * counters. Note it's important to be sure there are no outstanding
1381 * MI_RPC commands at this point since they could stall the CS
1382 * indefinitely once OACONTROL is disabled.
1383 */
1384 --perf_ctx->n_oa_users;
1385 if (perf_ctx->n_oa_users == 0 &&
1386 gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0)
1387 {
1388 DBG("WARNING: Error disabling gen perf stream: %m\n");
1389 }
1390 }
1391
1392 void
1393 gen_perf_init_metrics(struct gen_perf_config *perf_cfg,
1394 const struct gen_device_info *devinfo,
1395 int drm_fd)
1396 {
1397 load_pipeline_statistic_metrics(perf_cfg, devinfo);
1398 register_mdapi_statistic_query(perf_cfg, devinfo);
1399 if (load_oa_metrics(perf_cfg, drm_fd, devinfo))
1400 register_mdapi_oa_query(devinfo, perf_cfg);
1401 }
1402
1403 void
1404 gen_perf_init_context(struct gen_perf_context *perf_ctx,
1405 struct gen_perf_config *perf_cfg,
1406 void * ctx, /* driver context (eg, brw_context) */
1407 void * bufmgr, /* eg brw_bufmgr */
1408 const struct gen_device_info *devinfo,
1409 uint32_t hw_ctx,
1410 int drm_fd)
1411 {
1412 perf_ctx->perf = perf_cfg;
1413 perf_ctx->ctx = ctx;
1414 perf_ctx->bufmgr = bufmgr;
1415 perf_ctx->drm_fd = drm_fd;
1416 perf_ctx->hw_ctx = hw_ctx;
1417 perf_ctx->devinfo = devinfo;
1418
1419 perf_ctx->unaccumulated =
1420 ralloc_array(ctx, struct gen_perf_query_object *, 2);
1421 perf_ctx->unaccumulated_elements = 0;
1422 perf_ctx->unaccumulated_array_size = 2;
1423
1424 exec_list_make_empty(&perf_ctx->sample_buffers);
1425 exec_list_make_empty(&perf_ctx->free_sample_buffers);
1426
1427 /* It's convenient to guarantee that this linked list of sample
1428 * buffers is never empty so we add an empty head so when we
1429 * Begin an OA query we can always take a reference on a buffer
1430 * in this list.
1431 */
1432 struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx);
1433 exec_list_push_head(&perf_ctx->sample_buffers, &buf->link);
1434
1435 perf_ctx->oa_stream_fd = -1;
1436 perf_ctx->next_query_start_report_id = 1000;
1437 }
1438
1439 /**
1440 * Add a query to the global list of "unaccumulated queries."
1441 *
1442 * Queries are tracked here until all the associated OA reports have
1443 * been accumulated via accumulate_oa_reports() after the end
1444 * MI_REPORT_PERF_COUNT has landed in query->oa.bo.
1445 */
1446 static void
1447 add_to_unaccumulated_query_list(struct gen_perf_context *perf_ctx,
1448 struct gen_perf_query_object *obj)
1449 {
1450 if (perf_ctx->unaccumulated_elements >=
1451 perf_ctx->unaccumulated_array_size)
1452 {
1453 perf_ctx->unaccumulated_array_size *= 1.5;
1454 perf_ctx->unaccumulated =
1455 reralloc(perf_ctx->ctx, perf_ctx->unaccumulated,
1456 struct gen_perf_query_object *,
1457 perf_ctx->unaccumulated_array_size);
1458 }
1459
1460 perf_ctx->unaccumulated[perf_ctx->unaccumulated_elements++] = obj;
1461 }
1462
1463 bool
1464 gen_perf_begin_query(struct gen_perf_context *perf_ctx,
1465 struct gen_perf_query_object *query)
1466 {
1467 struct gen_perf_config *perf_cfg = perf_ctx->perf;
1468 const struct gen_perf_query_info *queryinfo = query->queryinfo;
1469
1470 /* XXX: We have to consider that the command parser unit that parses batch
1471 * buffer commands and is used to capture begin/end counter snapshots isn't
1472 * implicitly synchronized with what's currently running across other GPU
1473 * units (such as the EUs running shaders) that the performance counters are
1474 * associated with.
1475 *
1476 * The intention of performance queries is to measure the work associated
1477 * with commands between the begin/end delimiters and so for that to be the
1478 * case we need to explicitly synchronize the parsing of commands to capture
1479 * Begin/End counter snapshots with what's running across other parts of the
1480 * GPU.
1481 *
1482 * When the command parser reaches a Begin marker it effectively needs to
1483 * drain everything currently running on the GPU until the hardware is idle
1484 * before capturing the first snapshot of counters - otherwise the results
1485 * would also be measuring the effects of earlier commands.
1486 *
1487 * When the command parser reaches an End marker it needs to stall until
1488 * everything currently running on the GPU has finished before capturing the
1489 * end snapshot - otherwise the results won't be a complete representation
1490 * of the work.
1491 *
1492 * Theoretically there could be opportunities to minimize how much of the
1493 * GPU pipeline is drained, or how long we stall, when we know which specific
1494 * units the performance counters being queried relate to but we don't
1495 * currently attempt to be clever here.
1496 *
1497 * Note: with our current simple approach, back-to-back queries will
1498 * redundantly emit duplicate commands to synchronize the command
1499 * streamer with the rest of the GPU pipeline, but we assume that in HW the
1500 * second synchronization is effectively a NOOP.
1501 *
1502 * N.B. The final results are based on deltas of counters between (inside)
1503 * Begin/End markers so even though the total wall clock time of the
1504 * workload is stretched by larger pipeline bubbles the bubbles themselves
1505 * are generally invisible to the query results. Whether that's a good or a
1506 * bad thing depends on the use case. For a lower real-time impact while
1507 * capturing metrics, periodic sampling may be a better choice than
1508 * INTEL_performance_query.
1509 *
1510 *
1511 * This is our Begin synchronization point to drain current work on the
1512 * GPU before we capture our first counter snapshot...
1513 */
1514 perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
1515
1516 switch (queryinfo->kind) {
1517 case GEN_PERF_QUERY_TYPE_OA:
1518 case GEN_PERF_QUERY_TYPE_RAW: {
1519
1520 /* Opening an i915 perf stream implies exclusive access to the OA unit
1521 * which will generate counter reports for a specific counter set with a
1522 * specific layout/format so we can't begin any OA based queries that
1523 * require a different counter set or format unless we get an opportunity
1524 * to close the stream and open a new one...
1525 */
1526 uint64_t metric_id = get_metric_id(perf_ctx->perf, queryinfo);
1527
1528 if (perf_ctx->oa_stream_fd != -1 &&
1529 perf_ctx->current_oa_metrics_set_id != metric_id) {
1530
1531 if (perf_ctx->n_oa_users != 0) {
1532 DBG("WARNING: Begin failed already using perf config=%i/%"PRIu64"\n",
1533 perf_ctx->current_oa_metrics_set_id, metric_id);
1534 return false;
1535 } else
1536 gen_perf_close(perf_ctx, queryinfo);
1537 }
1538
1539 /* If the OA counters aren't already on, enable them. */
1540 if (perf_ctx->oa_stream_fd == -1) {
1541 const struct gen_device_info *devinfo = perf_ctx->devinfo;
1542
1543 /* The period_exponent gives a sampling period as follows:
1544 * sample_period = timestamp_period * 2^(period_exponent + 1)
1545 *
1546 * The timestamp increments every 80ns (HSW), ~52ns (GEN9LP) or
1547 * ~83ns (GEN8/9).
1548 *
1549 * The counter overflow period is derived from the EuActive counter
1550 * which reads a counter that increments by the number of clock
1551 * cycles multiplied by the number of EUs. It can be calculated as:
1552 *
1553 * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2)
1554 *
1555 * (E.g. 40 EUs @ 1GHz = ~53ms)
1556 *
1557 * We select a sampling period smaller than that overflow period to
1558 * ensure we cannot see more than 1 counter overflow; otherwise we
1559 * could lose information.
1560 */
1561
1562 int a_counter_in_bits = 32;
1563 if (devinfo->gen >= 8)
1564 a_counter_in_bits = 40;
1565
1566 uint64_t overflow_period = pow(2, a_counter_in_bits) / (perf_cfg->sys_vars.n_eus *
1567 /* drop 1GHz freq to have units in nanoseconds */
1568 2);
1569
1570 DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
1571 overflow_period, overflow_period / 1000000ul, perf_cfg->sys_vars.n_eus);
1572
1573 int period_exponent = 0;
1574 uint64_t prev_sample_period, next_sample_period;
1575 for (int e = 0; e < 30; e++) {
1576 prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
1577 next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;
1578
1579 /* Take the previous sampling period, lower than the overflow
1580 * period.
1581 */
1582 if (prev_sample_period < overflow_period &&
1583 next_sample_period > overflow_period)
1584 period_exponent = e + 1;
1585 }
1586
1587 if (period_exponent == 0) {
1588 DBG("WARNING: enable to find a sampling exponent\n");
1589 return false;
1590 }
1591
1592 DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
1593 prev_sample_period / 1000000ul);
1594
1595 if (!gen_perf_open(perf_ctx, metric_id, queryinfo->oa_format,
1596 period_exponent, perf_ctx->drm_fd,
1597 perf_ctx->hw_ctx))
1598 return false;
1599 } else {
1600 assert(perf_ctx->current_oa_metrics_set_id == metric_id &&
1601 perf_ctx->current_oa_format == queryinfo->oa_format);
1602 }
1603
1604 if (!inc_n_users(perf_ctx)) {
1605 DBG("WARNING: Error enabling i915 perf stream: %m\n");
1606 return false;
1607 }
1608
1609 if (query->oa.bo) {
1610 perf_cfg->vtbl.bo_unreference(query->oa.bo);
1611 query->oa.bo = NULL;
1612 }
1613
1614 query->oa.bo = perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
1615 "perf. query OA MI_RPC bo",
1616 MI_RPC_BO_SIZE);
1617 #ifdef DEBUG
1618 /* Pre-filling the BO helps debug whether writes landed. */
1619 void *map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_WRITE);
1620 memset(map, 0x80, MI_RPC_BO_SIZE);
1621 perf_cfg->vtbl.bo_unmap(query->oa.bo);
1622 #endif
1623
1624 query->oa.begin_report_id = perf_ctx->next_query_start_report_id;
1625 perf_ctx->next_query_start_report_id += 2;
1626
1627 /* We flush the batchbuffer here to minimize the chances that MI_RPC
1628 * delimiting commands end up in different batchbuffers. If that's the
1629 * case, the measurement will include the time it takes for the kernel
1630 * scheduler to load a new request into the hardware. This is manifested in
1631 * tools like frameretrace by spikes in the "GPU Core Clocks" counter.
1632 */
1633 perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);
1634
1635 /* Take a starting OA counter snapshot. */
1636 perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, 0,
1637 query->oa.begin_report_id);
1638 perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo,
1639 MI_FREQ_START_OFFSET_BYTES);
1640
1641 ++perf_ctx->n_active_oa_queries;
1642
1643 /* No already-buffered samples can possibly be associated with this query
1644 * so create a marker within the list of sample buffers enabling us to
1645 * easily ignore earlier samples when processing this query after
1646 * completion.
1647 */
1648 assert(!exec_list_is_empty(&perf_ctx->sample_buffers));
1649 query->oa.samples_head = exec_list_get_tail(&perf_ctx->sample_buffers);
1650
1651 struct oa_sample_buf *buf =
1652 exec_node_data(struct oa_sample_buf, query->oa.samples_head, link);
1653
1654 /* This reference will ensure that future/following sample
1655 * buffers (that may relate to this query) can't be freed until
1656 * this refcount drops to zero.
1657 */
1658 buf->refcount++;
1659
1660 query_result_clear(&query->oa.result);
1661 query->oa.results_accumulated = false;
1662
1663 add_to_unaccumulated_query_list(perf_ctx, query);
1664 break;
1665 }
1666
1667 case GEN_PERF_QUERY_TYPE_PIPELINE:
1668 if (query->pipeline_stats.bo) {
1669 perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
1670 query->pipeline_stats.bo = NULL;
1671 }
1672
1673 query->pipeline_stats.bo =
1674 perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
1675 "perf. query pipeline stats bo",
1676 STATS_BO_SIZE);
1677
1678 /* Take starting snapshots. */
1679 snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query, 0);
1680
1681 ++perf_ctx->n_active_pipeline_stats_queries;
1682 break;
1683
1684 default:
1685 unreachable("Unknown query type");
1686 break;
1687 }
1688
1689 return true;
1690 }
1691
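/**
 * End an active performance query: emit the closing counter snapshots
 * (unless a perf read error already marked the query as accumulated) and
 * drop it from the active query counts.  Results cannot be collected until
 * the end snapshots have actually landed in the query's BO.
 */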
1692 void
1693 gen_perf_end_query(struct gen_perf_context *perf_ctx,
1694 struct gen_perf_query_object *query)
1695 {
1696 struct gen_perf_config *perf_cfg = perf_ctx->perf;
1697
1698 /* Ensure that the work associated with the queried commands will have
1699 * finished before taking our query end counter readings.
1700 *
1701 * For more details see the comment in gen_perf_begin_query() about the
1702 * corresponding flush.
1703 */
1704 perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
1705
1706 switch (query->queryinfo->kind) {
1707 case GEN_PERF_QUERY_TYPE_OA:
1708 case GEN_PERF_QUERY_TYPE_RAW:
1709
1710 /* NB: It's possible that the query will have already been marked
1711 * as 'accumulated' if an error was seen while reading samples
1712 * from perf. In this case we mustn't try to emit a closing
1713 * MI_RPC command since the OA unit may already have been disabled.
1714 */
1715 if (!query->oa.results_accumulated) {
1716 /* Take an ending OA counter snapshot. */
1717 perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo,
1718 MI_FREQ_END_OFFSET_BYTES);
1719 perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo,
1720 MI_RPC_BO_END_OFFSET_BYTES,
1721 query->oa.begin_report_id + 1);
1722 }
1723
1724 --perf_ctx->n_active_oa_queries;
1725
1726 /* NB: even though the query has now ended, it can't be accumulated
1727 * until the end MI_REPORT_PERF_COUNT snapshot has been written
1728 * to query->oa.bo
1729 */
1730 break;
1731
1732 case GEN_PERF_QUERY_TYPE_PIPELINE:
1733 snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query,
1734 STATS_BO_END_OFFSET_BYTES);
1735 --perf_ctx->n_active_pipeline_stats_queries;
1736 break;
1737
1738 default:
1739 unreachable("Unknown query type");
1740 break;
1741 }
1742 }
1743
1744 enum OaReadStatus {
1745 OA_READ_STATUS_ERROR,
1746 OA_READ_STATUS_UNFINISHED,
1747 OA_READ_STATUS_FINISHED,
1748 };
1749
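/**
 * Read periodic OA reports from the i915 perf stream into sample buffers
 * until the read would block.  Returns FINISHED once a report with a
 * timestamp at or beyond end_timestamp has been buffered, UNFINISHED if the
 * stream ran dry before that point, and ERROR on read errors or EOF.  The
 * timestamps are 32-bit, so progress is compared as deltas from
 * start_timestamp to tolerate wraparound.
 */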
1750 static enum OaReadStatus
1751 read_oa_samples_until(struct gen_perf_context *perf_ctx,
1752 uint32_t start_timestamp,
1753 uint32_t end_timestamp)
1754 {
1755 struct exec_node *tail_node =
1756 exec_list_get_tail(&perf_ctx->sample_buffers);
1757 struct oa_sample_buf *tail_buf =
1758 exec_node_data(struct oa_sample_buf, tail_node, link);
1759 uint32_t last_timestamp = tail_buf->last_timestamp;
1760
1761 while (1) {
1762 struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx);
1763 uint32_t offset;
1764 int len;
1765
1766 while ((len = read(perf_ctx->oa_stream_fd, buf->buf,
1767 sizeof(buf->buf))) < 0 && errno == EINTR)
1768 ;
1769
1770 if (len <= 0) {
1771 exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link);
1772
1773 if (len < 0) {
1774 if (errno == EAGAIN)
1775 return ((last_timestamp - start_timestamp) >=
1776 (end_timestamp - start_timestamp)) ?
1777 OA_READ_STATUS_FINISHED :
1778 OA_READ_STATUS_UNFINISHED;
1779 else {
1780 DBG("Error reading i915 perf samples: %m\n");
1781 }
1782 } else
1783 DBG("Spurious EOF reading i915 perf samples\n");
1784
1785 return OA_READ_STATUS_ERROR;
1786 }
1787
1788 buf->len = len;
1789 exec_list_push_tail(&perf_ctx->sample_buffers, &buf->link);
1790
1791 /* Go through the reports and update the last timestamp. */
1792 offset = 0;
1793 while (offset < buf->len) {
1794 const struct drm_i915_perf_record_header *header =
1795 (const struct drm_i915_perf_record_header *) &buf->buf[offset];
1796 uint32_t *report = (uint32_t *) (header + 1);
1797
1798 if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
1799 last_timestamp = report[1];
1800
1801 offset += header->size;
1802 }
1803
1804 buf->last_timestamp = last_timestamp;
1805 }
1806
1807 unreachable("not reached");
1808 return OA_READ_STATUS_ERROR;
1809 }
1810
1811 /**
1812 * Try to read all the reports until either the delimiting timestamp
1813 * or an error arises.
1814 */
1815 static bool
1816 read_oa_samples_for_query(struct gen_perf_context *perf_ctx,
1817 struct gen_perf_query_object *query,
1818 void *current_batch)
1819 {
1820 uint32_t *start;
1821 uint32_t *last;
1822 uint32_t *end;
1823 struct gen_perf_config *perf_cfg = perf_ctx->perf;
1824
1825 /* We need the MI_REPORT_PERF_COUNT to land before we can start
1826 * accumulating. */
1827 assert(!perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
1828 !perf_cfg->vtbl.bo_busy(query->oa.bo));
1829
1830 /* Map the BO once here; gen_perf_get_query_data() unmaps it once the
1831 * reports have been accumulated. */
1832 if (query->oa.map == NULL)
1833 query->oa.map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_READ);
1834
1835 start = last = query->oa.map;
1836 end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
1837
1838 if (start[0] != query->oa.begin_report_id) {
1839 DBG("Spurious start report id=%"PRIu32"\n", start[0]);
1840 return true;
1841 }
1842 if (end[0] != (query->oa.begin_report_id + 1)) {
1843 DBG("Spurious end report id=%"PRIu32"\n", end[0]);
1844 return true;
1845 }
1846
1847 /* Read the reports until the end timestamp. */
1848 switch (read_oa_samples_until(perf_ctx, start[1], end[1])) {
1849 case OA_READ_STATUS_ERROR:
1850 /* Fallthrough and let accumulate_oa_reports() deal with the
1851 * error. */
1852 case OA_READ_STATUS_FINISHED:
1853 return true;
1854 case OA_READ_STATUS_UNFINISHED:
1855 return false;
1856 }
1857
1858 unreachable("invalid read status");
1859 return false;
1860 }
1861
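/**
 * Block until the query results are available: flush the batch if it still
 * references the result BO, wait for the BO to go idle and, for OA/RAW
 * queries, keep reading the i915 perf stream until all reports up to the
 * end snapshot have been buffered.
 */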
1862 void
1863 gen_perf_wait_query(struct gen_perf_context *perf_ctx,
1864 struct gen_perf_query_object *query,
1865 void *current_batch)
1866 {
1867 struct gen_perf_config *perf_cfg = perf_ctx->perf;
1868 struct brw_bo *bo = NULL;
1869
1870 switch (query->queryinfo->kind) {
1871 case GEN_PERF_QUERY_TYPE_OA:
1872 case GEN_PERF_QUERY_TYPE_RAW:
1873 bo = query->oa.bo;
1874 break;
1875
1876 case GEN_PERF_QUERY_TYPE_PIPELINE:
1877 bo = query->pipeline_stats.bo;
1878 break;
1879
1880 default:
1881 unreachable("Unknown query type");
1882 break;
1883 }
1884
1885 if (bo == NULL)
1886 return;
1887
1888 /* If the current batch references our results bo then we need to
1889 * flush first...
1890 */
1891 if (perf_cfg->vtbl.batch_references(current_batch, bo))
1892 perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);
1893
1894 perf_cfg->vtbl.bo_wait_rendering(bo);
1895
1896 /* Due to a race condition between the OA unit signaling report
1897 * availability and the report actually being written into memory,
1898 * we need to wait for all the reports to come in before we can
1899 * read them.
1900 */
1901 if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA ||
1902 query->queryinfo->kind == GEN_PERF_QUERY_TYPE_RAW) {
1903 while (!read_oa_samples_for_query(perf_ctx, query, current_batch))
1904 ;
1905 }
1906 }
1907
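/**
 * Non-blocking check for query completion: true once the results have
 * already been accumulated, or once the result BO is idle, unreferenced by
 * the current batch and (for OA/RAW queries) all the relevant periodic
 * samples have been read from the i915 perf stream.
 */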
1908 bool
1909 gen_perf_is_query_ready(struct gen_perf_context *perf_ctx,
1910 struct gen_perf_query_object *query,
1911 void *current_batch)
1912 {
1913 struct gen_perf_config *perf_cfg = perf_ctx->perf;
1914
1915 switch (query->queryinfo->kind) {
1916 case GEN_PERF_QUERY_TYPE_OA:
1917 case GEN_PERF_QUERY_TYPE_RAW:
1918 return (query->oa.results_accumulated ||
1919 (query->oa.bo &&
1920 !perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
1921 !perf_cfg->vtbl.bo_busy(query->oa.bo) &&
1922 read_oa_samples_for_query(perf_ctx, query, current_batch)));
1923 case GEN_PERF_QUERY_TYPE_PIPELINE:
1924 return (query->pipeline_stats.bo &&
1925 !perf_cfg->vtbl.batch_references(current_batch, query->pipeline_stats.bo) &&
1926 !perf_cfg->vtbl.bo_busy(query->pipeline_stats.bo));
1927
1928 default:
1929 unreachable("Unknown query type");
1930 break;
1931 }
1932
1933 return false;
1934 }
1935
1936 /**
1937 * Remove a query from the global list of unaccumulated queries once the
1938 * OA reports associated with the query have been successfully accumulated
1939 * in accumulate_oa_reports(), or when discarding unwanted query
1940 * results.
1941 */
1942 static void
1943 drop_from_unaccumulated_query_list(struct gen_perf_context *perf_ctx,
1944 struct gen_perf_query_object *query)
1945 {
1946 for (int i = 0; i < perf_ctx->unaccumulated_elements; i++) {
1947 if (perf_ctx->unaccumulated[i] == query) {
1948 int last_elt = --perf_ctx->unaccumulated_elements;
1949
1950 if (i == last_elt)
1951 perf_ctx->unaccumulated[i] = NULL;
1952 else {
1953 perf_ctx->unaccumulated[i] =
1954 perf_ctx->unaccumulated[last_elt];
1955 }
1956
1957 break;
1958 }
1959 }
1960
1961 /* Drop our samples_head reference so that associated periodic
1962 * sample data buffers can potentially be reaped if they aren't
1963 * referenced by any other queries...
1964 */
1965
1966 struct oa_sample_buf *buf =
1967 exec_node_data(struct oa_sample_buf, query->oa.samples_head, link);
1968
1969 assert(buf->refcount > 0);
1970 buf->refcount--;
1971
1972 query->oa.samples_head = NULL;
1973
1974 reap_old_sample_buffers(perf_ctx);
1975 }
1976
1977 /* In general, if we see anything spurious while accumulating results we
1978 * don't try to continue accumulating the current query and hope for the
1979 * best; instead we scrap everything outstanding and then start afresh
1980 * with new queries.
1981 */
1982 static void
1983 discard_all_queries(struct gen_perf_context *perf_ctx)
1984 {
1985 while (perf_ctx->unaccumulated_elements) {
1986 struct gen_perf_query_object *query = perf_ctx->unaccumulated[0];
1987
1988 query->oa.results_accumulated = true;
1989 drop_from_unaccumulated_query_list(perf_ctx, query);
1990
1991 dec_n_users(perf_ctx);
1992 }
1993 }
1994
1995 /**
1996 * Accumulate raw OA counter values based on deltas between pairs of
1997 * OA reports.
1998 *
1999 * Accumulation starts from the first report captured via
2000 * MI_REPORT_PERF_COUNT (MI_RPC) by gen_perf_begin_query() until the
2001 * last MI_RPC report requested by gen_perf_end_query(). Between these
2002 * two reports there may also be some number of periodically sampled OA
2003 * reports collected via the i915 perf interface - depending on the
2004 * duration of the query.
2005 *
2006 * These periodic snapshots help us handle counter overflow correctly
2007 * by being frequent enough that we cannot miss more than one overflow
2008 * of a counter between snapshots. For Gen8+ the i915 perf
2009 * snapshots provide the extra context-switch reports that let us
2010 * subtract out the progress of counters associated with other
2011 * contexts running on the system.
2012 */
2013 static void
2014 accumulate_oa_reports(struct gen_perf_context *perf_ctx,
2015 struct gen_perf_query_object *query)
2016 {
2017 const struct gen_device_info *devinfo = perf_ctx->devinfo;
2018 uint32_t *start;
2019 uint32_t *last;
2020 uint32_t *end;
2021 struct exec_node *first_samples_node;
2022 bool in_ctx = true;
2023 int out_duration = 0;
2024
2025 assert(query->oa.map != NULL);
2026
2027 start = last = query->oa.map;
2028 end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
2029
2030 if (start[0] != query->oa.begin_report_id) {
2031 DBG("Spurious start report id=%"PRIu32"\n", start[0]);
2032 goto error;
2033 }
2034 if (end[0] != (query->oa.begin_report_id + 1)) {
2035 DBG("Spurious end report id=%"PRIu32"\n", end[0]);
2036 goto error;
2037 }
2038
2039 /* See if we have any periodic reports to accumulate too... */
2040
2041 /* N.B. The oa.samples_head was set when the query began and
2042 * pointed to the tail of the perf_ctx->sample_buffers list at
2043 * the time the query started. Since the buffer existed before the
2044 * first MI_REPORT_PERF_COUNT command was emitted we therefore know
2045 * that no data in this particular node's buffer can possibly be
2046 * associated with the query - so skip ahead one...
2047 */
2048 first_samples_node = query->oa.samples_head->next;
2049
2050 foreach_list_typed_from(struct oa_sample_buf, buf, link,
2051 &perf_ctx->sample_buffers,
2052 first_samples_node)
2053 {
2054 int offset = 0;
2055
2056 while (offset < buf->len) {
2057 const struct drm_i915_perf_record_header *header =
2058 (const struct drm_i915_perf_record_header *)(buf->buf + offset);
2059
2060 assert(header->size != 0);
2061 assert(header->size <= buf->len);
2062
2063 offset += header->size;
2064
2065 switch (header->type) {
2066 case DRM_I915_PERF_RECORD_SAMPLE: {
2067 uint32_t *report = (uint32_t *)(header + 1);
2068 bool add = true;
2069
2070 /* Ignore reports that come before the start marker.
2071 * (Note: takes care to allow overflow of 32bit timestamps)
2072 */
2073 if (gen_device_info_timebase_scale(devinfo,
2074 report[1] - start[1]) > 5000000000) {
2075 continue;
2076 }
2077
2078 /* Ignore reports that come after the end marker.
2079 * (Note: takes care to allow overflow of 32bit timestamps)
2080 */
2081 if (gen_device_info_timebase_scale(devinfo,
2082 report[1] - end[1]) <= 5000000000) {
2083 goto end;
2084 }
2085
2086 /* For Gen8+, since the counters continue counting while other
2087 * contexts are running, we need to discount any unrelated
2088 * deltas. The hardware automatically generates a report
2089 * on context switch which gives us a new reference point
2090 * from which to continue adding deltas.
2091 *
2092 * For Haswell we can rely on the HW to stop the progress
2093 * of OA counters while any other context is active.
2094 */
2095 if (devinfo->gen >= 8) {
2096 if (in_ctx && report[2] != query->oa.result.hw_id) {
2097 DBG("i915 perf: Switch AWAY (observed by ID change)\n");
2098 in_ctx = false;
2099 out_duration = 0;
2100 } else if (!in_ctx && report[2] == query->oa.result.hw_id) {
2101 DBG("i915 perf: Switch TO\n");
2102 in_ctx = true;
2103
2104 /* From experimentation in IGT, we found that the OA unit
2105 * might label some report as "idle" (using an invalid
2106 * context ID), right after a report for a given context.
2107 * Deltas generated by those reports actually belong to the
2108 * previous context, even though they're not labelled as
2109 * such.
2110 *
2111 * We didn't *really* Switch AWAY in the case that we e.g.
2112 * saw a single periodic report while idle...
2113 */
2114 if (out_duration >= 1)
2115 add = false;
2116 } else if (in_ctx) {
2117 assert(report[2] == query->oa.result.hw_id);
2118 DBG("i915 perf: Continuation IN\n");
2119 } else {
2120 assert(report[2] != query->oa.result.hw_id);
2121 DBG("i915 perf: Continuation OUT\n");
2122 add = false;
2123 out_duration++;
2124 }
2125 }
2126
2127 if (add) {
2128 query_result_accumulate(&query->oa.result, query->queryinfo,
2129 last, report);
2130 }
2131
2132 last = report;
2133
2134 break;
2135 }
2136
2137 case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
2138 DBG("i915 perf: OA error: all reports lost\n");
2139 goto error;
2140 case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
2141 DBG("i915 perf: OA report lost\n");
2142 break;
2143 }
2144 }
2145 }
2146
2147 end:
2148
2149 query_result_accumulate(&query->oa.result, query->queryinfo,
2150 last, end);
2151
2152 query->oa.results_accumulated = true;
2153 drop_from_unaccumulated_query_list(perf_ctx, query);
2154 dec_n_users(perf_ctx);
2155
2156 return;
2157
2158 error:
2159
2160 discard_all_queries(perf_ctx);
2161 }
2162
2163 void
2164 gen_perf_delete_query(struct gen_perf_context *perf_ctx,
2165 struct gen_perf_query_object *query)
2166 {
2167 struct gen_perf_config *perf_cfg = perf_ctx->perf;
2168
2169 /* We can assume that the frontend waits for a query to complete
2170 * before ever calling into here, so we don't have to worry about
2171 * deleting an in-flight query object.
2172 */
2173 switch (query->queryinfo->kind) {
2174 case GEN_PERF_QUERY_TYPE_OA:
2175 case GEN_PERF_QUERY_TYPE_RAW:
2176 if (query->oa.bo) {
2177 if (!query->oa.results_accumulated) {
2178 drop_from_unaccumulated_query_list(perf_ctx, query);
2179 dec_n_users(perf_ctx);
2180 }
2181
2182 perf_cfg->vtbl.bo_unreference(query->oa.bo);
2183 query->oa.bo = NULL;
2184 }
2185
2186 query->oa.results_accumulated = false;
2187 break;
2188
2189 case GEN_PERF_QUERY_TYPE_PIPELINE:
2190 if (query->pipeline_stats.bo) {
2191 perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
2192 query->pipeline_stats.bo = NULL;
2193 }
2194 break;
2195
2196 default:
2197 unreachable("Unknown query type");
2198 break;
2199 }
2200
2201 /* Deleting the last query instance is our cue that the
2202 * INTEL_performance_query extension is no longer in use: a good time to
2203 * free our cache of sample buffers and close any current i915-perf stream.
2204 */
2205 if (--perf_ctx->n_query_instances == 0) {
2206 free_sample_bufs(perf_ctx);
2207 gen_perf_close(perf_ctx, query->queryinfo);
2208 }
2209
2210 free(query);
2211 }
2212
2213 #define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT)
2214
2215 static void
2216 read_gt_frequency(struct gen_perf_context *perf_ctx,
2217 struct gen_perf_query_object *obj)
2218 {
2219 const struct gen_device_info *devinfo = perf_ctx->devinfo;
2220 uint32_t start = *((uint32_t *)(obj->oa.map + MI_FREQ_START_OFFSET_BYTES)),
2221 end = *((uint32_t *)(obj->oa.map + MI_FREQ_END_OFFSET_BYTES));
2222
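/* Judging by the scaling below, RPSTAT encodes the frequency ratio in units
 * of 50MHz on gen7/8 and 50/3 MHz (~16.66MHz) on gen9+.  The values are
 * converted to MHz here and to Hz at the end of the function.
 */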
2223 switch (devinfo->gen) {
2224 case 7:
2225 case 8:
2226 obj->oa.gt_frequency[0] = GET_FIELD(start, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
2227 obj->oa.gt_frequency[1] = GET_FIELD(end, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
2228 break;
2229 case 9:
2230 case 10:
2231 case 11:
2232 obj->oa.gt_frequency[0] = GET_FIELD(start, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
2233 obj->oa.gt_frequency[1] = GET_FIELD(end, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
2234 break;
2235 default:
2236 unreachable("unexpected gen");
2237 }
2238
2239 /* Put the numbers into Hz. */
2240 obj->oa.gt_frequency[0] *= 1000000ULL;
2241 obj->oa.gt_frequency[1] *= 1000000ULL;
2242 }
2243
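/* Write each accumulated OA counter value into the caller's buffer at the
 * counter's advertised offset and return the number of bytes written (the
 * end offset of the last counter emitted).
 */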
2244 static int
2245 get_oa_counter_data(struct gen_perf_context *perf_ctx,
2246 struct gen_perf_query_object *query,
2247 size_t data_size,
2248 uint8_t *data)
2249 {
2250 struct gen_perf_config *perf_cfg = perf_ctx->perf;
2251 const struct gen_perf_query_info *queryinfo = query->queryinfo;
2252 int n_counters = queryinfo->n_counters;
2253 int written = 0;
2254
2255 for (int i = 0; i < n_counters; i++) {
2256 const struct gen_perf_query_counter *counter = &queryinfo->counters[i];
2257 uint64_t *out_uint64;
2258 float *out_float;
2259 size_t counter_size = gen_perf_query_counter_get_size(counter);
2260
2261 if (counter_size) {
2262 switch (counter->data_type) {
2263 case GEN_PERF_COUNTER_DATA_TYPE_UINT64:
2264 out_uint64 = (uint64_t *)(data + counter->offset);
2265 *out_uint64 =
2266 counter->oa_counter_read_uint64(perf_cfg, queryinfo,
2267 query->oa.result.accumulator);
2268 break;
2269 case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
2270 out_float = (float *)(data + counter->offset);
2271 *out_float =
2272 counter->oa_counter_read_float(perf_cfg, queryinfo,
2273 query->oa.result.accumulator);
2274 break;
2275 default:
2276 /* So far we aren't using uint32, double or bool32... */
2277 unreachable("unexpected counter data type");
2278 }
2279 written = counter->offset + counter_size;
2280 }
2281 }
2282
2283 return written;
2284 }
2285
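/* Compute each pipeline statistics counter as the delta between the begin
 * and end register snapshots stored in the query BO, apply the counter's
 * numerator/denominator scaling, and pack the results as consecutive
 * uint64_t values into the caller's buffer.
 */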
2286 static int
2287 get_pipeline_stats_data(struct gen_perf_context *perf_ctx,
2288 struct gen_perf_query_object *query,
2289 size_t data_size,
2290 uint8_t *data)
2291
2292 {
2293 struct gen_perf_config *perf_cfg = perf_ctx->perf;
2294 const struct gen_perf_query_info *queryinfo = query->queryinfo;
2295 int n_counters = queryinfo->n_counters;
2296 uint8_t *p = data;
2297
2298 uint64_t *start = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->pipeline_stats.bo, MAP_READ);
2299 uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));
2300
2301 for (int i = 0; i < n_counters; i++) {
2302 const struct gen_perf_query_counter *counter = &queryinfo->counters[i];
2303 uint64_t value = end[i] - start[i];
2304
2305 if (counter->pipeline_stat.numerator !=
2306 counter->pipeline_stat.denominator) {
2307 value *= counter->pipeline_stat.numerator;
2308 value /= counter->pipeline_stat.denominator;
2309 }
2310
2311 *((uint64_t *)p) = value;
2312 p += 8;
2313 }
2314
2315 perf_cfg->vtbl.bo_unmap(query->pipeline_stats.bo);
2316
2317 return p - data;
2318 }
2319
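/**
 * Copy the query results into the application's buffer.  OA/RAW queries are
 * accumulated first if needed (unmapping the MI_RPC BO once done); RAW
 * query results are written in the MDAPI layout.  The number of bytes
 * written is reported through bytes_written when non-NULL.
 */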
2320 void
2321 gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
2322 struct gen_perf_query_object *query,
2323 int data_size,
2324 unsigned *data,
2325 unsigned *bytes_written)
2326 {
2327 struct gen_perf_config *perf_cfg = perf_ctx->perf;
2328 int written = 0;
2329
2330 switch (query->queryinfo->kind) {
2331 case GEN_PERF_QUERY_TYPE_OA:
2332 case GEN_PERF_QUERY_TYPE_RAW:
2333 if (!query->oa.results_accumulated) {
2334 read_gt_frequency(perf_ctx, query);
2335 uint32_t *begin_report = query->oa.map;
2336 uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
2337 query_result_read_frequencies(&query->oa.result,
2338 perf_ctx->devinfo,
2339 begin_report,
2340 end_report);
2341 accumulate_oa_reports(perf_ctx, query);
2342 assert(query->oa.results_accumulated);
2343
2344 perf_cfg->vtbl.bo_unmap(query->oa.bo);
2345 query->oa.map = NULL;
2346 }
2347 if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA) {
2348 written = get_oa_counter_data(perf_ctx, query, data_size, (uint8_t *)data);
2349 } else {
2350 const struct gen_device_info *devinfo = perf_ctx->devinfo;
2351
2352 written = gen_perf_query_result_write_mdapi((uint8_t *)data, data_size,
2353 devinfo, &query->oa.result,
2354 query->oa.gt_frequency[0],
2355 query->oa.gt_frequency[1]);
2356 }
2357 break;
2358
2359 case GEN_PERF_QUERY_TYPE_PIPELINE:
2360 written = get_pipeline_stats_data(perf_ctx, query, data_size, (uint8_t *)data);
2361 break;
2362
2363 default:
2364 unreachable("Unknown query type");
2365 break;
2366 }
2367
2368 if (bytes_written)
2369 *bytes_written = written;
2370 }
2371
2372 void
2373 gen_perf_dump_query_count(struct gen_perf_context *perf_ctx)
2374 {
2375 DBG("Queries: (Open queries = %d, OA users = %d)\n",
2376 perf_ctx->n_active_oa_queries, perf_ctx->n_oa_users);
2377 }
2378
2379 void
2380 gen_perf_dump_query(struct gen_perf_context *ctx,
2381 struct gen_perf_query_object *obj,
2382 void *current_batch)
2383 {
2384 switch (obj->queryinfo->kind) {
2385 case GEN_PERF_QUERY_TYPE_OA:
2386 case GEN_PERF_QUERY_TYPE_RAW:
2387 DBG("BO: %-4s OA data: %-10s %-15s\n",
2388 obj->oa.bo ? "yes," : "no,",
2389 gen_perf_is_query_ready(ctx, obj, current_batch) ? "ready," : "not ready,",
2390 obj->oa.results_accumulated ? "accumulated" : "not accumulated");
2391 break;
2392 case GEN_PERF_QUERY_TYPE_PIPELINE:
2393 DBG("BO: %-4s\n",
2394 obj->pipeline_stats.bo ? "yes" : "no");
2395 break;
2396 default:
2397 unreachable("Unknown query type");
2398 break;
2399 }
2400 }