intel/perf: make oa_sample_buffers private
[mesa.git] / src / intel / perf / gen_perf.c
1 /*
2 * Copyright © 2018 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <dirent.h>
25
26 #include <sys/types.h>
27 #include <sys/stat.h>
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <errno.h>
31
32 #include <drm-uapi/i915_drm.h>
33
34 #include "common/gen_gem.h"
35 #include "gen_perf.h"
36 #include "perf/gen_perf_mdapi.h"
37 #include "perf/gen_perf_metrics.h"
38
39 #include "dev/gen_debug.h"
40 #include "dev/gen_device_info.h"
41 #include "util/bitscan.h"
42 #include "util/u_math.h"
43
44 #define FILE_DEBUG_FLAG DEBUG_PERFMON
45 #define MI_RPC_BO_SIZE 4096
46 #define MI_FREQ_START_OFFSET_BYTES (3072)
47 #define MI_RPC_BO_END_OFFSET_BYTES (MI_RPC_BO_SIZE / 2)
48 #define MI_FREQ_END_OFFSET_BYTES (3076)
49
50 #define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low))
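/* For example, INTEL_MASK(13, 7) evaluates to 0x3f80: seven set bits
 * (13 - 7 + 1) shifted up by 7, matching GEN7_RPSTAT1_CURR_GT_FREQ_MASK
 * below.
 */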
51
52 #define GEN7_RPSTAT1 0xA01C
53 #define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7
54 #define GEN7_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(13, 7)
55 #define GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT 0
56 #define GEN7_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(6, 0)
57
58 #define GEN9_RPSTAT0 0xA01C
59 #define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT 23
60 #define GEN9_RPSTAT0_CURR_GT_FREQ_MASK INTEL_MASK(31, 23)
61 #define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT 0
62 #define GEN9_RPSTAT0_PREV_GT_FREQ_MASK INTEL_MASK(8, 0)
63
64 #define GEN6_SO_PRIM_STORAGE_NEEDED 0x2280
65 #define GEN7_SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
66 #define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288
67 #define GEN7_SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
68
69 #define MAP_READ (1 << 0)
70 #define MAP_WRITE (1 << 1)
71
72 /**
73 * Periodic OA samples are read() into these buffer structures via the
74 * i915 perf kernel interface and appended to the
75 * perf_ctx->sample_buffers linked list. When we process the
76 * results of an OA metrics query we need to consider all the periodic
77 * samples between the Begin and End MI_REPORT_PERF_COUNT command
78 * markers.
79 *
80 * 'Periodic' is a simplification: other automatic reports written by
81 * the hardware are also buffered here.
82 *
83 * Considering three queries, A, B and C:
84 *
85 * Time ---->
86 * ________________A_________________
87 * | |
88 * | ________B_________ _____C___________
89 * | | | | | |
90 *
91 * And an illustration of sample buffers read over this time frame:
92 * [HEAD ][ ][ ][ ][ ][ ][ ][ ][TAIL ]
93 *
94 * These nodes may hold samples for query A:
95 * [ ][ ][ A ][ A ][ A ][ A ][ A ][ ][ ]
96 *
97 * These nodes may hold samples for query B:
98 * [ ][ ][ B ][ B ][ B ][ ][ ][ ][ ]
99 *
100 * These nodes may hold samples for query C:
101 * [ ][ ][ ][ ][ ][ C ][ C ][ C ][ ]
102 *
103 * The illustration assumes we have an even distribution of periodic
104 * samples so all nodes have the same size plotted against time.
105 *
106 * Note: to simplify the code, the list is never empty.
107 *
108 * With overlapping queries we can see that periodic OA reports may
109 * relate to multiple queries and care needs to be taken to keep
110 * track of sample buffers until there are no queries that might
111 * depend on their contents.
112 *
113 * We use a node ref counting system where a reference ensures that a
114 * node and all following nodes can't be freed/recycled until the
115 * reference drops to zero.
116 *
117 * E.g. with a ref of one here:
118 * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
119 *
120 * These nodes could be freed or recycled ("reaped"):
121 * [ 0 ][ 0 ]
122 *
123 * These must be preserved until the leading ref drops to zero:
124 * [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
125 *
126 * When a query starts we take a reference on the current tail of
127 * the list, knowing that no already-buffered samples can possibly
128 * relate to the newly-started query. A pointer to this node is
129 * also saved in the query object's ->oa.samples_head.
130 *
131 * E.g. starting query A while there are two nodes in .sample_buffers:
132 * ________________A________
133 * |
134 *
135 * [ 0 ][ 1 ]
136 * ^_______ Add a reference and store pointer to node in
137 * A->oa.samples_head
138 *
139 * Moving forward to when the B query starts with no new buffer nodes:
140 * (for reference, i915 perf reads() are only done when queries finish)
141 * ________________A_______
142 * | ________B___
143 * | |
144 *
145 * [ 0 ][ 2 ]
146 * ^_______ Add a reference and store pointer to
147 * node in B->oa.samples_head
148 *
149 * Once an OA query has become 'Ready', i.e. once the End OA report
150 * has landed and we have processed all the intermediate periodic
151 * samples, we drop the ->oa.samples_head reference we took at the
152 * start.
153 *
154 * So when the B query has finished we have:
155 * ________________A________
156 * | ______B___________
157 * | | |
158 * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ]
159 * ^_______ Drop B->oa.samples_head reference
160 *
161 * We still can't free these due to the A->oa.samples_head ref:
162 * [ 1 ][ 0 ][ 0 ][ 0 ]
163 *
164 * When the A query finishes: (note there's a new ref for C's samples_head)
165 * ________________A_________________
166 * | |
167 * | _____C_________
168 * | | |
169 * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ]
170 * ^_______ Drop A->oa.samples_head reference
171 *
172 * And we can now reap these nodes up to the C->oa.samples_head:
173 * [ X ][ X ][ X ][ X ]
174 * keeping -> [ 1 ][ 0 ][ 0 ]
175 *
176 * We reap old sample buffers each time we finish processing an OA
177 * query by iterating the sample_buffers list from the head until we
178 * find a referenced node and stop.
179 *
180 * Reaped buffers move to a perfquery.free_sample_buffers list and
181 * when we come to read() we first look to recycle a buffer from the
182 * free_sample_buffers list before allocating a new buffer.
183 */
184 struct oa_sample_buf {
185 struct exec_node link;
186 int refcount;
187 int len;
188 uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
189 uint32_t last_timestamp;
190 };
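
/* A minimal usage sketch (not verbatim from this file) of the ref-counting
 * scheme described above: a query pins the current tail sample buffer when
 * it begins and unpins it once its results have been accumulated, at which
 * point unreferenced buffers can be reaped:
 *
 *    struct exec_node *tail = exec_list_get_tail(&perf_ctx->sample_buffers);
 *    struct oa_sample_buf *buf =
 *       exec_node_data(struct oa_sample_buf, tail, link);
 *    buf->refcount++;                            // on query Begin
 *    ...
 *    buf->refcount--;                            // after accumulation
 *    gen_perf_reap_old_sample_buffers(perf_ctx); // recycle unreferenced nodes
 */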
191
192 struct gen_perf_query_object *
193 gen_perf_new_query(struct gen_perf_context *perf_ctx, unsigned query_index)
194 {
195 const struct gen_perf_query_info *query =
196 &perf_ctx->perf->queries[query_index];
197 struct gen_perf_query_object *obj =
198 calloc(1, sizeof(struct gen_perf_query_object));
199
200 if (!obj)
201 return NULL;
202
203 obj->queryinfo = query;
204
205 perf_ctx->n_query_instances++;
206 return obj;
207 }
208
209 static bool
210 get_sysfs_dev_dir(struct gen_perf_config *perf, int fd)
211 {
212 struct stat sb;
213 int min, maj;
214 DIR *drmdir;
215 struct dirent *drm_entry;
216 int len;
217
218 perf->sysfs_dev_dir[0] = '\0';
219
220 if (fstat(fd, &sb)) {
221 DBG("Failed to stat DRM fd\n");
222 return false;
223 }
224
225 maj = major(sb.st_rdev);
226 min = minor(sb.st_rdev);
227
228 if (!S_ISCHR(sb.st_mode)) {
229 DBG("DRM fd is not a character device as expected\n");
230 return false;
231 }
232
233 len = snprintf(perf->sysfs_dev_dir,
234 sizeof(perf->sysfs_dev_dir),
235 "/sys/dev/char/%d:%d/device/drm", maj, min);
236 if (len < 0 || len >= sizeof(perf->sysfs_dev_dir)) {
237 DBG("Failed to concatenate sysfs path to drm device\n");
238 return false;
239 }
240
241 drmdir = opendir(perf->sysfs_dev_dir);
242 if (!drmdir) {
243 DBG("Failed to open %s: %m\n", perf->sysfs_dev_dir);
244 return false;
245 }
246
247 while ((drm_entry = readdir(drmdir))) {
248 if ((drm_entry->d_type == DT_DIR ||
249 drm_entry->d_type == DT_LNK) &&
250 strncmp(drm_entry->d_name, "card", 4) == 0)
251 {
252 len = snprintf(perf->sysfs_dev_dir,
253 sizeof(perf->sysfs_dev_dir),
254 "/sys/dev/char/%d:%d/device/drm/%s",
255 maj, min, drm_entry->d_name);
256 closedir(drmdir);
257 if (len < 0 || len >= sizeof(perf->sysfs_dev_dir))
258 return false;
259 else
260 return true;
261 }
262 }
263
264 closedir(drmdir);
265
266 DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
267 maj, min);
268
269 return false;
270 }
271
272 static bool
273 read_file_uint64(const char *file, uint64_t *val)
274 {
275 char buf[32];
276 int fd, n;
277
278 fd = open(file, 0);
279 if (fd < 0)
280 return false;
281 while ((n = read(fd, buf, sizeof (buf) - 1)) < 0 &&
282 errno == EINTR);
283 close(fd);
284 if (n < 0)
285 return false;
286
287 buf[n] = '\0';
288 *val = strtoull(buf, NULL, 0);
289
290 return true;
291 }
292
293 static bool
294 read_sysfs_drm_device_file_uint64(struct gen_perf_config *perf,
295 const char *file,
296 uint64_t *value)
297 {
298 char buf[512];
299 int len;
300
301 len = snprintf(buf, sizeof(buf), "%s/%s", perf->sysfs_dev_dir, file);
302 if (len < 0 || len >= sizeof(buf)) {
303 DBG("Failed to concatenate sys filename to read u64 from\n");
304 return false;
305 }
306
307 return read_file_uint64(buf, value);
308 }
309
310 static inline struct gen_perf_query_info *
311 append_query_info(struct gen_perf_config *perf, int max_counters)
312 {
313 struct gen_perf_query_info *query;
314
315 perf->queries = reralloc(perf, perf->queries,
316 struct gen_perf_query_info,
317 ++perf->n_queries);
318 query = &perf->queries[perf->n_queries - 1];
319 memset(query, 0, sizeof(*query));
320
321 if (max_counters > 0) {
322 query->max_counters = max_counters;
323 query->counters =
324 rzalloc_array(perf, struct gen_perf_query_counter, max_counters);
325 }
326
327 return query;
328 }
329
330 static void
331 register_oa_config(struct gen_perf_config *perf,
332 const struct gen_perf_query_info *query,
333 uint64_t config_id)
334 {
335 struct gen_perf_query_info *registered_query = append_query_info(perf, 0);
336
337 *registered_query = *query;
338 registered_query->oa_metrics_set_id = config_id;
339 DBG("metric set registered: id = %" PRIu64", guid = %s\n",
340 registered_query->oa_metrics_set_id, query->guid);
341 }
342
343 static void
344 enumerate_sysfs_metrics(struct gen_perf_config *perf)
345 {
346 DIR *metricsdir = NULL;
347 struct dirent *metric_entry;
348 char buf[256];
349 int len;
350
351 len = snprintf(buf, sizeof(buf), "%s/metrics", perf->sysfs_dev_dir);
352 if (len < 0 || len >= sizeof(buf)) {
353 DBG("Failed to concatenate path to sysfs metrics/ directory\n");
354 return;
355 }
356
357 metricsdir = opendir(buf);
358 if (!metricsdir) {
359 DBG("Failed to open %s: %m\n", buf);
360 return;
361 }
362
363 while ((metric_entry = readdir(metricsdir))) {
364 struct hash_entry *entry;
365
366 if ((metric_entry->d_type != DT_DIR &&
367 metric_entry->d_type != DT_LNK) ||
368 metric_entry->d_name[0] == '.')
369 continue;
370
371 DBG("metric set: %s\n", metric_entry->d_name);
372 entry = _mesa_hash_table_search(perf->oa_metrics_table,
373 metric_entry->d_name);
374 if (entry) {
375 uint64_t id;
376
377 len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id",
378 perf->sysfs_dev_dir, metric_entry->d_name);
379 if (len < 0 || len >= sizeof(buf)) {
380 DBG("Failed to concatenate path to sysfs metric id file\n");
381 continue;
382 }
383
384 if (!read_file_uint64(buf, &id)) {
385 DBG("Failed to read metric set id from %s: %m", buf);
386 continue;
387 }
388
389 register_oa_config(perf, (const struct gen_perf_query_info *)entry->data, id);
390 } else
391 DBG("metric set not known by mesa (skipping)\n");
392 }
393
394 closedir(metricsdir);
395 }
396
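/* Probe for kernel support of dynamically added OA configs: if the
 * DRM_IOCTL_I915_PERF_REMOVE_CONFIG ioctl is implemented, it fails with
 * ENOENT for an invalid config id, whereas a kernel without it fails
 * with a different errno.
 */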
397 static bool
398 kernel_has_dynamic_config_support(struct gen_perf_config *perf, int fd)
399 {
400 uint64_t invalid_config_id = UINT64_MAX;
401
402 return gen_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG,
403 &invalid_config_id) < 0 && errno == ENOENT;
404 }
405
406 bool
407 gen_perf_load_metric_id(struct gen_perf_config *perf, const char *guid,
408 uint64_t *metric_id)
409 {
410 char config_path[280];
411
412 snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id",
413 perf->sysfs_dev_dir, guid);
414
415 /* Don't recreate already loaded configs. */
416 return read_file_uint64(config_path, metric_id);
417 }
418
419 static void
420 init_oa_configs(struct gen_perf_config *perf, int fd)
421 {
422 hash_table_foreach(perf->oa_metrics_table, entry) {
423 const struct gen_perf_query_info *query = entry->data;
424 struct drm_i915_perf_oa_config config;
425 uint64_t config_id;
426 int ret;
427
428 if (gen_perf_load_metric_id(perf, query->guid, &config_id)) {
429 DBG("metric set: %s (already loaded)\n", query->guid);
430 register_oa_config(perf, query, config_id);
431 continue;
432 }
433
434 memset(&config, 0, sizeof(config));
435
436 memcpy(config.uuid, query->guid, sizeof(config.uuid));
437
438 config.n_mux_regs = query->n_mux_regs;
439 config.mux_regs_ptr = (uintptr_t) query->mux_regs;
440
441 config.n_boolean_regs = query->n_b_counter_regs;
442 config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs;
443
444 config.n_flex_regs = query->n_flex_regs;
445 config.flex_regs_ptr = (uintptr_t) query->flex_regs;
446
447 ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
448 if (ret < 0) {
449 DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
450 query->name, query->guid, strerror(errno));
451 continue;
452 }
453
454 register_oa_config(perf, query, ret);
455 DBG("metric set: %s (added)\n", query->guid);
456 }
457 }
458
459 static void
460 compute_topology_builtins(struct gen_perf_config *perf,
461 const struct gen_device_info *devinfo)
462 {
463 perf->sys_vars.slice_mask = devinfo->slice_masks;
464 perf->sys_vars.n_eu_slices = devinfo->num_slices;
465
466 for (int i = 0; i < sizeof(devinfo->subslice_masks); i++) {
467 perf->sys_vars.n_eu_sub_slices +=
468 __builtin_popcount(devinfo->subslice_masks[i]);
469 }
470
471 for (int i = 0; i < sizeof(devinfo->eu_masks); i++)
472 perf->sys_vars.n_eus += __builtin_popcount(devinfo->eu_masks[i]);
473
474 perf->sys_vars.eu_threads_count = devinfo->num_thread_per_eu;
475
476 /* The subslice mask builtin contains bits for all slices. Prior to Gen11
477 * it had groups of 3 bits for each slice; on Gen11 it's 8 bits for each
478 * slice.
479 *
480 * Ideally equations would be updated to have a slice/subslice query
481 * function/operator.
482 */
483 perf->sys_vars.subslice_mask = 0;
484
485 int bits_per_subslice = devinfo->gen == 11 ? 8 : 3;
486
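/* For example, with bits_per_subslice == 3 (pre-Gen11), subslice 2 of
 * slice 1 lands in bit (1 * 3 + 2) == 5 of the subslice_mask builtin.
 */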
487 for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) {
488 for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) {
489 if (gen_device_info_subslice_available(devinfo, s, ss))
490 perf->sys_vars.subslice_mask |= 1ULL << (s * bits_per_subslice + ss);
491 }
492 }
493 }
494
495 static bool
496 init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *devinfo)
497 {
498 uint64_t min_freq_mhz = 0, max_freq_mhz = 0;
499
500 if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz))
501 return false;
502
503 if (!read_sysfs_drm_device_file_uint64(perf, "gt_max_freq_mhz", &max_freq_mhz))
504 return false;
505
506 memset(&perf->sys_vars, 0, sizeof(perf->sys_vars));
507 perf->sys_vars.gt_min_freq = min_freq_mhz * 1000000;
508 perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000;
509 perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
510 perf->sys_vars.revision = devinfo->revision;
511 compute_topology_builtins(perf, devinfo);
512
513 return true;
514 }
515
516 typedef void (*perf_register_oa_queries_t)(struct gen_perf_config *);
517
518 static perf_register_oa_queries_t
519 get_register_queries_function(const struct gen_device_info *devinfo)
520 {
521 if (devinfo->is_haswell)
522 return gen_oa_register_queries_hsw;
523 if (devinfo->is_cherryview)
524 return gen_oa_register_queries_chv;
525 if (devinfo->is_broadwell)
526 return gen_oa_register_queries_bdw;
527 if (devinfo->is_broxton)
528 return gen_oa_register_queries_bxt;
529 if (devinfo->is_skylake) {
530 if (devinfo->gt == 2)
531 return gen_oa_register_queries_sklgt2;
532 if (devinfo->gt == 3)
533 return gen_oa_register_queries_sklgt3;
534 if (devinfo->gt == 4)
535 return gen_oa_register_queries_sklgt4;
536 }
537 if (devinfo->is_kabylake) {
538 if (devinfo->gt == 2)
539 return gen_oa_register_queries_kblgt2;
540 if (devinfo->gt == 3)
541 return gen_oa_register_queries_kblgt3;
542 }
543 if (devinfo->is_geminilake)
544 return gen_oa_register_queries_glk;
545 if (devinfo->is_coffeelake) {
546 if (devinfo->gt == 2)
547 return gen_oa_register_queries_cflgt2;
548 if (devinfo->gt == 3)
549 return gen_oa_register_queries_cflgt3;
550 }
551 if (devinfo->is_cannonlake)
552 return gen_oa_register_queries_cnl;
553 if (devinfo->gen == 11)
554 return gen_oa_register_queries_icl;
555
556 return NULL;
557 }
558
559 static inline void
560 add_stat_reg(struct gen_perf_query_info *query, uint32_t reg,
561 uint32_t numerator, uint32_t denominator,
562 const char *name, const char *description)
563 {
564 struct gen_perf_query_counter *counter;
565
566 assert(query->n_counters < query->max_counters);
567
568 counter = &query->counters[query->n_counters];
569 counter->name = name;
570 counter->desc = description;
571 counter->type = GEN_PERF_COUNTER_TYPE_RAW;
572 counter->data_type = GEN_PERF_COUNTER_DATA_TYPE_UINT64;
573 counter->offset = sizeof(uint64_t) * query->n_counters;
574 counter->pipeline_stat.reg = reg;
575 counter->pipeline_stat.numerator = numerator;
576 counter->pipeline_stat.denominator = denominator;
577
578 query->n_counters++;
579 }
580
581 static inline void
582 add_basic_stat_reg(struct gen_perf_query_info *query,
583 uint32_t reg, const char *name)
584 {
585 add_stat_reg(query, reg, 1, 1, name, name);
586 }
587
588 static void
589 load_pipeline_statistic_metrics(struct gen_perf_config *perf_cfg,
590 const struct gen_device_info *devinfo)
591 {
592 struct gen_perf_query_info *query =
593 append_query_info(perf_cfg, MAX_STAT_COUNTERS);
594
595 query->kind = GEN_PERF_QUERY_TYPE_PIPELINE;
596 query->name = "Pipeline Statistics Registers";
597
598 add_basic_stat_reg(query, IA_VERTICES_COUNT,
599 "N vertices submitted");
600 add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
601 "N primitives submitted");
602 add_basic_stat_reg(query, VS_INVOCATION_COUNT,
603 "N vertex shader invocations");
604
605 if (devinfo->gen == 6) {
606 add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
607 "SO_PRIM_STORAGE_NEEDED",
608 "N geometry shader stream-out primitives (total)");
609 add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1,
610 "SO_NUM_PRIMS_WRITTEN",
611 "N geometry shader stream-out primitives (written)");
612 } else {
613 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
614 "SO_PRIM_STORAGE_NEEDED (Stream 0)",
615 "N stream-out (stream 0) primitives (total)");
616 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
617 "SO_PRIM_STORAGE_NEEDED (Stream 1)",
618 "N stream-out (stream 1) primitives (total)");
619 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
620 "SO_PRIM_STORAGE_NEEDED (Stream 2)",
621 "N stream-out (stream 2) primitives (total)");
622 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
623 "SO_PRIM_STORAGE_NEEDED (Stream 3)",
624 "N stream-out (stream 3) primitives (total)");
625 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
626 "SO_NUM_PRIMS_WRITTEN (Stream 0)",
627 "N stream-out (stream 0) primitives (written)");
628 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
629 "SO_NUM_PRIMS_WRITTEN (Stream 1)",
630 "N stream-out (stream 1) primitives (written)");
631 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
632 "SO_NUM_PRIMS_WRITTEN (Stream 2)",
633 "N stream-out (stream 2) primitives (written)");
634 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
635 "SO_NUM_PRIMS_WRITTEN (Stream 3)",
636 "N stream-out (stream 3) primitives (written)");
637 }
638
639 add_basic_stat_reg(query, HS_INVOCATION_COUNT,
640 "N TCS shader invocations");
641 add_basic_stat_reg(query, DS_INVOCATION_COUNT,
642 "N TES shader invocations");
643
644 add_basic_stat_reg(query, GS_INVOCATION_COUNT,
645 "N geometry shader invocations");
646 add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
647 "N geometry shader primitives emitted");
648
649 add_basic_stat_reg(query, CL_INVOCATION_COUNT,
650 "N primitives entering clipping");
651 add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
652 "N primitives leaving clipping");
653
654 if (devinfo->is_haswell || devinfo->gen == 8) {
655 add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
656 "N fragment shader invocations",
657 "N fragment shader invocations");
658 } else {
659 add_basic_stat_reg(query, PS_INVOCATION_COUNT,
660 "N fragment shader invocations");
661 }
662
663 add_basic_stat_reg(query, PS_DEPTH_COUNT,
664 "N z-pass fragments");
665
666 if (devinfo->gen >= 7) {
667 add_basic_stat_reg(query, CS_INVOCATION_COUNT,
668 "N compute shader invocations");
669 }
670
671 query->data_size = sizeof(uint64_t) * query->n_counters;
672 }
673
674 static bool
675 load_oa_metrics(struct gen_perf_config *perf, int fd,
676 const struct gen_device_info *devinfo)
677 {
678 perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo);
679 bool i915_perf_oa_available = false;
680 struct stat sb;
681
682 /* The existence of this sysctl parameter implies the kernel supports
683 * the i915 perf interface.
684 */
685 if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {
686
687 /* If _paranoid == 1 then on Gen8+ we won't be able to access OA
688 * metrics unless running as root.
689 */
690 if (devinfo->is_haswell)
691 i915_perf_oa_available = true;
692 else {
693 uint64_t paranoid = 1;
694
695 read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", &paranoid);
696
697 if (paranoid == 0 || geteuid() == 0)
698 i915_perf_oa_available = true;
699 }
700 }
701
702 if (!i915_perf_oa_available ||
703 !oa_register ||
704 !get_sysfs_dev_dir(perf, fd) ||
705 !init_oa_sys_vars(perf, devinfo))
706 return false;
707
708 perf->oa_metrics_table =
709 _mesa_hash_table_create(perf, _mesa_key_hash_string,
710 _mesa_key_string_equal);
711
712 /* Index all the metric sets mesa knows about before looking to see what
713 * the kernel is advertising.
714 */
715 oa_register(perf);
716
717 if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) &&
718 kernel_has_dynamic_config_support(perf, fd))
719 init_oa_configs(perf, fd);
720 else
721 enumerate_sysfs_metrics(perf);
722
723 return true;
724 }
725
726 /* Accumulate 32bits OA counters */
727 static inline void
728 accumulate_uint32(const uint32_t *report0,
729 const uint32_t *report1,
730 uint64_t *accumulator)
731 {
732 *accumulator += (uint32_t)(*report1 - *report0);
733 }
734
735 /* Accumulate 40bits OA counters */
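/* In the I915_OA_FORMAT_A32u40_A4u32_B8_C8 layout the low 32 bits of
 * A-counter <n> sit at dword (4 + n) of the report and the high 8 bits
 * at byte <n> of the block starting at dword 40.  A counter wrap between
 * two reports shows up as value0 > value1, in which case the delta is
 * computed as 2^40 + value1 - value0.
 */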
736 static inline void
737 accumulate_uint40(int a_index,
738 const uint32_t *report0,
739 const uint32_t *report1,
740 uint64_t *accumulator)
741 {
742 const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
743 const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
744 uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
745 uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
746 uint64_t value0 = report0[a_index + 4] | high0;
747 uint64_t value1 = report1[a_index + 4] | high1;
748 uint64_t delta;
749
750 if (value0 > value1)
751 delta = (1ULL << 40) + value1 - value0;
752 else
753 delta = value1 - value0;
754
755 *accumulator += delta;
756 }
757
758 static void
759 gen8_read_report_clock_ratios(const uint32_t *report,
760 uint64_t *slice_freq_hz,
761 uint64_t *unslice_freq_hz)
762 {
763 /* Some bits of the RPT_ID field of the OA reports contain a snapshot
764 * of the bits coming from the RP_FREQ_NORMAL register, divided this
765 * way:
766 *
767 * RPT_ID[31:25]: RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency)
768 * RPT_ID[10:9]: RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency)
769 * RPT_ID[8:0]: RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency)
770 *
771 * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request
772 * Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
773 *
774 * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request
775 * Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
776 */
777
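/* For example (illustrative value), a squashed unslice ratio of 30 read
 * from RPT_ID[8:0] decodes to 30 * 16.67MHz ~= 500MHz with the
 * multiplication below.
 */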
778 uint32_t unslice_freq = report[0] & 0x1ff;
779 uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
780 uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
781 uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);
782
783 *slice_freq_hz = slice_freq * 16666667ULL;
784 *unslice_freq_hz = unslice_freq * 16666667ULL;
785 }
786
787 void
788 gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
789 const struct gen_device_info *devinfo,
790 const uint32_t *start,
791 const uint32_t *end)
792 {
793 /* Slice/Unslice frequency is only available in the OA reports when the
794 * "Disable OA reports due to clock ratio change" field in
795 * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this
796 * global register (see drivers/gpu/drm/i915/i915_perf.c)
797 *
798 * Documentation says this should be available on Gen9+ but experimentation
799 * shows that Gen8 reports similar values, so we enable it there too.
800 */
801 if (devinfo->gen < 8)
802 return;
803
804 gen8_read_report_clock_ratios(start,
805 &result->slice_frequency[0],
806 &result->unslice_frequency[0]);
807 gen8_read_report_clock_ratios(end,
808 &result->slice_frequency[1],
809 &result->unslice_frequency[1]);
810 }
811
812 void
813 gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
814 const struct gen_perf_query_info *query,
815 const uint32_t *start,
816 const uint32_t *end)
817 {
818 int i, idx = 0;
819
820 result->hw_id = start[2];
821 result->reports_accumulated++;
822
823 switch (query->oa_format) {
824 case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
825 accumulate_uint32(start + 1, end + 1, result->accumulator + idx++); /* timestamp */
826 accumulate_uint32(start + 3, end + 3, result->accumulator + idx++); /* clock */
827
828 /* 32x 40bit A counters... */
829 for (i = 0; i < 32; i++)
830 accumulate_uint40(i, start, end, result->accumulator + idx++);
831
832 /* 4x 32bit A counters... */
833 for (i = 0; i < 4; i++)
834 accumulate_uint32(start + 36 + i, end + 36 + i, result->accumulator + idx++);
835
836 /* 8x 32bit B counters + 8x 32bit C counters... */
837 for (i = 0; i < 16; i++)
838 accumulate_uint32(start + 48 + i, end + 48 + i, result->accumulator + idx++);
839 break;
840
841 case I915_OA_FORMAT_A45_B8_C8:
842 accumulate_uint32(start + 1, end + 1, result->accumulator); /* timestamp */
843
844 for (i = 0; i < 61; i++)
845 accumulate_uint32(start + 3 + i, end + 3 + i, result->accumulator + 1 + i);
846 break;
847
848 default:
849 unreachable("Can't accumulate OA counters in unknown format");
850 }
851
852 }
853
854 void
855 gen_perf_query_result_clear(struct gen_perf_query_result *result)
856 {
857 memset(result, 0, sizeof(*result));
858 result->hw_id = 0xffffffff; /* invalid */
859 }
860
861 static void
862 gen_perf_query_register_mdapi_statistic_query(struct gen_perf_config *perf_cfg,
863 const struct gen_device_info *devinfo)
864 {
865 if (!(devinfo->gen >= 7 && devinfo->gen <= 11))
866 return;
867
868 struct gen_perf_query_info *query =
869 append_query_info(perf_cfg, MAX_STAT_COUNTERS);
870
871 query->kind = GEN_PERF_QUERY_TYPE_PIPELINE;
872 query->name = "Intel_Raw_Pipeline_Statistics_Query";
873
874 /* The order has to match mdapi_pipeline_metrics. */
875 add_basic_stat_reg(query, IA_VERTICES_COUNT,
876 "N vertices submitted");
877 add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
878 "N primitives submitted");
879 add_basic_stat_reg(query, VS_INVOCATION_COUNT,
880 "N vertex shader invocations");
881 add_basic_stat_reg(query, GS_INVOCATION_COUNT,
882 "N geometry shader invocations");
883 add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
884 "N geometry shader primitives emitted");
885 add_basic_stat_reg(query, CL_INVOCATION_COUNT,
886 "N primitives entering clipping");
887 add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
888 "N primitives leaving clipping");
889 if (devinfo->is_haswell || devinfo->gen == 8) {
890 add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
891 "N fragment shader invocations",
892 "N fragment shader invocations");
893 } else {
894 add_basic_stat_reg(query, PS_INVOCATION_COUNT,
895 "N fragment shader invocations");
896 }
897 add_basic_stat_reg(query, HS_INVOCATION_COUNT,
898 "N TCS shader invocations");
899 add_basic_stat_reg(query, DS_INVOCATION_COUNT,
900 "N TES shader invocations");
901 if (devinfo->gen >= 7) {
902 add_basic_stat_reg(query, CS_INVOCATION_COUNT,
903 "N compute shader invocations");
904 }
905
906 if (devinfo->gen >= 10) {
907 /* Reuse existing CS invocation register until we can expose this new
908 * one.
909 */
910 add_basic_stat_reg(query, CS_INVOCATION_COUNT,
911 "Reserved1");
912 }
913
914 query->data_size = sizeof(uint64_t) * query->n_counters;
915 }
916
917 static void
918 fill_mdapi_perf_query_counter(struct gen_perf_query_info *query,
919 const char *name,
920 uint32_t data_offset,
921 uint32_t data_size,
922 enum gen_perf_counter_data_type data_type)
923 {
924 struct gen_perf_query_counter *counter = &query->counters[query->n_counters];
925
926 assert(query->n_counters <= query->max_counters);
927
928 counter->name = name;
929 counter->desc = "Raw counter value";
930 counter->type = GEN_PERF_COUNTER_TYPE_RAW;
931 counter->data_type = data_type;
932 counter->offset = data_offset;
933
934 query->n_counters++;
935
936 assert(counter->offset + gen_perf_query_counter_get_size(counter) <= query->data_size);
937 }
938
939 #define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \
940 fill_mdapi_perf_query_counter(query, #field_name, \
941 (uint8_t *) &struct_name.field_name - \
942 (uint8_t *) &struct_name, \
943 sizeof(struct_name.field_name), \
944 GEN_PERF_COUNTER_DATA_TYPE_##type_name)
945 #define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \
946 fill_mdapi_perf_query_counter(query, \
947 ralloc_asprintf(ctx, "%s%i", #field_name, idx), \
948 (uint8_t *) &struct_name.field_name[idx] - \
949 (uint8_t *) &struct_name, \
950 sizeof(struct_name.field_name[0]), \
951 GEN_PERF_COUNTER_DATA_TYPE_##type_name)
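
/* Illustrative use (not additional code in this file): on Gen8,
 *
 *    struct gen8_mdapi_metrics metric_data;
 *    MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
 *
 * registers a raw UINT64 counter named "GPUTicks" whose offset is the byte
 * offset of the GPUTicks field within struct gen8_mdapi_metrics, computed
 * with the pointer arithmetic above.
 */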
952
953 static void
954 register_mdapi_oa_query(const struct gen_device_info *devinfo,
955 struct gen_perf_config *perf)
956 {
957 struct gen_perf_query_info *query = NULL;
958
959 /* MDAPI requires different structures for pretty much every generation
960 * (right now we have definitions for gen 7 to 11).
961 */
962 if (!(devinfo->gen >= 7 && devinfo->gen <= 11))
963 return;
964
965 switch (devinfo->gen) {
966 case 7: {
967 query = append_query_info(perf, 1 + 45 + 16 + 7);
968 query->oa_format = I915_OA_FORMAT_A45_B8_C8;
969
970 struct gen7_mdapi_metrics metric_data;
971 query->data_size = sizeof(metric_data);
972
973 MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
974 for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) {
975 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
976 metric_data, ACounters, i, UINT64);
977 }
978 for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) {
979 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
980 metric_data, NOACounters, i, UINT64);
981 }
982 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
983 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
984 MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
985 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
986 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
987 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
988 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
989 break;
990 }
991 case 8: {
992 query = append_query_info(perf, 2 + 36 + 16 + 16);
993 query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;
994
995 struct gen8_mdapi_metrics metric_data;
996 query->data_size = sizeof(metric_data);
997
998 MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
999 MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
1000 for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
1001 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1002 metric_data, OaCntr, i, UINT64);
1003 }
1004 for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
1005 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1006 metric_data, NoaCntr, i, UINT64);
1007 }
1008 MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
1009 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
1010 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
1011 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
1012 MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
1013 MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
1014 MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
1015 MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
1016 MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
1017 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
1018 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
1019 MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
1020 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
1021 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
1022 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
1023 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
1024 break;
1025 }
1026 case 9:
1027 case 10:
1028 case 11: {
1029 query = append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2);
1030 query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;
1031
1032 struct gen9_mdapi_metrics metric_data;
1033 query->data_size = sizeof(metric_data);
1034
1035 MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
1036 MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
1037 for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
1038 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1039 metric_data, OaCntr, i, UINT64);
1040 }
1041 for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
1042 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1043 metric_data, NoaCntr, i, UINT64);
1044 }
1045 MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
1046 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
1047 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
1048 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
1049 MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
1050 MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
1051 MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
1052 MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
1053 MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
1054 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
1055 MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
1056 MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
1057 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
1058 MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
1059 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
1060 MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
1061 for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) {
1062 MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
1063 metric_data, UserCntr, i, UINT64);
1064 }
1065 MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32);
1066 MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32);
1067 break;
1068 }
1069 default:
1070 unreachable("Unsupported gen");
1071 break;
1072 }
1073
1074 query->kind = GEN_PERF_QUERY_TYPE_RAW;
1075 query->name = "Intel_Raw_Hardware_Counters_Set_0_Query";
1076 query->guid = GEN_PERF_QUERY_GUID_MDAPI;
1077
1078 {
1079 /* Accumulation buffer offsets copied from an actual query... */
1080 const struct gen_perf_query_info *copy_query =
1081 &perf->queries[0];
1082
1083 query->gpu_time_offset = copy_query->gpu_time_offset;
1084 query->gpu_clock_offset = copy_query->gpu_clock_offset;
1085 query->a_offset = copy_query->a_offset;
1086 query->b_offset = copy_query->b_offset;
1087 query->c_offset = copy_query->c_offset;
1088 }
1089 }
1090
1091 uint64_t
1092 gen_perf_query_get_metric_id(struct gen_perf_config *perf,
1093 const struct gen_perf_query_info *query)
1094 {
1095 /* These queries are known not to ever change; their config ID has been
1096 * loaded upon the first query creation. No need to look them up again.
1097 */
1098 if (query->kind == GEN_PERF_QUERY_TYPE_OA)
1099 return query->oa_metrics_set_id;
1100
1101 assert(query->kind == GEN_PERF_QUERY_TYPE_RAW);
1102
1103 /* Raw queries can be reprogrammed by an external application/library.
1104 * When a raw query is used for the first time its ID is set to a value !=
1105 * 0. When it stops being used the ID returns to 0. No need to reload the
1106 * ID when it's already loaded.
1107 */
1108 if (query->oa_metrics_set_id != 0) {
1109 DBG("Raw query '%s' guid=%s using cached ID: %"PRIu64"\n",
1110 query->name, query->guid, query->oa_metrics_set_id);
1111 return query->oa_metrics_set_id;
1112 }
1113
1114 struct gen_perf_query_info *raw_query = (struct gen_perf_query_info *)query;
1115 if (!gen_perf_load_metric_id(perf, query->guid,
1116 &raw_query->oa_metrics_set_id)) {
1117 DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid);
1118 raw_query->oa_metrics_set_id = 1ULL;
1119 } else {
1120 DBG("Raw query '%s'guid=%s loaded ID: %"PRIu64"\n",
1121 query->name, query->guid, query->oa_metrics_set_id);
1122 }
1123 return query->oa_metrics_set_id;
1124 }
1125
1126 struct oa_sample_buf *
1127 gen_perf_get_free_sample_buf(struct gen_perf_context *perf_ctx)
1128 {
1129 struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers);
1130 struct oa_sample_buf *buf;
1131
1132 if (node)
1133 buf = exec_node_data(struct oa_sample_buf, node, link);
1134 else {
1135 buf = ralloc_size(perf_ctx->perf, sizeof(*buf));
1136
1137 exec_node_init(&buf->link);
1138 buf->refcount = 0;
1139 buf->len = 0;
1140 }
1141
1142 return buf;
1143 }
1144
1145 void
1146 gen_perf_reap_old_sample_buffers(struct gen_perf_context *perf_ctx)
1147 {
1148 struct exec_node *tail_node =
1149 exec_list_get_tail(&perf_ctx->sample_buffers);
1150 struct oa_sample_buf *tail_buf =
1151 exec_node_data(struct oa_sample_buf, tail_node, link);
1152
1153 /* Remove all old, unreferenced sample buffers walking forward from
1154 * the head of the list, except always leave at least one node in
1155 * the list so we always have a node to reference when we Begin
1156 * a new query.
1157 */
1158 foreach_list_typed_safe(struct oa_sample_buf, buf, link,
1159 &perf_ctx->sample_buffers)
1160 {
1161 if (buf->refcount == 0 && buf != tail_buf) {
1162 exec_node_remove(&buf->link);
1163 exec_list_push_head(&perf_ctx->free_sample_buffers, &buf->link);
1164 } else
1165 return;
1166 }
1167 }
1168
1169 void
1170 gen_perf_free_sample_bufs(struct gen_perf_context *perf_ctx)
1171 {
1172 foreach_list_typed_safe(struct oa_sample_buf, buf, link,
1173 &perf_ctx->free_sample_buffers)
1174 ralloc_free(buf);
1175
1176 exec_list_make_empty(&perf_ctx->free_sample_buffers);
1177 }
1178
1179 /******************************************************************************/
1180
1181 /**
1182 * Emit MI_STORE_REGISTER_MEM commands to capture all of the
1183 * pipeline statistics for the performance query object.
1184 */
1185 void
1186 gen_perf_snapshot_statistics_registers(void *context,
1187 struct gen_perf_config *perf,
1188 struct gen_perf_query_object *obj,
1189 uint32_t offset_in_bytes)
1190 {
1191 const struct gen_perf_query_info *query = obj->queryinfo;
1192 const int n_counters = query->n_counters;
1193
1194 for (int i = 0; i < n_counters; i++) {
1195 const struct gen_perf_query_counter *counter = &query->counters[i];
1196
1197 assert(counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64);
1198
1199 perf->vtbl.store_register_mem64(context, obj->pipeline_stats.bo,
1200 counter->pipeline_stat.reg,
1201 offset_in_bytes + i * sizeof(uint64_t));
1202 }
1203 }
1204
1205 void
1206 gen_perf_close(struct gen_perf_context *perfquery,
1207 const struct gen_perf_query_info *query)
1208 {
1209 if (perfquery->oa_stream_fd != -1) {
1210 close(perfquery->oa_stream_fd);
1211 perfquery->oa_stream_fd = -1;
1212 }
1213 if (query->kind == GEN_PERF_QUERY_TYPE_RAW) {
1214 struct gen_perf_query_info *raw_query =
1215 (struct gen_perf_query_info *) query;
1216 raw_query->oa_metrics_set_id = 0;
1217 }
1218 }
1219
1220 bool
1221 gen_perf_open(struct gen_perf_context *perf_ctx,
1222 int metrics_set_id,
1223 int report_format,
1224 int period_exponent,
1225 int drm_fd,
1226 uint32_t ctx_id)
1227 {
1228 uint64_t properties[] = {
1229 /* Single context sampling */
1230 DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id,
1231
1232 /* Include OA reports in samples */
1233 DRM_I915_PERF_PROP_SAMPLE_OA, true,
1234
1235 /* OA unit configuration */
1236 DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
1237 DRM_I915_PERF_PROP_OA_FORMAT, report_format,
1238 DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent,
1239 };
1240 struct drm_i915_perf_open_param param = {
1241 .flags = I915_PERF_FLAG_FD_CLOEXEC |
1242 I915_PERF_FLAG_FD_NONBLOCK |
1243 I915_PERF_FLAG_DISABLED,
1244 .num_properties = ARRAY_SIZE(properties) / 2,
1245 .properties_ptr = (uintptr_t) properties,
1246 };
1247 int fd = gen_ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
1248 if (fd == -1) {
1249 DBG("Error opening gen perf OA stream: %m\n");
1250 return false;
1251 }
1252
1253 perf_ctx->oa_stream_fd = fd;
1254
1255 perf_ctx->current_oa_metrics_set_id = metrics_set_id;
1256 perf_ctx->current_oa_format = report_format;
1257
1258 return true;
1259 }
1260
1261 bool
1262 gen_perf_inc_n_users(struct gen_perf_context *perf_ctx)
1263 {
1264 if (perf_ctx->n_oa_users == 0 &&
1265 gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0)
1266 {
1267 return false;
1268 }
1269 ++perf_ctx->n_oa_users;
1270
1271 return true;
1272 }
1273
1274 void
1275 gen_perf_dec_n_users(struct gen_perf_context *perf_ctx)
1276 {
1277 /* Disabling the i915 perf stream will effectively disable the OA
1278 * counters. Note it's important to be sure there are no outstanding
1279 * MI_RPC commands at this point since they could stall the CS
1280 * indefinitely once OACONTROL is disabled.
1281 */
1282 --perf_ctx->n_oa_users;
1283 if (perf_ctx->n_oa_users == 0 &&
1284 gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0)
1285 {
1286 DBG("WARNING: Error disabling gen perf stream: %m\n");
1287 }
1288 }
1289
1290 void
1291 gen_perf_init_metrics(struct gen_perf_config *perf_cfg,
1292 const struct gen_device_info *devinfo,
1293 int drm_fd)
1294 {
1295 load_pipeline_statistic_metrics(perf_cfg, devinfo);
1296 gen_perf_query_register_mdapi_statistic_query(perf_cfg, devinfo);
1297 if (load_oa_metrics(perf_cfg, drm_fd, devinfo))
1298 register_mdapi_oa_query(devinfo, perf_cfg);
1299 }
1300
1301 void
1302 gen_perf_init_context(struct gen_perf_context *perf_ctx,
1303 struct gen_perf_config *perf_cfg,
1304 void * ctx, /* driver context (eg, brw_context) */
1305 void * bufmgr, /* eg brw_bufmgr */
1306 const struct gen_device_info *devinfo,
1307 uint32_t hw_ctx,
1308 int drm_fd)
1309 {
1310 perf_ctx->perf = perf_cfg;
1311 perf_ctx->ctx = ctx;
1312 perf_ctx->bufmgr = bufmgr;
1313 perf_ctx->drm_fd = drm_fd;
1314 perf_ctx->hw_ctx = hw_ctx;
1315 perf_ctx->devinfo = devinfo;
1316
1317 perf_ctx->unaccumulated =
1318 ralloc_array(ctx, struct gen_perf_query_object *, 2);
1319 perf_ctx->unaccumulated_elements = 0;
1320 perf_ctx->unaccumulated_array_size = 2;
1321
1322 exec_list_make_empty(&perf_ctx->sample_buffers);
1323 exec_list_make_empty(&perf_ctx->free_sample_buffers);
1324
1325 /* It's convenient to guarantee that this linked list of sample
1326 * buffers is never empty so we add an empty head so when we
1327 * Begin an OA query we can always take a reference on a buffer
1328 * in this list.
1329 */
1330 struct oa_sample_buf *buf = gen_perf_get_free_sample_buf(perf_ctx);
1331 exec_list_push_head(&perf_ctx->sample_buffers, &buf->link);
1332
1333 perf_ctx->oa_stream_fd = -1;
1334 perf_ctx->next_query_start_report_id = 1000;
1335 }
1336
1337 /**
1338 * Add a query to the global list of "unaccumulated queries."
1339 *
1340 * Queries are tracked here until all the associated OA reports have
1341 * been accumulated via accumulate_oa_reports() after the end
1342 * MI_REPORT_PERF_COUNT has landed in query->oa.bo.
1343 */
1344 static void
1345 add_to_unaccumulated_query_list(struct gen_perf_context *perf_ctx,
1346 struct gen_perf_query_object *obj)
1347 {
1348 if (perf_ctx->unaccumulated_elements >=
1349 perf_ctx->unaccumulated_array_size)
1350 {
1351 perf_ctx->unaccumulated_array_size *= 1.5;
1352 perf_ctx->unaccumulated =
1353 reralloc(perf_ctx->ctx, perf_ctx->unaccumulated,
1354 struct gen_perf_query_object *,
1355 perf_ctx->unaccumulated_array_size);
1356 }
1357
1358 perf_ctx->unaccumulated[perf_ctx->unaccumulated_elements++] = obj;
1359 }
1360
1361 bool
1362 gen_perf_begin_query(struct gen_perf_context *perf_ctx,
1363 struct gen_perf_query_object *query)
1364 {
1365 struct gen_perf_config *perf_cfg = perf_ctx->perf;
1366 const struct gen_perf_query_info *queryinfo = query->queryinfo;
1367
1368 /* XXX: We have to consider that the command parser unit that parses batch
1369 * buffer commands and is used to capture begin/end counter snapshots isn't
1370 * implicitly synchronized with what's currently running across other GPU
1371 * units (such as the EUs running shaders) that the performance counters are
1372 * associated with.
1373 *
1374 * The intention of performance queries is to measure the work associated
1375 * with commands between the begin/end delimiters and so for that to be the
1376 * case we need to explicitly synchronize the parsing of commands to capture
1377 * Begin/End counter snapshots with what's running across other parts of the
1378 * GPU.
1379 *
1380 * When the command parser reaches a Begin marker it effectively needs to
1381 * drain everything currently running on the GPU until the hardware is idle
1382 * before capturing the first snapshot of counters - otherwise the results
1383 * would also be measuring the effects of earlier commands.
1384 *
1385 * When the command parser reaches an End marker it needs to stall until
1386 * everything currently running on the GPU has finished before capturing the
1387 * end snapshot - otherwise the results won't be a complete representation
1388 * of the work.
1389 *
1390 * Theoretically there could be opportunities to minimize how much of the
1391 * GPU pipeline is drained, or that we stall for, when we know what specific
1392 * units the performance counters being queried relate to but we don't
1393 * currently attempt to be clever here.
1394 *
1395 * Note: with our current simple approach, back-to-back queries will
1396 * redundantly emit duplicate commands to synchronize the command
1397 * streamer with the rest of the GPU pipeline, but we assume that in HW the
1398 * second synchronization is effectively a NOOP.
1399 *
1400 * N.B. The final results are based on deltas of counters between (inside)
1401 * Begin/End markers so even though the total wall clock time of the
1402 * workload is stretched by larger pipeline bubbles the bubbles themselves
1403 * are generally invisible to the query results. Whether that's a good or a
1404 * bad thing depends on the use case. For a lower real-time impact while
1405 * capturing metrics then periodic sampling may be a better choice than
1406 * INTEL_performance_query.
1407 *
1408 *
1409 * This is our Begin synchronization point to drain current work on the
1410 * GPU before we capture our first counter snapshot...
1411 */
1412 perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
1413
1414 switch (queryinfo->kind) {
1415 case GEN_PERF_QUERY_TYPE_OA:
1416 case GEN_PERF_QUERY_TYPE_RAW: {
1417
1418 /* Opening an i915 perf stream implies exclusive access to the OA unit
1419 * which will generate counter reports for a specific counter set with a
1420 * specific layout/format so we can't begin any OA based queries that
1421 * require a different counter set or format unless we get an opportunity
1422 * to close the stream and open a new one...
1423 */
1424 uint64_t metric_id = gen_perf_query_get_metric_id(perf_ctx->perf, queryinfo);
1425
1426 if (perf_ctx->oa_stream_fd != -1 &&
1427 perf_ctx->current_oa_metrics_set_id != metric_id) {
1428
1429 if (perf_ctx->n_oa_users != 0) {
1430 DBG("WARNING: Begin failed already using perf config=%i/%"PRIu64"\n",
1431 perf_ctx->current_oa_metrics_set_id, metric_id);
1432 return false;
1433 } else
1434 gen_perf_close(perf_ctx, queryinfo);
1435 }
1436
1437 /* If the OA counters aren't already on, enable them. */
1438 if (perf_ctx->oa_stream_fd == -1) {
1439 const struct gen_device_info *devinfo = perf_ctx->devinfo;
1440
1441 /* The period_exponent gives a sampling period as follows:
1442 * sample_period = timestamp_period * 2^(period_exponent + 1)
1443 *
1444 * The timestamp increments every 80ns (HSW), ~52ns (GEN9LP) or
1445 * ~83ns (GEN8/9).
1446 *
1447 * The counter overflow period is derived from the EuActive counter
1448 * which reads a counter that increments by the number of clock
1449 * cycles multiplied by the number of EUs. It can be calculated as:
1450 *
1451 * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2)
1452 *
1453 * (E.g. 40 EUs @ 1GHz = ~53ms)
1454 *
1455 * We select a sampling period shorter than that overflow period to
1456 * ensure we cannot see more than 1 counter overflow; otherwise we
1457 * could lose information.
1458 */
1459
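/* Worked example (illustrative numbers): with 40-bit A counters and 48
 * EUs, overflow_period = 2^40 / (48 * 2) ns ~= 11.4 seconds, so any
 * sampling period below that is safe.
 */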
1460 int a_counter_in_bits = 32;
1461 if (devinfo->gen >= 8)
1462 a_counter_in_bits = 40;
1463
1464 uint64_t overflow_period = pow(2, a_counter_in_bits) / (perf_cfg->sys_vars.n_eus *
1465 /* drop 1GHz freq to have units in nanoseconds */
1466 2);
1467
1468 DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
1469 overflow_period, overflow_period / 1000000ul, perf_cfg->sys_vars.n_eus);
1470
1471 int period_exponent = 0;
1472 uint64_t prev_sample_period, next_sample_period;
1473 for (int e = 0; e < 30; e++) {
1474 prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
1475 next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;
1476
1477 /* Take the previous sampling period, lower than the overflow
1478 * period.
1479 */
1480 if (prev_sample_period < overflow_period &&
1481 next_sample_period > overflow_period)
1482 period_exponent = e + 1;
1483 }
1484
1485 if (period_exponent == 0) {
1486 DBG("WARNING: enable to find a sampling exponent\n");
1487 return false;
1488 }
1489
1490 DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
1491 prev_sample_period / 1000000ul);
1492
1493 if (!gen_perf_open(perf_ctx, metric_id, queryinfo->oa_format,
1494 period_exponent, perf_ctx->drm_fd,
1495 perf_ctx->hw_ctx))
1496 return false;
1497 } else {
1498 assert(perf_ctx->current_oa_metrics_set_id == metric_id &&
1499 perf_ctx->current_oa_format == queryinfo->oa_format);
1500 }
1501
1502 if (!gen_perf_inc_n_users(perf_ctx)) {
1503 DBG("WARNING: Error enabling i915 perf stream: %m\n");
1504 return false;
1505 }
1506
1507 if (query->oa.bo) {
1508 perf_cfg->vtbl.bo_unreference(query->oa.bo);
1509 query->oa.bo = NULL;
1510 }
1511
1512 query->oa.bo = perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
1513 "perf. query OA MI_RPC bo",
1514 MI_RPC_BO_SIZE);
1515 #ifdef DEBUG
1516 /* Pre-filling the BO helps debug whether writes landed. */
1517 void *map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_WRITE);
1518 memset(map, 0x80, MI_RPC_BO_SIZE);
1519 perf_cfg->vtbl.bo_unmap(query->oa.bo);
1520 #endif
1521
1522 query->oa.begin_report_id = perf_ctx->next_query_start_report_id;
1523 perf_ctx->next_query_start_report_id += 2;
1524
1525 /* We flush the batchbuffer here to minimize the chances that MI_RPC
1526 * delimiting commands end up in different batchbuffers. If that's the
1527 * case, the measurement will include the time it takes for the kernel
1528 * scheduler to load a new request into the hardware. This is manifested in
1529 * tools like frameretrace by spikes in the "GPU Core Clocks" counter.
1530 */
1531 perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);
1532
1533 /* Take a starting OA counter snapshot. */
1534 perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, 0,
1535 query->oa.begin_report_id);
1536 perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo,
1537 MI_FREQ_START_OFFSET_BYTES);
1538
1539 ++perf_ctx->n_active_oa_queries;
1540
1541 /* No already-buffered samples can possibly be associated with this query
1542 * so create a marker within the list of sample buffers enabling us to
1543 * easily ignore earlier samples when processing this query after
1544 * completion.
1545 */
1546 assert(!exec_list_is_empty(&perf_ctx->sample_buffers));
1547 query->oa.samples_head = exec_list_get_tail(&perf_ctx->sample_buffers);
1548
1549 struct oa_sample_buf *buf =
1550 exec_node_data(struct oa_sample_buf, query->oa.samples_head, link);
1551
1552 /* This reference will ensure that future/following sample
1553 * buffers (that may relate to this query) can't be freed until
1554 * this drops to zero.
1555 */
1556 buf->refcount++;
1557
1558 gen_perf_query_result_clear(&query->oa.result);
1559 query->oa.results_accumulated = false;
1560
1561 add_to_unaccumulated_query_list(perf_ctx, query);
1562 break;
1563 }
1564
1565 case GEN_PERF_QUERY_TYPE_PIPELINE:
1566 if (query->pipeline_stats.bo) {
1567 perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
1568 query->pipeline_stats.bo = NULL;
1569 }
1570
1571 query->pipeline_stats.bo =
1572 perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
1573 "perf. query pipeline stats bo",
1574 STATS_BO_SIZE);
1575
1576 /* Take starting snapshots. */
1577 gen_perf_snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query, 0);
1578
1579 ++perf_ctx->n_active_pipeline_stats_queries;
1580 break;
1581
1582 default:
1583 unreachable("Unknown query type");
1584 break;
1585 }
1586
1587 return true;
1588 }
1589
1590 void
1591 gen_perf_end_query(struct gen_perf_context *perf_ctx,
1592 struct gen_perf_query_object *query)
1593 {
1594 struct gen_perf_config *perf_cfg = perf_ctx->perf;
1595
1596 /* Ensure that the work associated with the queried commands will have
1597 * finished before taking our query end counter readings.
1598 *
1599 * For more details see the comment in gen_perf_begin_query about the
1600 * corresponding flush.
1601 */
1602 perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
1603
1604 switch (query->queryinfo->kind) {
1605 case GEN_PERF_QUERY_TYPE_OA:
1606 case GEN_PERF_QUERY_TYPE_RAW:
1607
1608 /* NB: It's possible that the query will have already been marked
1609 * as 'accumulated' if an error was seen while reading samples
1610 * from perf. In this case we mustn't try and emit a closing
1611 * MI_RPC command in case the OA unit has already been disabled
1612 */
1613 if (!query->oa.results_accumulated) {
1614 /* Take an ending OA counter snapshot. */
1615 perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo,
1616 MI_FREQ_END_OFFSET_BYTES);
1617 perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo,
1618 MI_RPC_BO_END_OFFSET_BYTES,
1619 query->oa.begin_report_id + 1);
1620 }
1621
1622 --perf_ctx->n_active_oa_queries;
1623
1624 /* NB: even though the query has now ended, it can't be accumulated
1625 * until the end MI_REPORT_PERF_COUNT snapshot has been written
1626 * to query->oa.bo
1627 */
1628 break;
1629
1630 case GEN_PERF_QUERY_TYPE_PIPELINE:
1631 gen_perf_snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query,
1632 STATS_BO_END_OFFSET_BYTES);
1633 --perf_ctx->n_active_pipeline_stats_queries;
1634 break;
1635
1636 default:
1637 unreachable("Unknown query type");
1638 break;
1639 }
1640 }
1641
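     /* Status of an attempt to read pending periodic OA samples from the
      * i915 perf stream:
      *
      *   ERROR:      the read failed or the stream returned an unexpected EOF,
      *   UNFINISHED: no more data is buffered right now but the end timestamp
      *               hasn't been reached yet,
      *   FINISHED:   samples covering the full query window (up to the end
      *               timestamp) have been read.
      */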
1642 enum OaReadStatus {
1643 OA_READ_STATUS_ERROR,
1644 OA_READ_STATUS_UNFINISHED,
1645 OA_READ_STATUS_FINISHED,
1646 };
1647
1648 static enum OaReadStatus
1649 read_oa_samples_until(struct gen_perf_context *perf_ctx,
1650 uint32_t start_timestamp,
1651 uint32_t end_timestamp)
1652 {
1653 struct exec_node *tail_node =
1654 exec_list_get_tail(&perf_ctx->sample_buffers);
1655 struct oa_sample_buf *tail_buf =
1656 exec_node_data(struct oa_sample_buf, tail_node, link);
1657 uint32_t last_timestamp = tail_buf->last_timestamp;
1658
1659 while (1) {
1660 struct oa_sample_buf *buf = gen_perf_get_free_sample_buf(perf_ctx);
1661 uint32_t offset;
1662 int len;
1663
1664 while ((len = read(perf_ctx->oa_stream_fd, buf->buf,
1665 sizeof(buf->buf))) < 0 && errno == EINTR)
1666 ;
1667
1668 if (len <= 0) {
1669 exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link);
1670
1671 if (len < 0) {
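                 /* EAGAIN means the kernel currently has no more samples
                  * buffered for us. We're finished if the last timestamp we
                  * have read has reached the end timestamp; both sides are
                  * compared as distances from start_timestamp so that a
                  * 32-bit timestamp wrap between start and end is handled
                  * correctly.
                  */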
1672 if (errno == EAGAIN)
1673 return ((last_timestamp - start_timestamp) >=
1674 (end_timestamp - start_timestamp)) ?
1675 OA_READ_STATUS_FINISHED :
1676 OA_READ_STATUS_UNFINISHED;
1677 else {
1678 DBG("Error reading i915 perf samples: %m\n");
1679 }
1680 } else
1681 DBG("Spurious EOF reading i915 perf samples\n");
1682
1683 return OA_READ_STATUS_ERROR;
1684 }
1685
1686 buf->len = len;
1687 exec_list_push_tail(&perf_ctx->sample_buffers, &buf->link);
1688
1689 /* Go through the reports and update the last timestamp. */
1690 offset = 0;
1691 while (offset < buf->len) {
1692 const struct drm_i915_perf_record_header *header =
1693 (const struct drm_i915_perf_record_header *) &buf->buf[offset];
1694 uint32_t *report = (uint32_t *) (header + 1);
1695
1696 if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
1697 last_timestamp = report[1];
1698
1699 offset += header->size;
1700 }
1701
1702 buf->last_timestamp = last_timestamp;
1703 }
1704
1705 unreachable("not reached");
1706 return OA_READ_STATUS_ERROR;
1707 }
1708
1709 /**
1710 * Try to read all the reports until either the delimiting timestamp
1711 * or an error arises.
1712 */
1713 static bool
1714 read_oa_samples_for_query(struct gen_perf_context *perf_ctx,
1715 struct gen_perf_query_object *query,
1716 void *current_batch)
1717 {
1718 uint32_t *start;
1719 uint32_t *last;
1720 uint32_t *end;
1721 struct gen_perf_config *perf_cfg = perf_ctx->perf;
1722
1723 /* We need the MI_REPORT_PERF_COUNT to land before we can start
1724     * accumulating. */
1725 assert(!perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
1726 !perf_cfg->vtbl.bo_busy(query->oa.bo));
1727
1728    /* Map the BO once here; gen_perf_get_query_data() unmaps it again
1729     * once the reports have been accumulated. */
1730 if (query->oa.map == NULL)
1731 query->oa.map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_READ);
1732
1733 start = last = query->oa.map;
1734 end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
1735
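        /* If either report ID looks wrong there is no point polling the
         * stream any further: report the read as finished and let
         * accumulate_oa_reports() hit the same mismatch and discard the
         * query results.
         */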
1736 if (start[0] != query->oa.begin_report_id) {
1737 DBG("Spurious start report id=%"PRIu32"\n", start[0]);
1738 return true;
1739 }
1740 if (end[0] != (query->oa.begin_report_id + 1)) {
1741 DBG("Spurious end report id=%"PRIu32"\n", end[0]);
1742 return true;
1743 }
1744
1745 /* Read the reports until the end timestamp. */
1746 switch (read_oa_samples_until(perf_ctx, start[1], end[1])) {
1747 case OA_READ_STATUS_ERROR:
1748 /* Fallthrough and let accumulate_oa_reports() deal with the
1749 * error. */
1750 case OA_READ_STATUS_FINISHED:
1751 return true;
1752 case OA_READ_STATUS_UNFINISHED:
1753 return false;
1754 }
1755
1756 unreachable("invalid read status");
1757 return false;
1758 }
1759
1760 void
1761 gen_perf_wait_query(struct gen_perf_context *perf_ctx,
1762 struct gen_perf_query_object *query,
1763 void *current_batch)
1764 {
1765 struct gen_perf_config *perf_cfg = perf_ctx->perf;
1766 struct brw_bo *bo = NULL;
1767
1768 switch (query->queryinfo->kind) {
1769 case GEN_PERF_QUERY_TYPE_OA:
1770 case GEN_PERF_QUERY_TYPE_RAW:
1771 bo = query->oa.bo;
1772 break;
1773
1774 case GEN_PERF_QUERY_TYPE_PIPELINE:
1775 bo = query->pipeline_stats.bo;
1776 break;
1777
1778 default:
1779 unreachable("Unknown query type");
1780 break;
1781 }
1782
1783 if (bo == NULL)
1784 return;
1785
1786 /* If the current batch references our results bo then we need to
1787 * flush first...
1788 */
1789 if (perf_cfg->vtbl.batch_references(current_batch, bo))
1790 perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);
1791
1792 perf_cfg->vtbl.bo_wait_rendering(bo);
1793
1794 /* Due to a race condition between the OA unit signaling report
1795 * availability and the report actually being written into memory,
1796 * we need to wait for all the reports to come in before we can
1797 * read them.
1798 */
1799 if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA ||
1800 query->queryinfo->kind == GEN_PERF_QUERY_TYPE_RAW) {
1801 while (!read_oa_samples_for_query(perf_ctx, query, current_batch))
1802 ;
1803 }
1804 }
1805
1806 bool
1807 gen_perf_is_query_ready(struct gen_perf_context *perf_ctx,
1808 struct gen_perf_query_object *query,
1809 void *current_batch)
1810 {
1811 struct gen_perf_config *perf_cfg = perf_ctx->perf;
1812
1813 switch (query->queryinfo->kind) {
1814 case GEN_PERF_QUERY_TYPE_OA:
1815 case GEN_PERF_QUERY_TYPE_RAW:
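           /* An OA query is ready once its reports have been accumulated,
            * or once the end MI_RPC snapshot has landed (the BO is neither
            * referenced by the current batch nor still busy) and all the
            * periodic samples up to the end timestamp could be read from
            * the i915 perf stream.
            */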
1816 return (query->oa.results_accumulated ||
1817 (query->oa.bo &&
1818 !perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
1819 !perf_cfg->vtbl.bo_busy(query->oa.bo) &&
1820 read_oa_samples_for_query(perf_ctx, query, current_batch)));
1821 case GEN_PERF_QUERY_TYPE_PIPELINE:
1822 return (query->pipeline_stats.bo &&
1823 !perf_cfg->vtbl.batch_references(current_batch, query->pipeline_stats.bo) &&
1824 !perf_cfg->vtbl.bo_busy(query->pipeline_stats.bo));
1825
1826 default:
1827 unreachable("Unknown query type");
1828 break;
1829 }
1830
1831 return false;
1832 }
1833
1834 /**
1835  * Remove a query from the global list of unaccumulated queries, either
1836  * after the OA reports associated with the query have been successfully
1837  * accumulated in accumulate_oa_reports() or when discarding unwanted
1838  * query results.
1839 */
1840 static void
1841 drop_from_unaccumulated_query_list(struct gen_perf_context *perf_ctx,
1842 struct gen_perf_query_object *query)
1843 {
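        /* Unordered removal: move the last element of the array into the
         * removed query's slot instead of shifting everything down.
         */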
1844 for (int i = 0; i < perf_ctx->unaccumulated_elements; i++) {
1845 if (perf_ctx->unaccumulated[i] == query) {
1846 int last_elt = --perf_ctx->unaccumulated_elements;
1847
1848 if (i == last_elt)
1849 perf_ctx->unaccumulated[i] = NULL;
1850 else {
1851 perf_ctx->unaccumulated[i] =
1852 perf_ctx->unaccumulated[last_elt];
1853 }
1854
1855 break;
1856 }
1857 }
1858
1859 /* Drop our samples_head reference so that associated periodic
1860 * sample data buffers can potentially be reaped if they aren't
1861 * referenced by any other queries...
1862 */
1863
1864 struct oa_sample_buf *buf =
1865 exec_node_data(struct oa_sample_buf, query->oa.samples_head, link);
1866
1867 assert(buf->refcount > 0);
1868 buf->refcount--;
1869
1870 query->oa.samples_head = NULL;
1871
1872 gen_perf_reap_old_sample_buffers(perf_ctx);
1873 }
1874
1875 /* In general, if we see anything spurious while accumulating results
1876  * we don't try to keep accumulating the current query in the hope
1877  * that things will work out; we scrap everything outstanding and then
1878  * hope for the best with new queries.
1879 */
1880 static void
1881 discard_all_queries(struct gen_perf_context *perf_ctx)
1882 {
1883 while (perf_ctx->unaccumulated_elements) {
1884 struct gen_perf_query_object *query = perf_ctx->unaccumulated[0];
1885
1886 query->oa.results_accumulated = true;
1887 drop_from_unaccumulated_query_list(perf_ctx, query);
1888
1889 gen_perf_dec_n_users(perf_ctx);
1890 }
1891 }
1892
1893 /**
1894 * Accumulate raw OA counter values based on deltas between pairs of
1895 * OA reports.
1896 *
1897 * Accumulation starts from the first report captured via
1898  * MI_REPORT_PERF_COUNT (MI_RPC) by gen_perf_begin_query() until the
1899  * last MI_RPC report requested by gen_perf_end_query(). Between these
1900  * two reports there may also be some number of periodically sampled OA
1901 * reports collected via the i915 perf interface - depending on the
1902 * duration of the query.
1903 *
1904 * These periodic snapshots help to ensure we handle counter overflow
1905 * correctly by being frequent enough to ensure we don't miss multiple
1906 * overflows of a counter between snapshots. For Gen8+ the i915 perf
1907 * snapshots provide the extra context-switch reports that let us
1908 * subtract out the progress of counters associated with other
1909 * contexts running on the system.
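      *
      * Conceptually the accumulation walks the time-ordered sequence of
      * reports (begin MI_RPC snapshot, periodic/context-switch reports,
      * end MI_RPC snapshot) and, for each pair of consecutive reports that
      * is kept, adds the per-counter deltas into query->oa.result. Roughly:
      *
      *    for each consecutive pair (last, report):
      *       accumulator[i] += delta(last[i], report[i]);
      *
      * where delta() is computed with wrap-around-safe arithmetic so that a
      * counter overflowing between two reports still yields the correct
      * difference (see gen_perf_query_result_accumulate()).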
1910 */
1911 static void
1912 accumulate_oa_reports(struct gen_perf_context *perf_ctx,
1913 struct gen_perf_query_object *query)
1914 {
1915 const struct gen_device_info *devinfo = perf_ctx->devinfo;
1916 uint32_t *start;
1917 uint32_t *last;
1918 uint32_t *end;
1919 struct exec_node *first_samples_node;
1920 bool in_ctx = true;
1921 int out_duration = 0;
1922
1923 assert(query->oa.map != NULL);
1924
1925 start = last = query->oa.map;
1926 end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
1927
1928 if (start[0] != query->oa.begin_report_id) {
1929 DBG("Spurious start report id=%"PRIu32"\n", start[0]);
1930 goto error;
1931 }
1932 if (end[0] != (query->oa.begin_report_id + 1)) {
1933 DBG("Spurious end report id=%"PRIu32"\n", end[0]);
1934 goto error;
1935 }
1936
1937 /* See if we have any periodic reports to accumulate too... */
1938
1939 /* N.B. The oa.samples_head was set when the query began and
1940 * pointed to the tail of the perf_ctx->sample_buffers list at
1941 * the time the query started. Since the buffer existed before the
1942 * first MI_REPORT_PERF_COUNT command was emitted we therefore know
1943 * that no data in this particular node's buffer can possibly be
1944 * associated with the query - so skip ahead one...
1945 */
1946 first_samples_node = query->oa.samples_head->next;
1947
1948 foreach_list_typed_from(struct oa_sample_buf, buf, link,
1949                             &perf_ctx->sample_buffers,
1950 first_samples_node)
1951 {
1952 int offset = 0;
1953
1954 while (offset < buf->len) {
1955 const struct drm_i915_perf_record_header *header =
1956 (const struct drm_i915_perf_record_header *)(buf->buf + offset);
1957
1958 assert(header->size != 0);
1959 assert(header->size <= buf->len);
1960
1961 offset += header->size;
1962
1963 switch (header->type) {
1964 case DRM_I915_PERF_RECORD_SAMPLE: {
1965 uint32_t *report = (uint32_t *)(header + 1);
1966 bool add = true;
1967
1968 /* Ignore reports that come before the start marker.
1969 * (Note: takes care to allow overflow of 32bit timestamps)
1970 */
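                 /* Note on the wrap-around handling: OA timestamps are only
                  * 32 bits, so the subtractions below wrap. A report taken
                  * shortly before the start (or end) snapshot produces a
                  * huge wrapped difference, while a report genuinely after
                  * it produces a small one; scaling to nanoseconds and
                  * comparing against 5 seconds (5000000000 ns) is the
                  * heuristic that separates the two cases, here and in the
                  * end-marker check below.
                  */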
1971 if (gen_device_info_timebase_scale(devinfo,
1972 report[1] - start[1]) > 5000000000) {
1973 continue;
1974 }
1975
1976 /* Ignore reports that come after the end marker.
1977 * (Note: takes care to allow overflow of 32bit timestamps)
1978 */
1979 if (gen_device_info_timebase_scale(devinfo,
1980 report[1] - end[1]) <= 5000000000) {
1981 goto end;
1982 }
1983
1984 /* For Gen8+ since the counters continue while other
1985 * contexts are running we need to discount any unrelated
1986 * deltas. The hardware automatically generates a report
1987 * on context switch which gives us a new reference point
1988              * to continue adding deltas from.
1989 *
1990 * For Haswell we can rely on the HW to stop the progress
1991              * of OA counters while any other context is active.
1992 */
1993 if (devinfo->gen >= 8) {
1994 if (in_ctx && report[2] != query->oa.result.hw_id) {
1995 DBG("i915 perf: Switch AWAY (observed by ID change)\n");
1996 in_ctx = false;
1997 out_duration = 0;
1998 } else if (in_ctx == false && report[2] == query->oa.result.hw_id) {
1999 DBG("i915 perf: Switch TO\n");
2000 in_ctx = true;
2001
2002 /* From experimentation in IGT, we found that the OA unit
2003 * might label some report as "idle" (using an invalid
2004 * context ID), right after a report for a given context.
2005 * Deltas generated by those reports actually belong to the
2006 * previous context, even though they're not labelled as
2007 * such.
2008 *
2009 * We didn't *really* Switch AWAY in the case that we e.g.
2010 * saw a single periodic report while idle...
2011 */
2012 if (out_duration >= 1)
2013 add = false;
2014 } else if (in_ctx) {
2015 assert(report[2] == query->oa.result.hw_id);
2016 DBG("i915 perf: Continuation IN\n");
2017 } else {
2018 assert(report[2] != query->oa.result.hw_id);
2019 DBG("i915 perf: Continuation OUT\n");
2020 add = false;
2021 out_duration++;
2022 }
2023 }
2024
2025 if (add) {
2026 gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo,
2027 last, report);
2028 }
2029
2030 last = report;
2031
2032 break;
2033 }
2034
2035 case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
2036 DBG("i915 perf: OA error: all reports lost\n");
2037 goto error;
2038 case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
2039 DBG("i915 perf: OA report lost\n");
2040 break;
2041 }
2042 }
2043 }
2044
2045 end:
2046
2047 gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo,
2048 last, end);
2049
2050 query->oa.results_accumulated = true;
2051 drop_from_unaccumulated_query_list(perf_ctx, query);
2052 gen_perf_dec_n_users(perf_ctx);
2053
2054 return;
2055
2056 error:
2057
2058 discard_all_queries(perf_ctx);
2059 }
2060
2061 void
2062 gen_perf_delete_query(struct gen_perf_context *perf_ctx,
2063 struct gen_perf_query_object *query)
2064 {
2065 struct gen_perf_config *perf_cfg = perf_ctx->perf;
2066
2067 /* We can assume that the frontend waits for a query to complete
2068 * before ever calling into here, so we don't have to worry about
2069 * deleting an in-flight query object.
2070 */
2071 switch (query->queryinfo->kind) {
2072 case GEN_PERF_QUERY_TYPE_OA:
2073 case GEN_PERF_QUERY_TYPE_RAW:
2074 if (query->oa.bo) {
2075 if (!query->oa.results_accumulated) {
2076 drop_from_unaccumulated_query_list(perf_ctx, query);
2077 gen_perf_dec_n_users(perf_ctx);
2078 }
2079
2080 perf_cfg->vtbl.bo_unreference(query->oa.bo);
2081 query->oa.bo = NULL;
2082 }
2083
2084 query->oa.results_accumulated = false;
2085 break;
2086
2087 case GEN_PERF_QUERY_TYPE_PIPELINE:
2088 if (query->pipeline_stats.bo) {
2089 perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
2090 query->pipeline_stats.bo = NULL;
2091 }
2092 break;
2093
2094 default:
2095 unreachable("Unknown query type");
2096 break;
2097 }
2098
2099    /* Once the last query instance is deleted, the INTEL_performance_query
2100     * extension is no longer in use, so it's a good time to free our cache
2101     * of sample buffers and close any current i915-perf stream.
2102 */
2103 if (--perf_ctx->n_query_instances == 0) {
2104 gen_perf_free_sample_bufs(perf_ctx);
2105 gen_perf_close(perf_ctx, query->queryinfo);
2106 }
2107
2108 free(query);
2109 }
2110
2111 #define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT)
2112
2113 static void
2114 read_gt_frequency(struct gen_perf_context *perf_ctx,
2115 struct gen_perf_query_object *obj)
2116 {
2117 const struct gen_device_info *devinfo = perf_ctx->devinfo;
2118 uint32_t start = *((uint32_t *)(obj->oa.map + MI_FREQ_START_OFFSET_BYTES)),
2119 end = *((uint32_t *)(obj->oa.map + MI_FREQ_END_OFFSET_BYTES));
2120
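        /* The driver treats the RPSTAT ratio fields as encoding the GT
         * frequency in units of 50 MHz on gen7/8 and 50/3 MHz on gen9+,
         * which is why the two cases below scale the raw field value
         * differently before it is converted to Hz.
         */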
2121 switch (devinfo->gen) {
2122 case 7:
2123 case 8:
2124 obj->oa.gt_frequency[0] = GET_FIELD(start, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
2125 obj->oa.gt_frequency[1] = GET_FIELD(end, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
2126 break;
2127 case 9:
2128 case 10:
2129 case 11:
2130 obj->oa.gt_frequency[0] = GET_FIELD(start, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
2131 obj->oa.gt_frequency[1] = GET_FIELD(end, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
2132 break;
2133 default:
2134 unreachable("unexpected gen");
2135 }
2136
2137 /* Put the numbers into Hz. */
2138 obj->oa.gt_frequency[0] *= 1000000ULL;
2139 obj->oa.gt_frequency[1] *= 1000000ULL;
2140 }
2141
2142 static int
2143 get_oa_counter_data(struct gen_perf_context *perf_ctx,
2144 struct gen_perf_query_object *query,
2145 size_t data_size,
2146 uint8_t *data)
2147 {
2148 struct gen_perf_config *perf_cfg = perf_ctx->perf;
2149 const struct gen_perf_query_info *queryinfo = query->queryinfo;
2150 int n_counters = queryinfo->n_counters;
2151 int written = 0;
2152
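        /* Each counter is written at its fixed counter->offset in the
         * destination buffer; 'written' ends up as the end offset of the
         * last counter emitted, which is what we report back to the caller.
         */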
2153 for (int i = 0; i < n_counters; i++) {
2154 const struct gen_perf_query_counter *counter = &queryinfo->counters[i];
2155 uint64_t *out_uint64;
2156 float *out_float;
2157 size_t counter_size = gen_perf_query_counter_get_size(counter);
2158
2159 if (counter_size) {
2160 switch (counter->data_type) {
2161 case GEN_PERF_COUNTER_DATA_TYPE_UINT64:
2162 out_uint64 = (uint64_t *)(data + counter->offset);
2163 *out_uint64 =
2164 counter->oa_counter_read_uint64(perf_cfg, queryinfo,
2165 query->oa.result.accumulator);
2166 break;
2167 case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
2168 out_float = (float *)(data + counter->offset);
2169 *out_float =
2170 counter->oa_counter_read_float(perf_cfg, queryinfo,
2171 query->oa.result.accumulator);
2172 break;
2173 default:
2174 /* So far we aren't using uint32, double or bool32... */
2175 unreachable("unexpected counter data type");
2176 }
2177 written = counter->offset + counter_size;
2178 }
2179 }
2180
2181 return written;
2182 }
2183
2184 static int
2185 get_pipeline_stats_data(struct gen_perf_context *perf_ctx,
2186 struct gen_perf_query_object *query,
2187 size_t data_size,
2188 uint8_t *data)
2189
2190 {
2191 struct gen_perf_config *perf_cfg = perf_ctx->perf;
2192 const struct gen_perf_query_info *queryinfo = query->queryinfo;
2193 int n_counters = queryinfo->n_counters;
2194 uint8_t *p = data;
2195
2196 uint64_t *start = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->pipeline_stats.bo, MAP_READ);
2197 uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));
2198
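        /* Each statistics counter is the difference between the register
         * value snapshotted at the end of the query and the one snapshotted
         * at the beginning, optionally rescaled by the counter's
         * numerator/denominator ratio when the raw HW value isn't in the
         * units we want to expose.
         */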
2199 for (int i = 0; i < n_counters; i++) {
2200 const struct gen_perf_query_counter *counter = &queryinfo->counters[i];
2201 uint64_t value = end[i] - start[i];
2202
2203 if (counter->pipeline_stat.numerator !=
2204 counter->pipeline_stat.denominator) {
2205 value *= counter->pipeline_stat.numerator;
2206 value /= counter->pipeline_stat.denominator;
2207 }
2208
2209 *((uint64_t *)p) = value;
2210 p += 8;
2211 }
2212
2213 perf_cfg->vtbl.bo_unmap(query->pipeline_stats.bo);
2214
2215 return p - data;
2216 }
2217
2218 void
2219 gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
2220 struct gen_perf_query_object *query,
2221 int data_size,
2222 unsigned *data,
2223 unsigned *bytes_written)
2224 {
2225 struct gen_perf_config *perf_cfg = perf_ctx->perf;
2226 int written = 0;
2227
2228 switch (query->queryinfo->kind) {
2229 case GEN_PERF_QUERY_TYPE_OA:
2230 case GEN_PERF_QUERY_TYPE_RAW:
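           /* If the reports haven't been accumulated yet, read back the GT
            * frequencies, fold the MI_RPC and periodic reports into
            * query->oa.result and unmap the BO. OA queries then return the
            * translated counter values, while RAW queries return the result
            * formatted for MDAPI.
            */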
2231 if (!query->oa.results_accumulated) {
2232 read_gt_frequency(perf_ctx, query);
2233 uint32_t *begin_report = query->oa.map;
2234 uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
2235 gen_perf_query_result_read_frequencies(&query->oa.result,
2236 perf_ctx->devinfo,
2237 begin_report,
2238 end_report);
2239 accumulate_oa_reports(perf_ctx, query);
2240 assert(query->oa.results_accumulated);
2241
2242 perf_cfg->vtbl.bo_unmap(query->oa.bo);
2243 query->oa.map = NULL;
2244 }
2245 if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA) {
2246 written = get_oa_counter_data(perf_ctx, query, data_size, (uint8_t *)data);
2247 } else {
2248 const struct gen_device_info *devinfo = perf_ctx->devinfo;
2249
2250 written = gen_perf_query_result_write_mdapi((uint8_t *)data, data_size,
2251 devinfo, &query->oa.result,
2252 query->oa.gt_frequency[0],
2253 query->oa.gt_frequency[1]);
2254 }
2255 break;
2256
2257 case GEN_PERF_QUERY_TYPE_PIPELINE:
2258 written = get_pipeline_stats_data(perf_ctx, query, data_size, (uint8_t *)data);
2259 break;
2260
2261 default:
2262 unreachable("Unknown query type");
2263 break;
2264 }
2265
2266 if (bytes_written)
2267 *bytes_written = written;
2268 }