1 /*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
25 * \file brw_performance_query.c
26 *
27 * Implementation of the GL_INTEL_performance_query extension.
28 *
29 * Currently there are two possible counter sources exposed here:
30 *
31 * On Gen6+ hardware we have numerous 64bit Pipeline Statistics Registers
32 * that we can snapshot at the beginning and end of a query.
33 *
34 * On Gen7.5+ we have Observability Architecture counters which are
 35  * covered in a separate document from the rest of the PRMs. It is available at:
36 * https://01.org/linuxgraphics/documentation/driver-documentation-prms
37 * => 2013 Intel Core Processor Family => Observability Performance Counters
38 * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell,
39 * though notably we currently only support OA counters for Haswell+)
40 */
41
42 #include <limits.h>
43 #include <dirent.h>
44
45 /* put before sys/types.h to silence glibc warnings */
46 #ifdef MAJOR_IN_MKDEV
47 #include <sys/mkdev.h>
48 #endif
49 #ifdef MAJOR_IN_SYSMACROS
50 #include <sys/sysmacros.h>
51 #endif
52 #include <sys/types.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #include <sys/mman.h>
56 #include <sys/ioctl.h>
57
58 #include <xf86drm.h>
59 #include <i915_drm.h>
60
61 #include "main/hash.h"
62 #include "main/macros.h"
63 #include "main/mtypes.h"
64 #include "main/performance_query.h"
65
66 #include "util/bitset.h"
67 #include "util/ralloc.h"
68 #include "util/hash_table.h"
69 #include "util/list.h"
70
71 #include "brw_context.h"
72 #include "brw_defines.h"
73 #include "brw_performance_query.h"
74 #include "brw_oa_hsw.h"
75 #include "brw_oa_bdw.h"
76 #include "brw_oa_chv.h"
77 #include "brw_oa_sklgt2.h"
78 #include "brw_oa_sklgt3.h"
79 #include "brw_oa_sklgt4.h"
80 #include "brw_oa_bxt.h"
81 #include "brw_oa_kblgt2.h"
82 #include "brw_oa_kblgt3.h"
83 #include "brw_oa_glk.h"
84 #include "brw_oa_cflgt2.h"
85 #include "intel_batchbuffer.h"
86
87 #define FILE_DEBUG_FLAG DEBUG_PERFMON
88
89 /*
90 * The largest OA formats we can use include:
91 * For Haswell:
92 * 1 timestamp, 45 A counters, 8 B counters and 8 C counters.
93 * For Gen8+
94 * 1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
95 */
96 #define MAX_OA_REPORT_COUNTERS 62
97
98 #define OAREPORT_REASON_MASK 0x3f
99 #define OAREPORT_REASON_SHIFT 19
100 #define OAREPORT_REASON_TIMER (1<<0)
101 #define OAREPORT_REASON_TRIGGER1 (1<<1)
102 #define OAREPORT_REASON_TRIGGER2 (1<<2)
103 #define OAREPORT_REASON_CTX_SWITCH (1<<3)
104 #define OAREPORT_REASON_GO_TRANSITION (1<<4)
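
/* (As the shift/mask above imply, the report reason is a 6 bit field
 * starting at bit 19 of an OA report's id DWord; it can be extracted
 * with (report[0] >> OAREPORT_REASON_SHIFT) & OAREPORT_REASON_MASK and
 * tested against the OAREPORT_REASON_* flags.)
 */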
105
106 #define I915_PERF_OA_SAMPLE_SIZE (8 + /* drm_i915_perf_record_header */ \
107 256) /* OA counter report */
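
/* struct drm_i915_perf_record_header is 8 bytes and an OA counter report
 * in the formats used here is 256 bytes (64 DWords), so a full sample
 * record is at most 264 bytes. The buf[] array in struct
 * brw_oa_sample_buf below is sized to hold 10 such records per read().
 */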
108
109 /**
110 * Periodic OA samples are read() into these buffer structures via the
111 * i915 perf kernel interface and appended to the
112 * brw->perfquery.sample_buffers linked list. When we process the
113 * results of an OA metrics query we need to consider all the periodic
114 * samples between the Begin and End MI_REPORT_PERF_COUNT command
115 * markers.
116 *
117 * 'Periodic' is a simplification as there are other automatic reports
 118  * written by the hardware that are also buffered here.
119 *
120 * Considering three queries, A, B and C:
121 *
122 * Time ---->
123 * ________________A_________________
124 * | |
125 * | ________B_________ _____C___________
126 * | | | | | |
127 *
128 * And an illustration of sample buffers read over this time frame:
129 * [HEAD ][ ][ ][ ][ ][ ][ ][ ][TAIL ]
130 *
131 * These nodes may hold samples for query A:
132 * [ ][ ][ A ][ A ][ A ][ A ][ A ][ ][ ]
133 *
134 * These nodes may hold samples for query B:
135 * [ ][ ][ B ][ B ][ B ][ ][ ][ ][ ]
136 *
137 * These nodes may hold samples for query C:
138 * [ ][ ][ ][ ][ ][ C ][ C ][ C ][ ]
139 *
140 * The illustration assumes we have an even distribution of periodic
141 * samples so all nodes have the same size plotted against time:
142 *
143 * Note, to simplify code, the list is never empty.
144 *
145 * With overlapping queries we can see that periodic OA reports may
 146  * relate to multiple queries and care needs to be taken to keep
147 * track of sample buffers until there are no queries that might
148 * depend on their contents.
149 *
150 * We use a node ref counting system where a reference ensures that a
151 * node and all following nodes can't be freed/recycled until the
152 * reference drops to zero.
153 *
154 * E.g. with a ref of one here:
155 * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
156 *
157 * These nodes could be freed or recycled ("reaped"):
158 * [ 0 ][ 0 ]
159 *
160 * These must be preserved until the leading ref drops to zero:
161 * [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
162 *
163 * When a query starts we take a reference on the current tail of
164 * the list, knowing that no already-buffered samples can possibly
165 * relate to the newly-started query. A pointer to this node is
166 * also saved in the query object's ->oa.samples_head.
167 *
168 * E.g. starting query A while there are two nodes in .sample_buffers:
169 * ________________A________
170 * |
171 *
172 * [ 0 ][ 1 ]
173 * ^_______ Add a reference and store pointer to node in
174 * A->oa.samples_head
175 *
176 * Moving forward to when the B query starts with no new buffer nodes:
177 * (for reference, i915 perf reads() are only done when queries finish)
178 * ________________A_______
179 * | ________B___
180 * | |
181 *
182 * [ 0 ][ 2 ]
183 * ^_______ Add a reference and store pointer to
184 * node in B->oa.samples_head
185 *
 186  * Once an OA query has finished, i.e. after it has become 'Ready',
 187  * the End OA report has landed and we have processed all the
 188  * intermediate periodic samples, we drop the
189 * ->oa.samples_head reference we took at the start.
190 *
191 * So when the B query has finished we have:
192 * ________________A________
193 * | ______B___________
194 * | | |
195 * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ]
196 * ^_______ Drop B->oa.samples_head reference
197 *
198 * We still can't free these due to the A->oa.samples_head ref:
199 * [ 1 ][ 0 ][ 0 ][ 0 ]
200 *
201 * When the A query finishes: (note there's a new ref for C's samples_head)
202 * ________________A_________________
203 * | |
204 * | _____C_________
205 * | | |
206 * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ]
207 * ^_______ Drop A->oa.samples_head reference
208 *
209 * And we can now reap these nodes up to the C->oa.samples_head:
210 * [ X ][ X ][ X ][ X ]
211 * keeping -> [ 1 ][ 0 ][ 0 ]
212 *
213 * We reap old sample buffers each time we finish processing an OA
214 * query by iterating the sample_buffers list from the head until we
215 * find a referenced node and stop.
216 *
217 * Reaped buffers move to a perfquery.free_sample_buffers list and
218 * when we come to read() we first look to recycle a buffer from the
219 * free_sample_buffers list before allocating a new buffer.
220 */
221 struct brw_oa_sample_buf {
222 struct exec_node link;
223 int refcount;
224 int len;
225 uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
226 uint32_t last_timestamp;
227 };
228
229 /**
230 * i965 representation of a performance query object.
231 *
232 * NB: We want to keep this structure relatively lean considering that
233 * applications may expect to allocate enough objects to be able to
234 * query around all draw calls in a frame.
235 */
236 struct brw_perf_query_object
237 {
238 struct gl_perf_query_object base;
239
240 const struct brw_perf_query_info *query;
241
242 /* See query->kind to know which state below is in use... */
243 union {
244 struct {
245
246 /**
247 * BO containing OA counter snapshots at query Begin/End time.
248 */
249 struct brw_bo *bo;
250
251 /**
 252       * Address of the mapping of @bo
253 */
254 void *map;
255
256 /**
257 * The MI_REPORT_PERF_COUNT command lets us specify a unique
258 * ID that will be reflected in the resulting OA report
259 * that's written by the GPU. This is the ID we're expecting
260 * in the begin report and the the end report should be
261 * @begin_report_id + 1.
262 */
263 int begin_report_id;
264
265 /**
266 * Reference the head of the brw->perfquery.sample_buffers
267 * list at the time that the query started (so we only need
268 * to look at nodes after this point when looking for samples
269 * related to this query)
270 *
271 * (See struct brw_oa_sample_buf description for more details)
272 */
273 struct exec_node *samples_head;
274
275 /**
276 * Storage for the final accumulated OA counters.
277 */
278 uint64_t accumulator[MAX_OA_REPORT_COUNTERS];
279
280 /**
281 * false while in the unaccumulated_elements list, and set to
282 * true when the final, end MI_RPC snapshot has been
283 * accumulated.
284 */
285 bool results_accumulated;
286
287 } oa;
288
289 struct {
290 /**
291 * BO containing starting and ending snapshots for the
292 * statistics counters.
293 */
294 struct brw_bo *bo;
295 } pipeline_stats;
296 };
297 };
298
299 /** Downcasting convenience macro. */
300 static inline struct brw_perf_query_object *
301 brw_perf_query(struct gl_perf_query_object *o)
302 {
303 return (struct brw_perf_query_object *) o;
304 }
305
306 #define STATS_BO_SIZE 4096
307 #define STATS_BO_END_OFFSET_BYTES (STATS_BO_SIZE / 2)
308 #define MAX_STAT_COUNTERS (STATS_BO_END_OFFSET_BYTES / 8)
309
310 #define MI_RPC_BO_SIZE 4096
311 #define MI_RPC_BO_END_OFFSET_BYTES (MI_RPC_BO_SIZE / 2)
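
/* Both result BOs use the same simple layout: the Begin snapshot is
 * written at offset 0 and the End snapshot at the *_END_OFFSET_BYTES
 * (half-way) mark, leaving room per snapshot for up to MAX_STAT_COUNTERS
 * (2048 / 8 = 256) 64-bit statistics values, or one 256 byte OA report,
 * respectively.
 */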
312
313 /******************************************************************************/
314
315 static bool
316 brw_is_perf_query_ready(struct gl_context *ctx,
317 struct gl_perf_query_object *o);
318
319 static void
320 dump_perf_query_callback(GLuint id, void *query_void, void *brw_void)
321 {
322 struct gl_context *ctx = brw_void;
323 struct gl_perf_query_object *o = query_void;
324 struct brw_perf_query_object *obj = query_void;
325
326 switch (obj->query->kind) {
327 case OA_COUNTERS:
328 DBG("%4d: %-6s %-8s BO: %-4s OA data: %-10s %-15s\n",
329 id,
330 o->Used ? "Dirty," : "New,",
331 o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
332 obj->oa.bo ? "yes," : "no,",
333 brw_is_perf_query_ready(ctx, o) ? "ready," : "not ready,",
334 obj->oa.results_accumulated ? "accumulated" : "not accumulated");
335 break;
336 case PIPELINE_STATS:
337 DBG("%4d: %-6s %-8s BO: %-4s\n",
338 id,
339 o->Used ? "Dirty," : "New,",
340 o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
341 obj->pipeline_stats.bo ? "yes" : "no");
342 break;
343 }
344 }
345
346 static void
347 dump_perf_queries(struct brw_context *brw)
348 {
349 struct gl_context *ctx = &brw->ctx;
350 DBG("Queries: (Open queries = %d, OA users = %d)\n",
351 brw->perfquery.n_active_oa_queries, brw->perfquery.n_oa_users);
352 _mesa_HashWalk(ctx->PerfQuery.Objects, dump_perf_query_callback, brw);
353 }
354
355 /******************************************************************************/
356
357 static struct brw_oa_sample_buf *
358 get_free_sample_buf(struct brw_context *brw)
359 {
360 struct exec_node *node = exec_list_pop_head(&brw->perfquery.free_sample_buffers);
361 struct brw_oa_sample_buf *buf;
362
363 if (node)
364 buf = exec_node_data(struct brw_oa_sample_buf, node, link);
365 else {
366 buf = ralloc_size(brw, sizeof(*buf));
367
368 exec_node_init(&buf->link);
369 buf->refcount = 0;
370 buf->len = 0;
371 }
372
373 return buf;
374 }
375
376 static void
377 reap_old_sample_buffers(struct brw_context *brw)
378 {
379 struct exec_node *tail_node =
380 exec_list_get_tail(&brw->perfquery.sample_buffers);
381 struct brw_oa_sample_buf *tail_buf =
382 exec_node_data(struct brw_oa_sample_buf, tail_node, link);
383
384 /* Remove all old, unreferenced sample buffers walking forward from
385 * the head of the list, except always leave at least one node in
386 * the list so we always have a node to reference when we Begin
387 * a new query.
388 */
389 foreach_list_typed_safe(struct brw_oa_sample_buf, buf, link,
390 &brw->perfquery.sample_buffers)
391 {
392 if (buf->refcount == 0 && buf != tail_buf) {
393 exec_node_remove(&buf->link);
394 exec_list_push_head(&brw->perfquery.free_sample_buffers, &buf->link);
395 } else
396 return;
397 }
398 }
399
400 static void
401 free_sample_bufs(struct brw_context *brw)
402 {
403 foreach_list_typed_safe(struct brw_oa_sample_buf, buf, link,
404 &brw->perfquery.free_sample_buffers)
405 ralloc_free(buf);
406
407 exec_list_make_empty(&brw->perfquery.free_sample_buffers);
408 }
409
410 /******************************************************************************/
411
412 /**
413 * Driver hook for glGetPerfQueryInfoINTEL().
414 */
415 static void
416 brw_get_perf_query_info(struct gl_context *ctx,
417 unsigned query_index,
418 const char **name,
419 GLuint *data_size,
420 GLuint *n_counters,
421 GLuint *n_active)
422 {
423 struct brw_context *brw = brw_context(ctx);
424 const struct brw_perf_query_info *query =
425 &brw->perfquery.queries[query_index];
426
427 *name = query->name;
428 *data_size = query->data_size;
429 *n_counters = query->n_counters;
430
431 switch (query->kind) {
432 case OA_COUNTERS:
433 *n_active = brw->perfquery.n_active_oa_queries;
434 break;
435
436 case PIPELINE_STATS:
437 *n_active = brw->perfquery.n_active_pipeline_stats_queries;
438 break;
439 }
440 }
441
442 /**
443 * Driver hook for glGetPerfCounterInfoINTEL().
444 */
445 static void
446 brw_get_perf_counter_info(struct gl_context *ctx,
447 unsigned query_index,
448 unsigned counter_index,
449 const char **name,
450 const char **desc,
451 GLuint *offset,
452 GLuint *data_size,
453 GLuint *type_enum,
454 GLuint *data_type_enum,
455 GLuint64 *raw_max)
456 {
457 struct brw_context *brw = brw_context(ctx);
458 const struct brw_perf_query_info *query =
459 &brw->perfquery.queries[query_index];
460 const struct brw_perf_query_counter *counter =
461 &query->counters[counter_index];
462
463 *name = counter->name;
464 *desc = counter->desc;
465 *offset = counter->offset;
466 *data_size = counter->size;
467 *type_enum = counter->type;
468 *data_type_enum = counter->data_type;
469 *raw_max = counter->raw_max;
470 }
471
472 /******************************************************************************/
473
474 /**
475 * Emit MI_STORE_REGISTER_MEM commands to capture all of the
476 * pipeline statistics for the performance query object.
477 */
478 static void
479 snapshot_statistics_registers(struct brw_context *brw,
480 struct brw_perf_query_object *obj,
481 uint32_t offset_in_bytes)
482 {
483 const struct brw_perf_query_info *query = obj->query;
484 const int n_counters = query->n_counters;
485
486 for (int i = 0; i < n_counters; i++) {
487 const struct brw_perf_query_counter *counter = &query->counters[i];
488
489 assert(counter->data_type == GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL);
490
491 brw_store_register_mem64(brw, obj->pipeline_stats.bo,
492 counter->pipeline_stat.reg,
493 offset_in_bytes + i * sizeof(uint64_t));
494 }
495 }
496
497 /**
498 * Add a query to the global list of "unaccumulated queries."
499 *
500 * Queries are tracked here until all the associated OA reports have
501 * been accumulated via accumulate_oa_reports() after the end
502 * MI_REPORT_PERF_COUNT has landed in query->oa.bo.
503 */
504 static void
505 add_to_unaccumulated_query_list(struct brw_context *brw,
506 struct brw_perf_query_object *obj)
507 {
508 if (brw->perfquery.unaccumulated_elements >=
509 brw->perfquery.unaccumulated_array_size)
510 {
511 brw->perfquery.unaccumulated_array_size *= 1.5;
512 brw->perfquery.unaccumulated =
513 reralloc(brw, brw->perfquery.unaccumulated,
514 struct brw_perf_query_object *,
515 brw->perfquery.unaccumulated_array_size);
516 }
517
518 brw->perfquery.unaccumulated[brw->perfquery.unaccumulated_elements++] = obj;
519 }
520
521 /**
 522  * Remove a query from the global list of unaccumulated queries once
 523  * the OA reports associated with the query have been successfully
 524  * accumulated in accumulate_oa_reports(), or when discarding unwanted
 525  * query results.
526 */
527 static void
528 drop_from_unaccumulated_query_list(struct brw_context *brw,
529 struct brw_perf_query_object *obj)
530 {
531 for (int i = 0; i < brw->perfquery.unaccumulated_elements; i++) {
532 if (brw->perfquery.unaccumulated[i] == obj) {
533 int last_elt = --brw->perfquery.unaccumulated_elements;
534
535 if (i == last_elt)
536 brw->perfquery.unaccumulated[i] = NULL;
537 else {
538 brw->perfquery.unaccumulated[i] =
539 brw->perfquery.unaccumulated[last_elt];
540 }
541
542 break;
543 }
544 }
545
546 /* Drop our samples_head reference so that associated periodic
547 * sample data buffers can potentially be reaped if they aren't
548 * referenced by any other queries...
549 */
550
551 struct brw_oa_sample_buf *buf =
552 exec_node_data(struct brw_oa_sample_buf, obj->oa.samples_head, link);
553
554 assert(buf->refcount > 0);
555 buf->refcount--;
556
557 obj->oa.samples_head = NULL;
558
559 reap_old_sample_buffers(brw);
560 }
561
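/* Scale a raw 32 bit delta of GPU timestamp ticks into nanoseconds,
 * i.e. delta * 10^9 / timestamp_frequency. (For example, one tick
 * corresponds to 80ns with Haswell's 12.5MHz timestamp, matching the
 * period comment in brw_begin_perf_query() below.)
 */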
562 static uint64_t
563 timebase_scale(struct brw_context *brw, uint32_t u32_time_delta)
564 {
565 const struct gen_device_info *devinfo = &brw->screen->devinfo;
566 uint64_t tmp = ((uint64_t)u32_time_delta) * 1000000000ull;
567
568 return tmp ? tmp / devinfo->timestamp_frequency : 0;
569 }
570
571 static void
572 accumulate_uint32(const uint32_t *report0,
573 const uint32_t *report1,
574 uint64_t *accumulator)
575 {
576 *accumulator += (uint32_t)(*report1 - *report0);
577 }
578
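/* Accumulate one of the 32 40-bit "A" counters of the
 * I915_OA_FORMAT_A32u40_A4u32_B8_C8 report format: the low 32 bits of
 * counter a_index live in report DWord (4 + a_index) and the high 8 bits
 * in the byte array starting at DWord 40 (byte offset 160). Since only a
 * 40 bit value is reconstructed, a start value larger than the end value
 * is treated as a single wrap of the 40 bit counter.
 */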
579 static void
580 accumulate_uint40(int a_index,
581 const uint32_t *report0,
582 const uint32_t *report1,
583 uint64_t *accumulator)
584 {
585 const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
586 const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
587 uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
588 uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
589 uint64_t value0 = report0[a_index + 4] | high0;
590 uint64_t value1 = report1[a_index + 4] | high1;
591 uint64_t delta;
592
593 if (value0 > value1)
594 delta = (1ULL << 40) + value1 - value0;
595 else
596 delta = value1 - value0;
597
598 *accumulator += delta;
599 }
600
601 /**
602 * Given pointers to starting and ending OA snapshots, add the deltas for each
603 * counter to the results.
604 */
605 static void
606 add_deltas(struct brw_context *brw,
607 struct brw_perf_query_object *obj,
608 const uint32_t *start,
609 const uint32_t *end)
610 {
611 const struct brw_perf_query_info *query = obj->query;
612 uint64_t *accumulator = obj->oa.accumulator;
613 int idx = 0;
614 int i;
615
616 switch (query->oa_format) {
617 case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
618 accumulate_uint32(start + 1, end + 1, accumulator + idx++); /* timestamp */
619 accumulate_uint32(start + 3, end + 3, accumulator + idx++); /* clock */
620
621 /* 32x 40bit A counters... */
622 for (i = 0; i < 32; i++)
623 accumulate_uint40(i, start, end, accumulator + idx++);
624
625 /* 4x 32bit A counters... */
626 for (i = 0; i < 4; i++)
627 accumulate_uint32(start + 36 + i, end + 36 + i, accumulator + idx++);
628
629 /* 8x 32bit B counters + 8x 32bit C counters... */
630 for (i = 0; i < 16; i++)
631 accumulate_uint32(start + 48 + i, end + 48 + i, accumulator + idx++);
632
633 break;
634 case I915_OA_FORMAT_A45_B8_C8:
635 accumulate_uint32(start + 1, end + 1, accumulator); /* timestamp */
636
637 for (i = 0; i < 61; i++)
638 accumulate_uint32(start + 3 + i, end + 3 + i, accumulator + 1 + i);
639
640 break;
641 default:
642 unreachable("Can't accumulate OA counters in unknown format");
643 }
644 }
645
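/* brw->perfquery.n_oa_users reference counts the consumers of the single,
 * shared i915 perf stream: the stream is enabled (I915_PERF_IOCTL_ENABLE)
 * on the 0 -> 1 transition below, and disabled again in dec_n_oa_users()
 * once the count drops back to zero.
 */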
646 static bool
647 inc_n_oa_users(struct brw_context *brw)
648 {
649 if (brw->perfquery.n_oa_users == 0 &&
650 drmIoctl(brw->perfquery.oa_stream_fd,
651 I915_PERF_IOCTL_ENABLE, 0) < 0)
652 {
653 return false;
654 }
655 ++brw->perfquery.n_oa_users;
656
657 return true;
658 }
659
660 static void
661 dec_n_oa_users(struct brw_context *brw)
662 {
663 /* Disabling the i915 perf stream will effectively disable the OA
664 * counters. Note it's important to be sure there are no outstanding
665 * MI_RPC commands at this point since they could stall the CS
666 * indefinitely once OACONTROL is disabled.
667 */
668 --brw->perfquery.n_oa_users;
669 if (brw->perfquery.n_oa_users == 0 &&
670 drmIoctl(brw->perfquery.oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0)
671 {
672 DBG("WARNING: Error disabling i915 perf stream: %m\n");
673 }
674 }
675
 676 /* In general, if we see anything spurious while accumulating results
 677  * we don't try to continue accumulating the current query; instead we
 678  * scrap anything outstanding and then hope for the best with new
 679  * queries.
680 */
681 static void
682 discard_all_queries(struct brw_context *brw)
683 {
684 while (brw->perfquery.unaccumulated_elements) {
685 struct brw_perf_query_object *obj = brw->perfquery.unaccumulated[0];
686
687 obj->oa.results_accumulated = true;
688 drop_from_unaccumulated_query_list(brw, brw->perfquery.unaccumulated[0]);
689
690 dec_n_oa_users(brw);
691 }
692 }
693
694 enum OaReadStatus {
695 OA_READ_STATUS_ERROR,
696 OA_READ_STATUS_UNFINISHED,
697 OA_READ_STATUS_FINISHED,
698 };
699
700 static enum OaReadStatus
701 read_oa_samples_until(struct brw_context *brw,
702 uint32_t start_timestamp,
703 uint32_t end_timestamp)
704 {
705 struct exec_node *tail_node =
706 exec_list_get_tail(&brw->perfquery.sample_buffers);
707 struct brw_oa_sample_buf *tail_buf =
708 exec_node_data(struct brw_oa_sample_buf, tail_node, link);
709 uint32_t last_timestamp = tail_buf->last_timestamp;
710
711 while (1) {
712 struct brw_oa_sample_buf *buf = get_free_sample_buf(brw);
713 uint32_t offset;
714 int len;
715
716 while ((len = read(brw->perfquery.oa_stream_fd, buf->buf,
717 sizeof(buf->buf))) < 0 && errno == EINTR)
718 ;
719
720 if (len <= 0) {
721 exec_list_push_tail(&brw->perfquery.free_sample_buffers, &buf->link);
722
723 if (len < 0) {
724 if (errno == EAGAIN)
725 return ((last_timestamp - start_timestamp) >=
726 (end_timestamp - start_timestamp)) ?
727 OA_READ_STATUS_FINISHED :
728 OA_READ_STATUS_UNFINISHED;
729 else {
730 DBG("Error reading i915 perf samples: %m\n");
731 }
732 } else
733 DBG("Spurious EOF reading i915 perf samples\n");
734
735 return OA_READ_STATUS_ERROR;
736 }
737
738 buf->len = len;
739 exec_list_push_tail(&brw->perfquery.sample_buffers, &buf->link);
740
741 /* Go through the reports and update the last timestamp. */
742 offset = 0;
743 while (offset < buf->len) {
744 const struct drm_i915_perf_record_header *header =
745 (const struct drm_i915_perf_record_header *) &buf->buf[offset];
746 uint32_t *report = (uint32_t *) (header + 1);
747
748 if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
749 last_timestamp = report[1];
750
751 offset += header->size;
752 }
753
754 buf->last_timestamp = last_timestamp;
755 }
756
757 unreachable("not reached");
758 return OA_READ_STATUS_ERROR;
759 }
760
761 /**
762 * Try to read all the reports until either the delimiting timestamp
763 * or an error arises.
764 */
765 static bool
766 read_oa_samples_for_query(struct brw_context *brw,
767 struct brw_perf_query_object *obj)
768 {
769 uint32_t *start;
770 uint32_t *last;
771 uint32_t *end;
772
773 /* We need the MI_REPORT_PERF_COUNT to land before we can start
 774     * accumulating. */
775 assert(!brw_batch_references(&brw->batch, obj->oa.bo) &&
776 !brw_bo_busy(obj->oa.bo));
777
778 /* Map the BO once here and let accumulate_oa_reports() unmap
779 * it. */
780 if (obj->oa.map == NULL)
781 obj->oa.map = brw_bo_map(brw, obj->oa.bo, MAP_READ);
782
783 start = last = obj->oa.map;
784 end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
785
786 if (start[0] != obj->oa.begin_report_id) {
787 DBG("Spurious start report id=%"PRIu32"\n", start[0]);
788 return true;
789 }
790 if (end[0] != (obj->oa.begin_report_id + 1)) {
791 DBG("Spurious end report id=%"PRIu32"\n", end[0]);
792 return true;
793 }
794
795 /* Read the reports until the end timestamp. */
796 switch (read_oa_samples_until(brw, start[1], end[1])) {
797 case OA_READ_STATUS_ERROR:
798 /* Fallthrough and let accumulate_oa_reports() deal with the
799 * error. */
800 case OA_READ_STATUS_FINISHED:
801 return true;
802 case OA_READ_STATUS_UNFINISHED:
803 return false;
804 }
805
806 unreachable("invalid read status");
807 return false;
808 }
809
810 /**
811 * Accumulate raw OA counter values based on deltas between pairs of
812 * OA reports.
813 *
814 * Accumulation starts from the first report captured via
815 * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the
816 * last MI_RPC report requested by brw_end_perf_query(). Between these
 817  * two reports there may also be some number of periodically sampled OA
818 * reports collected via the i915 perf interface - depending on the
819 * duration of the query.
820 *
821 * These periodic snapshots help to ensure we handle counter overflow
822 * correctly by being frequent enough to ensure we don't miss multiple
823 * overflows of a counter between snapshots. For Gen8+ the i915 perf
824 * snapshots provide the extra context-switch reports that let us
825 * subtract out the progress of counters associated with other
826 * contexts running on the system.
827 */
828 static void
829 accumulate_oa_reports(struct brw_context *brw,
830 struct brw_perf_query_object *obj)
831 {
832 const struct gen_device_info *devinfo = &brw->screen->devinfo;
833 struct gl_perf_query_object *o = &obj->base;
834 uint32_t *start;
835 uint32_t *last;
836 uint32_t *end;
837 struct exec_node *first_samples_node;
838 bool in_ctx = true;
839 uint32_t ctx_id;
840 int out_duration = 0;
841
842 assert(o->Ready);
843 assert(obj->oa.map != NULL);
844
845 start = last = obj->oa.map;
846 end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
847
848 if (start[0] != obj->oa.begin_report_id) {
849 DBG("Spurious start report id=%"PRIu32"\n", start[0]);
850 goto error;
851 }
852 if (end[0] != (obj->oa.begin_report_id + 1)) {
853 DBG("Spurious end report id=%"PRIu32"\n", end[0]);
854 goto error;
855 }
856
857 ctx_id = start[2];
858
859 /* See if we have any periodic reports to accumulate too... */
860
861 /* N.B. The oa.samples_head was set when the query began and
862 * pointed to the tail of the brw->perfquery.sample_buffers list at
863 * the time the query started. Since the buffer existed before the
864 * first MI_REPORT_PERF_COUNT command was emitted we therefore know
865 * that no data in this particular node's buffer can possibly be
866 * associated with the query - so skip ahead one...
867 */
868 first_samples_node = obj->oa.samples_head->next;
869
870 foreach_list_typed_from(struct brw_oa_sample_buf, buf, link,
871 &brw->perfquery.sample_buffers,
872 first_samples_node)
873 {
874 int offset = 0;
875
876 while (offset < buf->len) {
877 const struct drm_i915_perf_record_header *header =
878 (const struct drm_i915_perf_record_header *)(buf->buf + offset);
879
880 assert(header->size != 0);
881 assert(header->size <= buf->len);
882
883 offset += header->size;
884
885 switch (header->type) {
886 case DRM_I915_PERF_RECORD_SAMPLE: {
887 uint32_t *report = (uint32_t *)(header + 1);
888 bool add = true;
889
890 /* Ignore reports that come before the start marker.
891 * (Note: takes care to allow overflow of 32bit timestamps)
892 */
893 if (timebase_scale(brw, report[1] - start[1]) > 5000000000)
894 continue;
895
896 /* Ignore reports that come after the end marker.
897 * (Note: takes care to allow overflow of 32bit timestamps)
898 */
899 if (timebase_scale(brw, report[1] - end[1]) <= 5000000000)
900 goto end;
901
902 /* For Gen8+ since the counters continue while other
903 * contexts are running we need to discount any unrelated
904 * deltas. The hardware automatically generates a report
905 * on context switch which gives us a new reference point
 906             * to continue adding deltas from.
907 *
908 * For Haswell we can rely on the HW to stop the progress
909 * of OA counters while any other context is acctive.
910 */
911 if (devinfo->gen >= 8) {
912 if (in_ctx && report[2] != ctx_id) {
913 DBG("i915 perf: Switch AWAY (observed by ID change)\n");
914 in_ctx = false;
915 out_duration = 0;
916 } else if (in_ctx == false && report[2] == ctx_id) {
917 DBG("i915 perf: Switch TO\n");
918 in_ctx = true;
919
920 /* From experimentation in IGT, we found that the OA unit
921 * might label some report as "idle" (using an invalid
922 * context ID), right after a report for a given context.
923 * Deltas generated by those reports actually belong to the
924 * previous context, even though they're not labelled as
925 * such.
926 *
927 * We didn't *really* Switch AWAY in the case that we e.g.
928 * saw a single periodic report while idle...
929 */
930 if (out_duration >= 1)
931 add = false;
932 } else if (in_ctx) {
933 assert(report[2] == ctx_id);
934 DBG("i915 perf: Continuation IN\n");
935 } else {
936 assert(report[2] != ctx_id);
937 DBG("i915 perf: Continuation OUT\n");
938 add = false;
939 out_duration++;
940 }
941 }
942
943 if (add)
944 add_deltas(brw, obj, last, report);
945
946 last = report;
947
948 break;
949 }
950
951 case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
952 DBG("i915 perf: OA error: all reports lost\n");
953 goto error;
954 case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
955 DBG("i915 perf: OA report lost\n");
956 break;
957 }
958 }
959 }
960
961 end:
962
963 add_deltas(brw, obj, last, end);
964
965 DBG("Marking %d accumulated - results gathered\n", o->Id);
966
967 brw_bo_unmap(obj->oa.bo);
968 obj->oa.map = NULL;
969 obj->oa.results_accumulated = true;
970 drop_from_unaccumulated_query_list(brw, obj);
971 dec_n_oa_users(brw);
972
973 return;
974
975 error:
976
977 brw_bo_unmap(obj->oa.bo);
978 obj->oa.map = NULL;
979 discard_all_queries(brw);
980 }
981
982 /******************************************************************************/
983
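/* The properties array passed to DRM_IOCTL_I915_PERF_OPEN is a flat list
 * of (key, value) u64 pairs - hence num_properties below being
 * ARRAY_SIZE(properties) / 2. The stream is opened I915_PERF_FLAG_DISABLED
 * and only actually enabled later via inc_n_oa_users() when the first
 * query begins.
 */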
984 static bool
985 open_i915_perf_oa_stream(struct brw_context *brw,
986 int metrics_set_id,
987 int report_format,
988 int period_exponent,
989 int drm_fd,
990 uint32_t ctx_id)
991 {
992 uint64_t properties[] = {
993 /* Single context sampling */
994 DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id,
995
996 /* Include OA reports in samples */
997 DRM_I915_PERF_PROP_SAMPLE_OA, true,
998
999 /* OA unit configuration */
1000 DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
1001 DRM_I915_PERF_PROP_OA_FORMAT, report_format,
1002 DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent,
1003 };
1004 struct drm_i915_perf_open_param param = {
1005 .flags = I915_PERF_FLAG_FD_CLOEXEC |
1006 I915_PERF_FLAG_FD_NONBLOCK |
1007 I915_PERF_FLAG_DISABLED,
1008 .num_properties = ARRAY_SIZE(properties) / 2,
1009 .properties_ptr = (uintptr_t) properties,
1010 };
1011 int fd = drmIoctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
1012 if (fd == -1) {
1013 DBG("Error opening i915 perf OA stream: %m\n");
1014 return false;
1015 }
1016
1017 brw->perfquery.oa_stream_fd = fd;
1018
1019 brw->perfquery.current_oa_metrics_set_id = metrics_set_id;
1020 brw->perfquery.current_oa_format = report_format;
1021
1022 return true;
1023 }
1024
1025 static void
1026 close_perf(struct brw_context *brw)
1027 {
1028 if (brw->perfquery.oa_stream_fd != -1) {
1029 close(brw->perfquery.oa_stream_fd);
1030 brw->perfquery.oa_stream_fd = -1;
1031 }
1032 }
1033
1034 /**
1035 * Driver hook for glBeginPerfQueryINTEL().
1036 */
1037 static bool
1038 brw_begin_perf_query(struct gl_context *ctx,
1039 struct gl_perf_query_object *o)
1040 {
1041 struct brw_context *brw = brw_context(ctx);
1042 struct brw_perf_query_object *obj = brw_perf_query(o);
1043 const struct brw_perf_query_info *query = obj->query;
1044
1045 /* We can assume the frontend hides mistaken attempts to Begin a
1046 * query object multiple times before its End. Similarly if an
1047 * application reuses a query object before results have arrived
1048 * the frontend will wait for prior results so we don't need
1049 * to support abandoning in-flight results.
1050 */
1051 assert(!o->Active);
1052 assert(!o->Used || o->Ready); /* no in-flight query to worry about */
1053
1054 DBG("Begin(%d)\n", o->Id);
1055
1056 /* XXX: We have to consider that the command parser unit that parses batch
1057 * buffer commands and is used to capture begin/end counter snapshots isn't
1058 * implicitly synchronized with what's currently running across other GPU
1059 * units (such as the EUs running shaders) that the performance counters are
1060 * associated with.
1061 *
1062 * The intention of performance queries is to measure the work associated
1063 * with commands between the begin/end delimiters and so for that to be the
1064 * case we need to explicitly synchronize the parsing of commands to capture
1065 * Begin/End counter snapshots with what's running across other parts of the
1066 * GPU.
1067 *
1068 * When the command parser reaches a Begin marker it effectively needs to
1069 * drain everything currently running on the GPU until the hardware is idle
1070 * before capturing the first snapshot of counters - otherwise the results
1071 * would also be measuring the effects of earlier commands.
1072 *
1073 * When the command parser reaches an End marker it needs to stall until
1074 * everything currently running on the GPU has finished before capturing the
1075 * end snapshot - otherwise the results won't be a complete representation
1076 * of the work.
1077 *
1078 * Theoretically there could be opportunities to minimize how much of the
1079 * GPU pipeline is drained, or that we stall for, when we know what specific
1080 * units the performance counters being queried relate to but we don't
1081 * currently attempt to be clever here.
1082 *
1083 * Note: with our current simple approach here then for back-to-back queries
1084 * we will redundantly emit duplicate commands to synchronize the command
1085 * streamer with the rest of the GPU pipeline, but we assume that in HW the
1086 * second synchronization is effectively a NOOP.
1087 *
1088 * N.B. The final results are based on deltas of counters between (inside)
1089 * Begin/End markers so even though the total wall clock time of the
1090 * workload is stretched by larger pipeline bubbles the bubbles themselves
1091 * are generally invisible to the query results. Whether that's a good or a
1092 * bad thing depends on the use case. For a lower real-time impact while
1093 * capturing metrics then periodic sampling may be a better choice than
1094 * INTEL_performance_query.
1095 *
1096 *
1097 * This is our Begin synchronization point to drain current work on the
1098 * GPU before we capture our first counter snapshot...
1099 */
1100 brw_emit_mi_flush(brw);
1101
1102 switch (query->kind) {
1103 case OA_COUNTERS:
1104
1105 /* Opening an i915 perf stream implies exclusive access to the OA unit
1106 * which will generate counter reports for a specific counter set with a
1107 * specific layout/format so we can't begin any OA based queries that
1108 * require a different counter set or format unless we get an opportunity
1109 * to close the stream and open a new one...
1110 */
1111 if (brw->perfquery.oa_stream_fd != -1 &&
1112 brw->perfquery.current_oa_metrics_set_id !=
1113 query->oa_metrics_set_id) {
1114
1115 if (brw->perfquery.n_oa_users != 0)
1116 return false;
1117 else
1118 close_perf(brw);
1119 }
1120
1121 /* If the OA counters aren't already on, enable them. */
1122 if (brw->perfquery.oa_stream_fd == -1) {
1123 __DRIscreen *screen = brw->screen->driScrnPriv;
1124 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1125
1126 /* The period_exponent gives a sampling period as follows:
1127 * sample_period = timestamp_period * 2^(period_exponent + 1)
1128 *
 1129          * The timestamp increments every 80ns (HSW), ~52ns (GEN9LP) or
1130 * ~83ns (GEN8/9).
1131 *
 1132          * The counter overflow period is derived from the EuActive counter,
 1133          * which increments every clock cycle by the number of EUs. It can
 1134          * be calculated as:
1135 *
1136 * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2)
1137 *
1138 * (E.g. 40 EUs @ 1GHz = ~53ms)
1139 *
 1140          * We select a sampling period lower than that overflow period to
 1141          * ensure we cannot see more than one counter overflow; otherwise we
 1142          * could lose information.
1143 */
1144
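         /* Worked example (assuming 40 EUs, as in the note above): with
          * Haswell's 32 bit A counters the overflow period is
          * 2^32 / (40 * 2) ns ~= 53.7ms, matching the ~53ms figure; with
          * the 40 bit A counters on Gen8+ it grows by a factor of 256 to
          * roughly 13.7 seconds.
          */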
1145 int a_counter_in_bits = 32;
1146 if (devinfo->gen >= 8)
1147 a_counter_in_bits = 40;
1148
1149 uint64_t overflow_period = pow(2, a_counter_in_bits) /
1150 (brw->perfquery.sys_vars.n_eus *
1151 /* drop 1GHz freq to have units in nanoseconds */
1152 2);
1153
1154 DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
1155 overflow_period, overflow_period / 1000000ul, brw->perfquery.sys_vars.n_eus);
1156
1157 int period_exponent = 0;
1158 uint64_t prev_sample_period, next_sample_period;
1159 for (int e = 0; e < 30; e++) {
1160 prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
1161 next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;
1162
1163 /* Take the previous sampling period, lower than the overflow
1164 * period.
1165 */
1166 if (prev_sample_period < overflow_period &&
1167 next_sample_period > overflow_period)
1168 period_exponent = e + 1;
1169 }
1170
1171 if (period_exponent == 0) {
1172 DBG("WARNING: enable to find a sampling exponent\n");
1173 return false;
1174 }
1175
1176 DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
1177 prev_sample_period / 1000000ul);
1178
1179 if (!open_i915_perf_oa_stream(brw,
1180 query->oa_metrics_set_id,
1181 query->oa_format,
1182 period_exponent,
1183 screen->fd, /* drm fd */
1184 brw->hw_ctx))
1185 return false;
1186 } else {
1187 assert(brw->perfquery.current_oa_metrics_set_id ==
1188 query->oa_metrics_set_id &&
1189 brw->perfquery.current_oa_format ==
1190 query->oa_format);
1191 }
1192
1193 if (!inc_n_oa_users(brw)) {
1194 DBG("WARNING: Error enabling i915 perf stream: %m\n");
1195 return false;
1196 }
1197
1198 if (obj->oa.bo) {
1199 brw_bo_unreference(obj->oa.bo);
1200 obj->oa.bo = NULL;
1201 }
1202
1203 obj->oa.bo =
1204 brw_bo_alloc(brw->bufmgr, "perf. query OA MI_RPC bo",
1205 MI_RPC_BO_SIZE, 64);
1206 #ifdef DEBUG
1207 /* Pre-filling the BO helps debug whether writes landed. */
1208 void *map = brw_bo_map(brw, obj->oa.bo, MAP_WRITE);
1209 memset(map, 0x80, MI_RPC_BO_SIZE);
1210 brw_bo_unmap(obj->oa.bo);
1211 #endif
1212
1213 obj->oa.begin_report_id = brw->perfquery.next_query_start_report_id;
1214 brw->perfquery.next_query_start_report_id += 2;
1215
1216 /* We flush the batchbuffer here to minimize the chances that MI_RPC
1217 * delimiting commands end up in different batchbuffers. If that's the
1218 * case, the measurement will include the time it takes for the kernel
1219 * scheduler to load a new request into the hardware. This is manifested in
1220 * tools like frameretrace by spikes in the "GPU Core Clocks" counter.
1221 */
1222 intel_batchbuffer_flush(brw);
1223
1224 /* Take a starting OA counter snapshot. */
1225 brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo, 0,
1226 obj->oa.begin_report_id);
1227 ++brw->perfquery.n_active_oa_queries;
1228
1229 /* No already-buffered samples can possibly be associated with this query
1230 * so create a marker within the list of sample buffers enabling us to
1231 * easily ignore earlier samples when processing this query after
1232 * completion.
1233 */
1234 assert(!exec_list_is_empty(&brw->perfquery.sample_buffers));
1235 obj->oa.samples_head = exec_list_get_tail(&brw->perfquery.sample_buffers);
1236
1237 struct brw_oa_sample_buf *buf =
1238 exec_node_data(struct brw_oa_sample_buf, obj->oa.samples_head, link);
1239
1240 /* This reference will ensure that future/following sample
1241 * buffers (that may relate to this query) can't be freed until
1242 * this drops to zero.
1243 */
1244 buf->refcount++;
1245
1246 memset(obj->oa.accumulator, 0, sizeof(obj->oa.accumulator));
1247 obj->oa.results_accumulated = false;
1248
1249 add_to_unaccumulated_query_list(brw, obj);
1250 break;
1251
1252 case PIPELINE_STATS:
1253 if (obj->pipeline_stats.bo) {
1254 brw_bo_unreference(obj->pipeline_stats.bo);
1255 obj->pipeline_stats.bo = NULL;
1256 }
1257
1258 obj->pipeline_stats.bo =
1259 brw_bo_alloc(brw->bufmgr, "perf. query pipeline stats bo",
1260 STATS_BO_SIZE, 64);
1261
1262 /* Take starting snapshots. */
1263 snapshot_statistics_registers(brw, obj, 0);
1264
1265 ++brw->perfquery.n_active_pipeline_stats_queries;
1266 break;
1267 }
1268
1269 if (INTEL_DEBUG & DEBUG_PERFMON)
1270 dump_perf_queries(brw);
1271
1272 return true;
1273 }
1274
1275 /**
1276 * Driver hook for glEndPerfQueryINTEL().
1277 */
1278 static void
1279 brw_end_perf_query(struct gl_context *ctx,
1280 struct gl_perf_query_object *o)
1281 {
1282 struct brw_context *brw = brw_context(ctx);
1283 struct brw_perf_query_object *obj = brw_perf_query(o);
1284
1285 DBG("End(%d)\n", o->Id);
1286
1287 /* Ensure that the work associated with the queried commands will have
1288 * finished before taking our query end counter readings.
1289 *
1290 * For more details see comment in brw_begin_perf_query for
1291 * corresponding flush.
1292 */
1293 brw_emit_mi_flush(brw);
1294
1295 switch (obj->query->kind) {
1296 case OA_COUNTERS:
1297
1298 /* NB: It's possible that the query will have already been marked
1299 * as 'accumulated' if an error was seen while reading samples
 1300       * from perf. In this case we mustn't try to emit a closing
1301 * MI_RPC command in case the OA unit has already been disabled
1302 */
1303 if (!obj->oa.results_accumulated) {
1304 /* Take an ending OA counter snapshot. */
1305 brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo,
1306 MI_RPC_BO_END_OFFSET_BYTES,
1307 obj->oa.begin_report_id + 1);
1308 }
1309
1310 --brw->perfquery.n_active_oa_queries;
1311
1312 /* NB: even though the query has now ended, it can't be accumulated
1313 * until the end MI_REPORT_PERF_COUNT snapshot has been written
1314 * to query->oa.bo
1315 */
1316 break;
1317
1318 case PIPELINE_STATS:
1319 snapshot_statistics_registers(brw, obj,
1320 STATS_BO_END_OFFSET_BYTES);
1321 --brw->perfquery.n_active_pipeline_stats_queries;
1322 break;
1323 }
1324 }
1325
1326 static void
1327 brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o)
1328 {
1329 struct brw_context *brw = brw_context(ctx);
1330 struct brw_perf_query_object *obj = brw_perf_query(o);
1331 struct brw_bo *bo = NULL;
1332
1333 assert(!o->Ready);
1334
1335 switch (obj->query->kind) {
1336 case OA_COUNTERS:
1337 bo = obj->oa.bo;
1338 break;
1339
1340 case PIPELINE_STATS:
1341 bo = obj->pipeline_stats.bo;
1342 break;
1343 }
1344
1345 if (bo == NULL)
1346 return;
1347
1348 /* If the current batch references our results bo then we need to
1349 * flush first...
1350 */
1351 if (brw_batch_references(&brw->batch, bo))
1352 intel_batchbuffer_flush(brw);
1353
1354 brw_bo_wait_rendering(bo);
1355
1356 /* Due to a race condition between the OA unit signaling report
1357 * availability and the report actually being written into memory,
1358 * we need to wait for all the reports to come in before we can
1359 * read them.
1360 */
1361 if (obj->query->kind == OA_COUNTERS) {
1362 while (!read_oa_samples_for_query(brw, obj))
1363 ;
1364 }
1365 }
1366
1367 static bool
1368 brw_is_perf_query_ready(struct gl_context *ctx,
1369 struct gl_perf_query_object *o)
1370 {
1371 struct brw_context *brw = brw_context(ctx);
1372 struct brw_perf_query_object *obj = brw_perf_query(o);
1373
1374 if (o->Ready)
1375 return true;
1376
1377 switch (obj->query->kind) {
1378 case OA_COUNTERS:
1379 return (obj->oa.results_accumulated ||
1380 (obj->oa.bo &&
1381 !brw_batch_references(&brw->batch, obj->oa.bo) &&
1382 !brw_bo_busy(obj->oa.bo) &&
1383 read_oa_samples_for_query(brw, obj)));
1384 case PIPELINE_STATS:
1385 return (obj->pipeline_stats.bo &&
1386 !brw_batch_references(&brw->batch, obj->pipeline_stats.bo) &&
1387 !brw_bo_busy(obj->pipeline_stats.bo));
1388 }
1389
1390 unreachable("missing ready check for unknown query kind");
1391 return false;
1392 }
1393
1394 static int
1395 get_oa_counter_data(struct brw_context *brw,
1396 struct brw_perf_query_object *obj,
1397 size_t data_size,
1398 uint8_t *data)
1399 {
1400 const struct brw_perf_query_info *query = obj->query;
1401 int n_counters = query->n_counters;
1402 int written = 0;
1403
1404 if (!obj->oa.results_accumulated) {
1405 accumulate_oa_reports(brw, obj);
1406 assert(obj->oa.results_accumulated);
1407 }
1408
1409 for (int i = 0; i < n_counters; i++) {
1410 const struct brw_perf_query_counter *counter = &query->counters[i];
1411 uint64_t *out_uint64;
1412 float *out_float;
1413
1414 if (counter->size) {
1415 switch (counter->data_type) {
1416 case GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL:
1417 out_uint64 = (uint64_t *)(data + counter->offset);
1418 *out_uint64 = counter->oa_counter_read_uint64(brw, query,
1419 obj->oa.accumulator);
1420 break;
1421 case GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL:
1422 out_float = (float *)(data + counter->offset);
1423 *out_float = counter->oa_counter_read_float(brw, query,
1424 obj->oa.accumulator);
1425 break;
1426 default:
1427 /* So far we aren't using uint32, double or bool32... */
1428 unreachable("unexpected counter data type");
1429 }
1430 written = counter->offset + counter->size;
1431 }
1432 }
1433
1434 return written;
1435 }
1436
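/* Pipeline statistics results are simply the end - start register deltas,
 * packed as consecutive uint64 values. Counters registered with a
 * numerator/denominator other than 1/1 (e.g. PS_INVOCATION_COUNT on
 * Haswell/Gen8, see init_pipeline_statistic_query_registers()) are
 * rescaled before being written out.
 */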
1437 static int
1438 get_pipeline_stats_data(struct brw_context *brw,
1439 struct brw_perf_query_object *obj,
1440 size_t data_size,
1441 uint8_t *data)
1442
1443 {
1444 const struct brw_perf_query_info *query = obj->query;
1445 int n_counters = obj->query->n_counters;
1446 uint8_t *p = data;
1447
1448 uint64_t *start = brw_bo_map(brw, obj->pipeline_stats.bo, MAP_READ);
1449 uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));
1450
1451 for (int i = 0; i < n_counters; i++) {
1452 const struct brw_perf_query_counter *counter = &query->counters[i];
1453 uint64_t value = end[i] - start[i];
1454
1455 if (counter->pipeline_stat.numerator !=
1456 counter->pipeline_stat.denominator) {
1457 value *= counter->pipeline_stat.numerator;
1458 value /= counter->pipeline_stat.denominator;
1459 }
1460
1461 *((uint64_t *)p) = value;
1462 p += 8;
1463 }
1464
1465 brw_bo_unmap(obj->pipeline_stats.bo);
1466
1467 return p - data;
1468 }
1469
1470 /**
1471 * Driver hook for glGetPerfQueryDataINTEL().
1472 */
1473 static void
1474 brw_get_perf_query_data(struct gl_context *ctx,
1475 struct gl_perf_query_object *o,
1476 GLsizei data_size,
1477 GLuint *data,
1478 GLuint *bytes_written)
1479 {
1480 struct brw_context *brw = brw_context(ctx);
1481 struct brw_perf_query_object *obj = brw_perf_query(o);
1482 int written = 0;
1483
1484 assert(brw_is_perf_query_ready(ctx, o));
1485
1486 DBG("GetData(%d)\n", o->Id);
1487
1488 if (INTEL_DEBUG & DEBUG_PERFMON)
1489 dump_perf_queries(brw);
1490
1491 /* We expect that the frontend only calls this hook when it knows
1492 * that results are available.
1493 */
1494 assert(o->Ready);
1495
1496 switch (obj->query->kind) {
1497 case OA_COUNTERS:
1498 written = get_oa_counter_data(brw, obj, data_size, (uint8_t *)data);
1499 break;
1500
1501 case PIPELINE_STATS:
1502 written = get_pipeline_stats_data(brw, obj, data_size, (uint8_t *)data);
1503 break;
1504 }
1505
1506 if (bytes_written)
1507 *bytes_written = written;
1508 }
1509
1510 static struct gl_perf_query_object *
1511 brw_new_perf_query_object(struct gl_context *ctx, unsigned query_index)
1512 {
1513 struct brw_context *brw = brw_context(ctx);
1514 const struct brw_perf_query_info *query =
1515 &brw->perfquery.queries[query_index];
1516 struct brw_perf_query_object *obj =
1517 calloc(1, sizeof(struct brw_perf_query_object));
1518
1519 if (!obj)
1520 return NULL;
1521
1522 obj->query = query;
1523
1524 brw->perfquery.n_query_instances++;
1525
1526 return &obj->base;
1527 }
1528
1529 /**
1530 * Driver hook for glDeletePerfQueryINTEL().
1531 */
1532 static void
1533 brw_delete_perf_query(struct gl_context *ctx,
1534 struct gl_perf_query_object *o)
1535 {
1536 struct brw_context *brw = brw_context(ctx);
1537 struct brw_perf_query_object *obj = brw_perf_query(o);
1538
1539 /* We can assume that the frontend waits for a query to complete
1540 * before ever calling into here, so we don't have to worry about
1541 * deleting an in-flight query object.
1542 */
1543 assert(!o->Active);
1544 assert(!o->Used || o->Ready);
1545
1546 DBG("Delete(%d)\n", o->Id);
1547
1548 switch (obj->query->kind) {
1549 case OA_COUNTERS:
1550 if (obj->oa.bo) {
1551 if (!obj->oa.results_accumulated) {
1552 drop_from_unaccumulated_query_list(brw, obj);
1553 dec_n_oa_users(brw);
1554 }
1555
1556 brw_bo_unreference(obj->oa.bo);
1557 obj->oa.bo = NULL;
1558 }
1559
1560 obj->oa.results_accumulated = false;
1561 break;
1562
1563 case PIPELINE_STATS:
1564 if (obj->pipeline_stats.bo) {
1565 brw_bo_unreference(obj->pipeline_stats.bo);
1566 obj->pipeline_stats.bo = NULL;
1567 }
1568 break;
1569 }
1570
1571 free(obj);
1572
1573 /* As an indication that the INTEL_performance_query extension is no
1574 * longer in use, it's a good time to free our cache of sample
1575 * buffers and close any current i915-perf stream.
1576 */
1577 if (--brw->perfquery.n_query_instances == 0) {
1578 free_sample_bufs(brw);
1579 close_perf(brw);
1580 }
1581 }
1582
1583 /******************************************************************************/
1584
1585 static struct brw_perf_query_info *
1586 append_query_info(struct brw_context *brw)
1587 {
1588 brw->perfquery.queries =
1589 reralloc(brw, brw->perfquery.queries,
1590 struct brw_perf_query_info, ++brw->perfquery.n_queries);
1591
1592 return &brw->perfquery.queries[brw->perfquery.n_queries - 1];
1593 }
1594
1595 static void
1596 add_stat_reg(struct brw_perf_query_info *query,
1597 uint32_t reg,
1598 uint32_t numerator,
1599 uint32_t denominator,
1600 const char *name,
1601 const char *description)
1602 {
1603 struct brw_perf_query_counter *counter;
1604
1605 assert(query->n_counters < MAX_STAT_COUNTERS);
1606
1607 counter = &query->counters[query->n_counters];
1608 counter->name = name;
1609 counter->desc = description;
1610 counter->type = GL_PERFQUERY_COUNTER_RAW_INTEL;
1611 counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
1612 counter->size = sizeof(uint64_t);
1613 counter->offset = sizeof(uint64_t) * query->n_counters;
1614 counter->pipeline_stat.reg = reg;
1615 counter->pipeline_stat.numerator = numerator;
1616 counter->pipeline_stat.denominator = denominator;
1617
1618 query->n_counters++;
1619 }
1620
1621 static void
1622 add_basic_stat_reg(struct brw_perf_query_info *query,
1623 uint32_t reg, const char *name)
1624 {
1625 add_stat_reg(query, reg, 1, 1, name, name);
1626 }
1627
1628 static void
1629 init_pipeline_statistic_query_registers(struct brw_context *brw)
1630 {
1631 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1632 struct brw_perf_query_info *query = append_query_info(brw);
1633
1634 query->kind = PIPELINE_STATS;
1635 query->name = "Pipeline Statistics Registers";
1636 query->n_counters = 0;
1637 query->counters =
1638 rzalloc_array(brw, struct brw_perf_query_counter, MAX_STAT_COUNTERS);
1639
1640 add_basic_stat_reg(query, IA_VERTICES_COUNT,
1641 "N vertices submitted");
1642 add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
1643 "N primitives submitted");
1644 add_basic_stat_reg(query, VS_INVOCATION_COUNT,
1645 "N vertex shader invocations");
1646
1647 if (devinfo->gen == 6) {
1648 add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
1649 "SO_PRIM_STORAGE_NEEDED",
1650 "N geometry shader stream-out primitives (total)");
1651 add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1,
1652 "SO_NUM_PRIMS_WRITTEN",
1653 "N geometry shader stream-out primitives (written)");
1654 } else {
1655 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
1656 "SO_PRIM_STORAGE_NEEDED (Stream 0)",
1657 "N stream-out (stream 0) primitives (total)");
1658 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
1659 "SO_PRIM_STORAGE_NEEDED (Stream 1)",
1660 "N stream-out (stream 1) primitives (total)");
1661 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
1662 "SO_PRIM_STORAGE_NEEDED (Stream 2)",
1663 "N stream-out (stream 2) primitives (total)");
1664 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
1665 "SO_PRIM_STORAGE_NEEDED (Stream 3)",
1666 "N stream-out (stream 3) primitives (total)");
1667 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
1668 "SO_NUM_PRIMS_WRITTEN (Stream 0)",
1669 "N stream-out (stream 0) primitives (written)");
1670 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
1671 "SO_NUM_PRIMS_WRITTEN (Stream 1)",
1672 "N stream-out (stream 1) primitives (written)");
1673 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
1674 "SO_NUM_PRIMS_WRITTEN (Stream 2)",
1675 "N stream-out (stream 2) primitives (written)");
1676 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
1677 "SO_NUM_PRIMS_WRITTEN (Stream 3)",
1678 "N stream-out (stream 3) primitives (written)");
1679 }
1680
1681 add_basic_stat_reg(query, HS_INVOCATION_COUNT,
1682 "N TCS shader invocations");
1683 add_basic_stat_reg(query, DS_INVOCATION_COUNT,
1684 "N TES shader invocations");
1685
1686 add_basic_stat_reg(query, GS_INVOCATION_COUNT,
1687 "N geometry shader invocations");
1688 add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
1689 "N geometry shader primitives emitted");
1690
1691 add_basic_stat_reg(query, CL_INVOCATION_COUNT,
1692 "N primitives entering clipping");
1693 add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
1694 "N primitives leaving clipping");
1695
1696 if (devinfo->is_haswell || devinfo->gen == 8)
1697 add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
1698 "N fragment shader invocations",
1699 "N fragment shader invocations");
1700 else
1701 add_basic_stat_reg(query, PS_INVOCATION_COUNT,
1702 "N fragment shader invocations");
1703
1704 add_basic_stat_reg(query, PS_DEPTH_COUNT, "N z-pass fragments");
1705
1706 if (devinfo->gen >= 7)
1707 add_basic_stat_reg(query, CS_INVOCATION_COUNT,
1708 "N compute shader invocations");
1709
1710 query->data_size = sizeof(uint64_t) * query->n_counters;
1711 }
1712
1713 static bool
1714 read_file_uint64(const char *file, uint64_t *val)
1715 {
1716 char buf[32];
1717 int fd, n;
1718
1719 fd = open(file, 0);
1720 if (fd < 0)
1721 return false;
1722 n = read(fd, buf, sizeof (buf) - 1);
1723 close(fd);
1724 if (n < 0)
1725 return false;
1726
1727 buf[n] = '\0';
1728 *val = strtoull(buf, NULL, 0);
1729
1730 return true;
1731 }
1732
1733 static void
1734 register_oa_config(struct brw_context *brw,
1735 const struct brw_perf_query_info *query,
1736 uint64_t config_id)
1737 {
 1738    struct brw_perf_query_info *registered_query = append_query_info(brw);
 1739    *registered_query = *query;
 1740    registered_query->oa_metrics_set_id = config_id;
 1741    DBG("metric set registered: id = %" PRIu64", guid = %s\n",
 1742        registered_query->oa_metrics_set_id, query->guid);
1743 }
1744
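/* Walk the "metrics" directory under sysfs_dev_dir (the DRM device's
 * sysfs directory). Each entry there is named by a metric set GUID and
 * contains an "id" file holding the metric set ID the kernel assigned to
 * it; GUIDs we recognise from brw->perfquery.oa_metrics_table are
 * registered as available queries with that ID.
 */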
1745 static void
1746 enumerate_sysfs_metrics(struct brw_context *brw, const char *sysfs_dev_dir)
1747 {
1748 char buf[256];
1749 DIR *metricsdir = NULL;
1750 struct dirent *metric_entry;
1751 int len;
1752
1753 len = snprintf(buf, sizeof(buf), "%s/metrics", sysfs_dev_dir);
1754 if (len < 0 || len >= sizeof(buf)) {
1755 DBG("Failed to concatenate path to sysfs metrics/ directory\n");
1756 return;
1757 }
1758
1759 metricsdir = opendir(buf);
1760 if (!metricsdir) {
1761 DBG("Failed to open %s: %m\n", buf);
1762 return;
1763 }
1764
1765 while ((metric_entry = readdir(metricsdir))) {
1766 struct hash_entry *entry;
1767
1768 if ((metric_entry->d_type != DT_DIR &&
1769 metric_entry->d_type != DT_LNK) ||
1770 metric_entry->d_name[0] == '.')
1771 continue;
1772
1773 DBG("metric set: %s\n", metric_entry->d_name);
1774 entry = _mesa_hash_table_search(brw->perfquery.oa_metrics_table,
1775 metric_entry->d_name);
1776 if (entry) {
1777 uint64_t id;
1778
1779 len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id",
1780 sysfs_dev_dir, metric_entry->d_name);
1781 if (len < 0 || len >= sizeof(buf)) {
1782 DBG("Failed to concatenate path to sysfs metric id file\n");
1783 continue;
1784 }
1785
1786 if (!read_file_uint64(buf, &id)) {
1787 DBG("Failed to read metric set id from %s: %m", buf);
1788 continue;
1789 }
1790
1791 register_oa_config(brw, (const struct brw_perf_query_info *)entry->data, id);
1792 } else
1793 DBG("metric set not known by mesa (skipping)\n");
1794 }
1795
1796 closedir(metricsdir);
1797 }
1798
1799 static bool
1800 read_sysfs_drm_device_file_uint64(struct brw_context *brw,
1801 const char *sysfs_dev_dir,
1802 const char *file,
1803 uint64_t *value)
1804 {
1805 char buf[512];
1806 int len;
1807
1808 len = snprintf(buf, sizeof(buf), "%s/%s", sysfs_dev_dir, file);
1809 if (len < 0 || len >= sizeof(buf)) {
1810 DBG("Failed to concatenate sys filename to read u64 from\n");
1811 return false;
1812 }
1813
1814 return read_file_uint64(buf, value);
1815 }
1816
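/* Probe whether the kernel supports dynamically loaded OA configs
 * (DRM_IOCTL_I915_PERF_ADD_CONFIG/_REMOVE_CONFIG).  We look up the
 * read-only test config (id 1) in sysfs and ask the kernel to remove it:
 * getting ENOENT back is taken as proof the REMOVE_CONFIG ioctl exists,
 * the assumption being that a kernel without it would fail with a
 * different errno (e.g. EINVAL).
 */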
1817 static bool
1818 kernel_has_dynamic_config_support(struct brw_context *brw,
1819 const char *sysfs_dev_dir)
1820 {
1821 __DRIscreen *screen = brw->screen->driScrnPriv;
1822 struct hash_entry *entry;
1823
1824 hash_table_foreach(brw->perfquery.oa_metrics_table, entry) {
1825 struct brw_perf_query_info *query = entry->data;
1826 char config_path[256];
1827 uint64_t config_id;
1828
1829 snprintf(config_path, sizeof(config_path),
1830 "%s/metrics/%s/id", sysfs_dev_dir, query->guid);
1831
1832 /* Look for the test config, which we know we can't replace. */
1833 if (read_file_uint64(config_path, &config_id) && config_id == 1) {
1834 uint32_t mux_regs[] = { 0x9888 /* NOA_WRITE */, 0x0 };
1835 struct drm_i915_perf_oa_config config;
1836
1837 memset(&config, 0, sizeof(config));
1838
1839 memcpy(config.uuid, query->guid, sizeof(config.uuid));
1840
1841 config.n_mux_regs = 1;
1842 config.mux_regs_ptr = (uintptr_t) mux_regs;
1843
1844 if (ioctl(screen->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config_id) < 0 &&
1845 errno == ENOENT)
1846 return true;
1847
1848 break;
1849 }
1850 }
1851
1852 return false;
1853 }
1854
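/* Load every metric set Mesa knows about into the kernel with
 * DRM_IOCTL_I915_PERF_ADD_CONFIG, passing the mux, boolean and flex
 * register lists.  Configs whose GUID already shows up in sysfs are
 * registered with their existing id instead of being re-added; for new
 * configs the ioctl returns the id assigned by the kernel.
 */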
1855 static void
1856 init_oa_configs(struct brw_context *brw, const char *sysfs_dev_dir)
1857 {
1858 __DRIscreen *screen = brw->screen->driScrnPriv;
1859 struct hash_entry *entry;
1860
1861 hash_table_foreach(brw->perfquery.oa_metrics_table, entry) {
1862 const struct brw_perf_query_info *query = entry->data;
1863 struct drm_i915_perf_oa_config config;
1864 char config_path[256];
1865 uint64_t config_id;
1866 int ret;
1867
1868 snprintf(config_path, sizeof(config_path),
1869 "%s/metrics/%s/id", sysfs_dev_dir, query->guid);
1870
1871 /* Don't recreate already loaded configs. */
1872 if (read_file_uint64(config_path, &config_id)) {
1873 register_oa_config(brw, query, config_id);
1874 continue;
1875 }
1876
1877 memset(&config, 0, sizeof(config));
1878
1879 memcpy(config.uuid, query->guid, sizeof(config.uuid));
1880
1881 config.n_mux_regs = query->n_mux_regs;
1882 config.mux_regs_ptr = (uintptr_t) query->mux_regs;
1883
1884 config.n_boolean_regs = query->n_b_counter_regs;
1885 config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs;
1886
1887 config.n_flex_regs = query->n_flex_regs;
1888 config.flex_regs_ptr = (uintptr_t) query->flex_regs;
1889
1890 ret = ioctl(screen->fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
1891 if (ret < 0) {
1892 DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
1893 query->name, query->guid, strerror(errno));
1894 continue;
1895 }
1896
1897 register_oa_config(brw, query, ret); /* ADD_CONFIG returns the new config's id */
1898 }
1899 }
1900
1901 static bool
1902 init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
1903 {
1904 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1905 uint64_t min_freq_mhz = 0, max_freq_mhz = 0;
1906 __DRIscreen *screen = brw->screen->driScrnPriv;
1907
1908 if (!read_sysfs_drm_device_file_uint64(brw, sysfs_dev_dir,
1909 "gt_min_freq_mhz",
1910 &min_freq_mhz))
1911 return false;
1912
1913 if (!read_sysfs_drm_device_file_uint64(brw, sysfs_dev_dir,
1914 "gt_max_freq_mhz",
1915 &max_freq_mhz))
1916 return false;
1917
1918 brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
1919 brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
1920 brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
1921
1922 brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
1923 brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
1924 /* Assuming a uniform distribution of subslices per slice. */
1925 brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];
1926
1927 if (devinfo->is_haswell) {
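/* Haswell: derive the slice/subslice masks from devinfo and use the well
 * known EU counts per GT; the SLICE/SUBSLICE_MASK getparams used in the
 * non-Haswell path below are presumably not usable here.
 */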
1928 brw->perfquery.sys_vars.slice_mask = 0;
1929 brw->perfquery.sys_vars.subslice_mask = 0;
1930
1931 for (int s = 0; s < devinfo->num_slices; s++)
1932 brw->perfquery.sys_vars.slice_mask |= 1U << s;
1933 for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
1934 brw->perfquery.sys_vars.subslice_mask |= 1U << ss;
1935
1936 if (devinfo->gt == 1) {
1937 brw->perfquery.sys_vars.n_eus = 10;
1938 } else if (devinfo->gt == 2) {
1939 brw->perfquery.sys_vars.n_eus = 20;
1940 } else if (devinfo->gt == 3) {
1941 brw->perfquery.sys_vars.n_eus = 40;
1942 } else
1943 unreachable("not reached");
1944 } else {
1945 drm_i915_getparam_t gp;
1946 int ret;
1947 int slice_mask = 0;
1948 int ss_mask = 0;
1949 /* maximum number of slices */
1950 int s_max = devinfo->num_slices;
1951 /* maximum number of subslices per slice (assuming a uniform number of
1952 * subslices per slice)
1953 */
1954 int ss_max = devinfo->num_subslices[0];
1955 uint64_t subslice_mask = 0;
1956 int s;
1957
1958 gp.param = I915_PARAM_SLICE_MASK;
1959 gp.value = &slice_mask;
1960 ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
1961 if (ret)
1962 return false;
1963
1964 gp.param = I915_PARAM_SUBSLICE_MASK;
1965 gp.value = &ss_mask;
1966 ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
1967 if (ret)
1968 return false;
1969
1970 brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
1971 brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
1972 brw->perfquery.sys_vars.slice_mask = slice_mask;
1973
1974 /* Note: the _SUBSLICE_MASK param only reports a global subslice mask
1975 * which applies to all slices.
1976 *
1977 * Note: some of the metrics we have (as described in XML) are
1978 * conditional on a $SubsliceMask variable which is expected to also
1979 * reflect the slice mask by packing together subslice masks for each
1980 * slice in one value.
1981 */
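/* For example, with ss_max = 3, slice_mask = 0x3 and ss_mask = 0x7 this
 * packs to 0x7 | (0x7 << 3) = 0x3f: slice 0's subslices occupy bits 0..2
 * and slice 1's subslices occupy bits 3..5.
 */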
1982 for (s = 0; s < s_max; s++) {
1983 if (slice_mask & (1<<s)) {
1984 subslice_mask |= ss_mask << (ss_max * s);
1985 }
1986 }
1987
1988 brw->perfquery.sys_vars.subslice_mask = subslice_mask;
1989 brw->perfquery.sys_vars.n_eu_sub_slices =
1990 __builtin_popcount(subslice_mask);
1991 }
1992
1993 brw->perfquery.sys_vars.eu_threads_count =
1994 brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
1995
1996 return true;
1997 }
1998
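/* Find the sysfs directory for the DRM device backing the screen's fd
 * (e.g. /sys/dev/char/<maj>:<min>/device/drm/card0) by fstat()ing the fd
 * and scanning the drm/ directory for the card<N> entry.  The metrics/
 * directory and gt_*_freq_mhz files read above live under this path.
 */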
1999 static bool
2000 get_sysfs_dev_dir(struct brw_context *brw,
2001 char *path_buf,
2002 int path_buf_len)
2003 {
2004 __DRIscreen *screen = brw->screen->driScrnPriv;
2005 struct stat sb;
2006 int min, maj;
2007 DIR *drmdir;
2008 struct dirent *drm_entry;
2009 int len;
2010
2011 assert(path_buf);
2012 assert(path_buf_len);
2013 path_buf[0] = '\0';
2014
2015 if (fstat(screen->fd, &sb)) {
2016 DBG("Failed to stat DRM fd\n");
2017 return false;
2018 }
2019
2020 maj = major(sb.st_rdev);
2021 min = minor(sb.st_rdev);
2022
2023 if (!S_ISCHR(sb.st_mode)) {
2024 DBG("DRM fd is not a character device as expected\n");
2025 return false;
2026 }
2027
2028 len = snprintf(path_buf, path_buf_len,
2029 "/sys/dev/char/%d:%d/device/drm", maj, min);
2030 if (len < 0 || len >= path_buf_len) {
2031 DBG("Failed to concatenate sysfs path to drm device\n");
2032 return false;
2033 }
2034
2035 drmdir = opendir(path_buf);
2036 if (!drmdir) {
2037 DBG("Failed to open %s: %m\n", path_buf);
2038 return false;
2039 }
2040
2041 while ((drm_entry = readdir(drmdir))) {
2042 if ((drm_entry->d_type == DT_DIR ||
2043 drm_entry->d_type == DT_LNK) &&
2044 strncmp(drm_entry->d_name, "card", 4) == 0)
2045 {
2046 len = snprintf(path_buf, path_buf_len,
2047 "/sys/dev/char/%d:%d/device/drm/%s",
2048 maj, min, drm_entry->d_name);
2049 closedir(drmdir);
2050 if (len < 0 || len >= path_buf_len)
2051 return false;
2052 else
2053 return true;
2054 }
2055 }
2056
2057 closedir(drmdir);
2058
2059 DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
2060 maj, min);
2061
2062 return false;
2063 }
2064
2065 typedef void (*perf_register_oa_queries_t)(struct brw_context *);
2066
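/* Map the device to the generated brw_oa_*.h registration function that
 * populates brw->perfquery.oa_metrics_table with that platform's metric
 * sets.  A NULL return means no OA queries are exposed for the device and
 * only the pipeline statistics query remains available.
 */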
2067 static perf_register_oa_queries_t
2068 get_register_queries_function(const struct gen_device_info *devinfo)
2069 {
2070 if (devinfo->is_haswell)
2071 return brw_oa_register_queries_hsw;
2072 if (devinfo->is_cherryview)
2073 return brw_oa_register_queries_chv;
2074 if (devinfo->is_broadwell)
2075 return brw_oa_register_queries_bdw;
2076 if (devinfo->is_broxton)
2077 return brw_oa_register_queries_bxt;
2078 if (devinfo->is_skylake) {
2079 if (devinfo->gt == 2)
2080 return brw_oa_register_queries_sklgt2;
2081 if (devinfo->gt == 3)
2082 return brw_oa_register_queries_sklgt3;
2083 if (devinfo->gt == 4)
2084 return brw_oa_register_queries_sklgt4;
2085 }
2086 if (devinfo->is_kabylake) {
2087 if (devinfo->gt == 2)
2088 return brw_oa_register_queries_kblgt2;
2089 if (devinfo->gt == 3)
2090 return brw_oa_register_queries_kblgt3;
2091 }
2092 if (devinfo->is_geminilake)
2093 return brw_oa_register_queries_glk;
2094 if (devinfo->is_coffeelake) {
2095 if (devinfo->gt == 2)
2096 return brw_oa_register_queries_cflgt2;
2097 }
2098
2099 return NULL;
2100 }
2101
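/* ctx->Driver.InitPerfQueryInfo hook: build the pipeline statistics query
 * and (when the i915 perf interface is present and accessible) the OA
 * metric set queries, then return how many queries are available.
 * Subsequent calls just return the already computed count.
 */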
2102 static unsigned
2103 brw_init_perf_query_info(struct gl_context *ctx)
2104 {
2105 struct brw_context *brw = brw_context(ctx);
2106 const struct gen_device_info *devinfo = &brw->screen->devinfo;
2107 bool i915_perf_oa_available = false;
2108 struct stat sb;
2109 char sysfs_dev_dir[128];
2110 perf_register_oa_queries_t oa_register;
2111
2112 if (brw->perfquery.n_queries)
2113 return brw->perfquery.n_queries;
2114
2115 init_pipeline_statistic_query_registers(brw);
2116
2117 oa_register = get_register_queries_function(devinfo);
2118
2119 /* The existence of this sysctl parameter implies the kernel supports
2120 * the i915 perf interface.
2121 */
2122 if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {
2123
2124 /* If _paranoid == 1 then on Gen8+ we won't be able to access OA
2125 * metrics unless running as root.
2126 */
2127 if (devinfo->is_haswell)
2128 i915_perf_oa_available = true;
2129 else {
2130 uint64_t paranoid = 1;
2131
2132 read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", &paranoid);
2133
2134 if (paranoid == 0 || geteuid() == 0)
2135 i915_perf_oa_available = true;
2136 }
2137 }
2138
2139 if (i915_perf_oa_available &&
2140 oa_register &&
2141 get_sysfs_dev_dir(brw, sysfs_dev_dir, sizeof(sysfs_dev_dir)) &&
2142 init_oa_sys_vars(brw, sysfs_dev_dir))
2143 {
2144 brw->perfquery.oa_metrics_table =
2145 _mesa_hash_table_create(NULL, _mesa_key_hash_string,
2146 _mesa_key_string_equal);
2147
2148 /* Index all the metric sets mesa knows about before looking to see what
2149 * the kernel is advertising.
2150 */
2151 oa_register(brw);
2152
2153 if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) &&
2154 kernel_has_dynamic_config_support(brw, sysfs_dev_dir))
2155 init_oa_configs(brw, sysfs_dev_dir);
2156 else
2157 enumerate_sysfs_metrics(brw, sysfs_dev_dir);
2158 }
2159
2160 brw->perfquery.unaccumulated =
2161 ralloc_array(brw, struct brw_perf_query_object *, 2);
2162 brw->perfquery.unaccumulated_elements = 0;
2163 brw->perfquery.unaccumulated_array_size = 2;
2164
2165 exec_list_make_empty(&brw->perfquery.sample_buffers);
2166 exec_list_make_empty(&brw->perfquery.free_sample_buffers);
2167
2168 /* It's convenient to guarantee that this linked list of sample
2169 * buffers is never empty, so we add an empty head; that way, when we
2170 * begin an OA query we can always take a reference on a buffer
2171 * in this list.
2172 */
2173 struct brw_oa_sample_buf *buf = get_free_sample_buf(brw);
2174 exec_list_push_head(&brw->perfquery.sample_buffers, &buf->link);
2175
2176 brw->perfquery.oa_stream_fd = -1;
2177
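/* Report ids stamped into each query's MI_REPORT_PERF_COUNT begin/end
 * snapshots are allocated upward from this base; the particular value of
 * 1000 appears to be arbitrary, it just gives query reports an easily
 * recognizable id.
 */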
2178 brw->perfquery.next_query_start_report_id = 1000;
2179
2180 return brw->perfquery.n_queries;
2181 }
2182
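/* Wire up the GL_INTEL_performance_query driver hooks; the heavier
 * initialization is deferred to brw_init_perf_query_info(), which runs the
 * first time core Mesa asks for query info.
 */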
2183 void
2184 brw_init_performance_queries(struct brw_context *brw)
2185 {
2186 struct gl_context *ctx = &brw->ctx;
2187
2188 ctx->Driver.InitPerfQueryInfo = brw_init_perf_query_info;
2189 ctx->Driver.GetPerfQueryInfo = brw_get_perf_query_info;
2190 ctx->Driver.GetPerfCounterInfo = brw_get_perf_counter_info;
2191 ctx->Driver.NewPerfQueryObject = brw_new_perf_query_object;
2192 ctx->Driver.DeletePerfQuery = brw_delete_perf_query;
2193 ctx->Driver.BeginPerfQuery = brw_begin_perf_query;
2194 ctx->Driver.EndPerfQuery = brw_end_perf_query;
2195 ctx->Driver.WaitPerfQuery = brw_wait_perf_query;
2196 ctx->Driver.IsPerfQueryReady = brw_is_perf_query_ready;
2197 ctx->Driver.GetPerfQueryData = brw_get_perf_query_data;
2198 }