/*
 * Copyright © 2013 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
/**
 * \file brw_performance_query.c
 *
 * Implementation of the GL_INTEL_performance_query extension.
 *
 * Currently there are two possible counter sources exposed here:
 *
 * On Gen6+ hardware we have numerous 64bit Pipeline Statistics Registers
 * that we can snapshot at the beginning and end of a query.
 *
 * On Gen7.5+ we have Observability Architecture counters which are covered
 * in a separate document from the rest of the PRMs. It is available at:
 * https://01.org/linuxgraphics/documentation/driver-documentation-prms
 * => 2013 Intel Core Processor Family => Observability Performance Counters
 * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell,
 * though notably we currently only support OA counters for Haswell+.)
 */
/* put before sys/types.h to silence glibc warnings */
#ifdef MAJOR_IN_MKDEV
#include <sys/mkdev.h>
#endif
#ifdef MAJOR_IN_SYSMACROS
#include <sys/sysmacros.h>
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>

#include <xf86drm.h>
#include "drm-uapi/i915_drm.h"

#include "main/hash.h"
#include "main/macros.h"
#include "main/mtypes.h"
#include "main/performance_query.h"

#include "util/bitset.h"
#include "util/ralloc.h"
#include "util/hash_table.h"
#include "util/list.h"
#include "util/u_math.h"

#include "brw_context.h"
#include "brw_defines.h"
#include "brw_performance_query.h"
#include "intel_batchbuffer.h"

#include "perf/gen_perf.h"
#include "perf/gen_perf_mdapi.h"
#define FILE_DEBUG_FLAG DEBUG_PERFMON

#define OAREPORT_REASON_MASK           0x3f
#define OAREPORT_REASON_SHIFT          19
#define OAREPORT_REASON_TIMER          (1<<0)
#define OAREPORT_REASON_TRIGGER1       (1<<1)
#define OAREPORT_REASON_TRIGGER2       (1<<2)
#define OAREPORT_REASON_CTX_SWITCH     (1<<3)
#define OAREPORT_REASON_GO_TRANSITION  (1<<4)
/** Downcasting convenience macro. */
static inline struct brw_perf_query_object *
brw_perf_query(struct gl_perf_query_object *o)
{
   return (struct brw_perf_query_object *) o;
}
#define MI_RPC_BO_SIZE              4096
#define MI_RPC_BO_END_OFFSET_BYTES  (MI_RPC_BO_SIZE / 2)
#define MI_FREQ_START_OFFSET_BYTES  (3072)
#define MI_FREQ_END_OFFSET_BYTES    (3076)
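
/* A sketch of the MI_RPC BO layout implied by the offsets above: the Begin
 * MI_REPORT_PERF_COUNT report is written at offset 0, the End report at
 * MI_RPC_BO_END_OFFSET_BYTES (2048), and two 32bit RPSTAT frequency
 * snapshots at bytes 3072 (Begin) and 3076 (End).
 */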
/******************************************************************************/

static bool
brw_is_perf_query_ready(struct gl_context *ctx,
                        struct gl_perf_query_object *o);
static void
dump_perf_query_callback(GLuint id, void *query_void, void *brw_void)
{
   struct gl_context *ctx = brw_void;
   struct gl_perf_query_object *o = query_void;
   struct brw_perf_query_object *obj = query_void;

   switch (obj->query->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:
      DBG("%4d: %-6s %-8s BO: %-4s OA data: %-10s %-15s\n",
          id,
          o->Used ? "Dirty," : "New,",
          o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
          obj->oa.bo ? "yes," : "no,",
          brw_is_perf_query_ready(ctx, o) ? "ready," : "not ready,",
          obj->oa.results_accumulated ? "accumulated" : "not accumulated");
      break;
   case GEN_PERF_QUERY_TYPE_PIPELINE:
      DBG("%4d: %-6s %-8s BO: %-4s\n",
          id,
          o->Used ? "Dirty," : "New,",
          o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
          obj->pipeline_stats.bo ? "yes" : "no");
      break;
   default:
      unreachable("Unknown query type");
      break;
   }
}
static void
dump_perf_queries(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   DBG("Queries: (Open queries = %d, OA users = %d)\n",
       brw->perf_ctx.n_active_oa_queries, brw->perf_ctx.n_oa_users);
   _mesa_HashWalk(ctx->PerfQuery.Objects, dump_perf_query_callback, brw);
}

/******************************************************************************/
static struct oa_sample_buf *
get_free_sample_buf(struct brw_context *brw)
{
   struct exec_node *node = exec_list_pop_head(&brw->perf_ctx.free_sample_buffers);
   struct oa_sample_buf *buf;

   if (node)
      buf = exec_node_data(struct oa_sample_buf, node, link);
   else {
      buf = ralloc_size(brw, sizeof(*buf));

      exec_node_init(&buf->link);
      buf->refcount = 0;
      buf->len = 0;
   }

   return buf;
}
static void
reap_old_sample_buffers(struct brw_context *brw)
{
   struct exec_node *tail_node =
      exec_list_get_tail(&brw->perf_ctx.sample_buffers);
   struct oa_sample_buf *tail_buf =
      exec_node_data(struct oa_sample_buf, tail_node, link);

   /* Remove all old, unreferenced sample buffers walking forward from
    * the head of the list, except always leave at least one node in
    * the list so we always have a node to reference when we Begin
    * a new query.
    */
   foreach_list_typed_safe(struct oa_sample_buf, buf, link,
                           &brw->perf_ctx.sample_buffers)
   {
      if (buf->refcount == 0 && buf != tail_buf) {
         exec_node_remove(&buf->link);
         exec_list_push_head(&brw->perf_ctx.free_sample_buffers, &buf->link);
      } else
         return;
   }
}
static void
free_sample_bufs(struct brw_context *brw)
{
   foreach_list_typed_safe(struct oa_sample_buf, buf, link,
                           &brw->perf_ctx.free_sample_buffers)
      ralloc_free(buf);

   exec_list_make_empty(&brw->perf_ctx.free_sample_buffers);
}

/******************************************************************************/
/**
 * Driver hook for glGetPerfQueryInfoINTEL().
 */
static void
brw_get_perf_query_info(struct gl_context *ctx,
                        unsigned query_index,
                        const char **name,
                        GLuint *data_size,
                        GLuint *n_counters,
                        GLuint *n_active)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_perf_query_info *query =
      &brw->perf_ctx.perf->queries[query_index];

   *name = query->name;
   *data_size = query->data_size;
   *n_counters = query->n_counters;

   switch (query->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:
      *n_active = brw->perf_ctx.n_active_oa_queries;
      break;

   case GEN_PERF_QUERY_TYPE_PIPELINE:
      *n_active = brw->perf_ctx.n_active_pipeline_stats_queries;
      break;

   default:
      unreachable("Unknown query type");
      break;
   }
}
static GLuint
gen_counter_type_enum_to_gl_type(enum gen_perf_counter_type type)
{
   switch (type) {
   case GEN_PERF_COUNTER_TYPE_EVENT: return GL_PERFQUERY_COUNTER_EVENT_INTEL;
   case GEN_PERF_COUNTER_TYPE_DURATION_NORM: return GL_PERFQUERY_COUNTER_DURATION_NORM_INTEL;
   case GEN_PERF_COUNTER_TYPE_DURATION_RAW: return GL_PERFQUERY_COUNTER_DURATION_RAW_INTEL;
   case GEN_PERF_COUNTER_TYPE_THROUGHPUT: return GL_PERFQUERY_COUNTER_THROUGHPUT_INTEL;
   case GEN_PERF_COUNTER_TYPE_RAW: return GL_PERFQUERY_COUNTER_RAW_INTEL;
   case GEN_PERF_COUNTER_TYPE_TIMESTAMP: return GL_PERFQUERY_COUNTER_TIMESTAMP_INTEL;
   default:
      unreachable("Unknown counter type");
   }
}
static GLuint
gen_counter_data_type_to_gl_type(enum gen_perf_counter_data_type type)
{
   switch (type) {
   case GEN_PERF_COUNTER_DATA_TYPE_BOOL32: return GL_PERFQUERY_COUNTER_DATA_BOOL32_INTEL;
   case GEN_PERF_COUNTER_DATA_TYPE_UINT32: return GL_PERFQUERY_COUNTER_DATA_UINT32_INTEL;
   case GEN_PERF_COUNTER_DATA_TYPE_UINT64: return GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
   case GEN_PERF_COUNTER_DATA_TYPE_FLOAT: return GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL;
   case GEN_PERF_COUNTER_DATA_TYPE_DOUBLE: return GL_PERFQUERY_COUNTER_DATA_DOUBLE_INTEL;
   default:
      unreachable("Unknown counter data type");
   }
}
/**
 * Driver hook for glGetPerfCounterInfoINTEL().
 */
static void
brw_get_perf_counter_info(struct gl_context *ctx,
                          unsigned query_index,
                          unsigned counter_index,
                          const char **name,
                          const char **desc,
                          GLuint *offset,
                          GLuint *data_size,
                          GLuint *type_enum,
                          GLuint *data_type_enum,
                          GLuint64 *raw_max)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_perf_query_info *query =
      &brw->perf_ctx.perf->queries[query_index];
   const struct gen_perf_query_counter *counter =
      &query->counters[counter_index];

   *name = counter->name;
   *desc = counter->desc;
   *offset = counter->offset;
   *data_size = gen_perf_query_counter_get_size(counter);
   *type_enum = gen_counter_type_enum_to_gl_type(counter->type);
   *data_type_enum = gen_counter_data_type_to_gl_type(counter->data_type);
   *raw_max = counter->raw_max;
}

/******************************************************************************/
/**
 * Emit MI_STORE_REGISTER_MEM commands to capture all of the
 * pipeline statistics for the performance query object.
 */
static void
snapshot_statistics_registers(struct brw_context *brw,
                              struct brw_perf_query_object *obj,
                              uint32_t offset_in_bytes)
{
   const struct gen_perf_query_info *query = obj->query;
   const int n_counters = query->n_counters;

   for (int i = 0; i < n_counters; i++) {
      const struct gen_perf_query_counter *counter = &query->counters[i];

      assert(counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64);

      brw_store_register_mem64(brw, obj->pipeline_stats.bo,
                               counter->pipeline_stat.reg,
                               offset_in_bytes + i * sizeof(uint64_t));
   }
}
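
/* For example, each call writes one 64bit value per counter, with counter i
 * landing at offset_in_bytes + i * 8. brw_begin_perf_query() snapshots at
 * offset 0 and brw_end_perf_query() at STATS_BO_END_OFFSET_BYTES, so
 * get_pipeline_stats_data() can later compute end[i] - start[i].
 */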
/**
 * Add a query to the global list of "unaccumulated queries."
 *
 * Queries are tracked here until all the associated OA reports have
 * been accumulated via accumulate_oa_reports() after the end
 * MI_REPORT_PERF_COUNT has landed in query->oa.bo.
 */
static void
add_to_unaccumulated_query_list(struct brw_context *brw,
                                struct brw_perf_query_object *obj)
{
   if (brw->perf_ctx.unaccumulated_elements >=
       brw->perf_ctx.unaccumulated_array_size)
   {
      brw->perf_ctx.unaccumulated_array_size *= 1.5;
      brw->perf_ctx.unaccumulated =
         reralloc(brw, brw->perf_ctx.unaccumulated,
                  struct brw_perf_query_object *,
                  brw->perf_ctx.unaccumulated_array_size);
   }

   brw->perf_ctx.unaccumulated[brw->perf_ctx.unaccumulated_elements++] = obj;
}
/**
 * Remove a query from the global list of unaccumulated queries once
 * the OA reports associated with the query have been accumulated in
 * accumulate_oa_reports(), or when discarding unwanted query results.
 */
static void
drop_from_unaccumulated_query_list(struct brw_context *brw,
                                   struct brw_perf_query_object *obj)
{
   for (int i = 0; i < brw->perf_ctx.unaccumulated_elements; i++) {
      if (brw->perf_ctx.unaccumulated[i] == obj) {
         int last_elt = --brw->perf_ctx.unaccumulated_elements;

         if (i == last_elt)
            brw->perf_ctx.unaccumulated[i] = NULL;
         else {
            brw->perf_ctx.unaccumulated[i] =
               brw->perf_ctx.unaccumulated[last_elt];
         }

         break;
      }
   }

   /* Drop our samples_head reference so that associated periodic
    * sample data buffers can potentially be reaped if they aren't
    * referenced by any other queries...
    */

   struct oa_sample_buf *buf =
      exec_node_data(struct oa_sample_buf, obj->oa.samples_head, link);

   assert(buf->refcount > 0);
   buf->refcount--;

   obj->oa.samples_head = NULL;

   reap_old_sample_buffers(brw);
}
static bool
inc_n_oa_users(struct brw_context *brw)
{
   if (brw->perf_ctx.n_oa_users == 0 &&
       drmIoctl(brw->perf_ctx.oa_stream_fd,
                I915_PERF_IOCTL_ENABLE, 0) < 0)
   {
      return false;
   }
   ++brw->perf_ctx.n_oa_users;

   return true;
}
static void
dec_n_oa_users(struct brw_context *brw)
{
   /* Disabling the i915 perf stream will effectively disable the OA
    * counters. Note it's important to be sure there are no outstanding
    * MI_RPC commands at this point since they could stall the CS
    * indefinitely once OACONTROL is disabled.
    */
   --brw->perf_ctx.n_oa_users;
   if (brw->perf_ctx.n_oa_users == 0 &&
       drmIoctl(brw->perf_ctx.oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0)
   {
      DBG("WARNING: Error disabling i915 perf stream: %m\n");
   }
}
/* In general, if we see anything spurious while accumulating results,
 * we don't try to continue accumulating the current query, hoping for
 * the best; we scrap anything outstanding and then hope for the best
 * with new queries.
 */
static void
discard_all_queries(struct brw_context *brw)
{
   while (brw->perf_ctx.unaccumulated_elements) {
      struct brw_perf_query_object *obj = brw->perf_ctx.unaccumulated[0];

      obj->oa.results_accumulated = true;
      drop_from_unaccumulated_query_list(brw, brw->perf_ctx.unaccumulated[0]);

      dec_n_oa_users(brw);
   }
}
enum OaReadStatus {
   OA_READ_STATUS_ERROR,
   OA_READ_STATUS_UNFINISHED,
   OA_READ_STATUS_FINISHED,
};
static enum OaReadStatus
read_oa_samples_until(struct brw_context *brw,
                      uint32_t start_timestamp,
                      uint32_t end_timestamp)
{
   struct exec_node *tail_node =
      exec_list_get_tail(&brw->perf_ctx.sample_buffers);
   struct oa_sample_buf *tail_buf =
      exec_node_data(struct oa_sample_buf, tail_node, link);
   uint32_t last_timestamp = tail_buf->last_timestamp;

   while (true) {
      struct oa_sample_buf *buf = get_free_sample_buf(brw);
      uint32_t offset;
      int len;

      while ((len = read(brw->perf_ctx.oa_stream_fd, buf->buf,
                         sizeof(buf->buf))) < 0 && errno == EINTR)
         ;

      if (len <= 0) {
         exec_list_push_tail(&brw->perf_ctx.free_sample_buffers, &buf->link);

         if (len < 0) {
            if (errno == EAGAIN)
               return ((last_timestamp - start_timestamp) >=
                       (end_timestamp - start_timestamp)) ?
                      OA_READ_STATUS_FINISHED :
                      OA_READ_STATUS_UNFINISHED;
            else {
               DBG("Error reading i915 perf samples: %m\n");
            }
         } else
            DBG("Spurious EOF reading i915 perf samples\n");

         return OA_READ_STATUS_ERROR;
      }

      buf->len = len;
      exec_list_push_tail(&brw->perf_ctx.sample_buffers, &buf->link);

      /* Go through the reports and update the last timestamp. */
      offset = 0;
      while (offset < buf->len) {
         const struct drm_i915_perf_record_header *header =
            (const struct drm_i915_perf_record_header *) &buf->buf[offset];
         uint32_t *report = (uint32_t *) (header + 1);

         if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
            last_timestamp = report[1];

         offset += header->size;
      }

      buf->last_timestamp = last_timestamp;
   }

   unreachable("not reached");
   return OA_READ_STATUS_ERROR;
}
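
/* A note on the stream format consumed above: each read() returns a series
 * of variable-size records, each led by a drm_i915_perf_record_header. For
 * DRM_I915_PERF_RECORD_SAMPLE records the payload following the header is an
 * OA report, whose second dword (report[1]) is the 32bit OA timestamp used
 * for the delimiting checks here and in accumulate_oa_reports().
 */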
/**
 * Try to read all the reports until either the delimiting timestamp
 * or an error arises.
 */
static bool
read_oa_samples_for_query(struct brw_context *brw,
                          struct brw_perf_query_object *obj)
{
   uint32_t *start;
   uint32_t *last;
   uint32_t *end;

   /* We need the MI_REPORT_PERF_COUNT to land before we can start
    * accumulating. */
   assert(!brw_batch_references(&brw->batch, obj->oa.bo) &&
          !brw_bo_busy(obj->oa.bo));

   /* Map the BO once here and let accumulate_oa_reports() unmap
    * it. */
   if (obj->oa.map == NULL)
      obj->oa.map = brw_bo_map(brw, obj->oa.bo, MAP_READ);

   start = last = obj->oa.map;
   end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;

   if (start[0] != obj->oa.begin_report_id) {
      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
      return true;
   }
   if (end[0] != (obj->oa.begin_report_id + 1)) {
      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
      return true;
   }

   /* Read the reports until the end timestamp. */
   switch (read_oa_samples_until(brw, start[1], end[1])) {
   case OA_READ_STATUS_ERROR:
      /* Fallthrough and let accumulate_oa_reports() deal with the
       * error. */
   case OA_READ_STATUS_FINISHED:
      return true;
   case OA_READ_STATUS_UNFINISHED:
      return false;
   }

   unreachable("invalid read status");
   return false;
}
/**
 * Accumulate raw OA counter values based on deltas between pairs of
 * OA reports.
 *
 * Accumulation starts from the first report captured via
 * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the
 * last MI_RPC report requested by brw_end_perf_query(). Between these
 * two reports there may also be some number of periodically sampled OA
 * reports collected via the i915 perf interface - depending on the
 * duration of the query.
 *
 * These periodic snapshots help to ensure we handle counter overflow
 * correctly by being frequent enough to ensure we don't miss multiple
 * overflows of a counter between snapshots. For Gen8+ the i915 perf
 * snapshots provide the extra context-switch reports that let us
 * subtract out the progress of counters associated with other
 * contexts running on the system.
 */
static void
accumulate_oa_reports(struct brw_context *brw,
                      struct brw_perf_query_object *obj)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_perf_query_object *o = &obj->base;
   uint32_t *start;
   uint32_t *last;
   uint32_t *end;
   struct exec_node *first_samples_node;
   bool in_ctx = true;
   int out_duration = 0;

   assert(obj->oa.map != NULL);

   start = last = obj->oa.map;
   end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;

   if (start[0] != obj->oa.begin_report_id) {
      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
      goto error;
   }
   if (end[0] != (obj->oa.begin_report_id + 1)) {
      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
      goto error;
   }

   /* See if we have any periodic reports to accumulate too... */

   /* N.B. The oa.samples_head was set when the query began and
    * pointed to the tail of the brw->perf_ctx.sample_buffers list at
    * the time the query started. Since the buffer existed before the
    * first MI_REPORT_PERF_COUNT command was emitted we therefore know
    * that no data in this particular node's buffer can possibly be
    * associated with the query - so skip ahead one...
    */
   first_samples_node = obj->oa.samples_head->next;

   foreach_list_typed_from(struct oa_sample_buf, buf, link,
                           &brw->perf_ctx.sample_buffers,
                           first_samples_node)
   {
      int offset = 0;

      while (offset < buf->len) {
         const struct drm_i915_perf_record_header *header =
            (const struct drm_i915_perf_record_header *)(buf->buf + offset);

         assert(header->size != 0);
         assert(header->size <= buf->len);

         offset += header->size;

         switch (header->type) {
         case DRM_I915_PERF_RECORD_SAMPLE: {
            uint32_t *report = (uint32_t *)(header + 1);
            bool add = true;

            /* Ignore reports that come before the start marker.
             * (Note: takes care to allow overflow of 32bit timestamps)
             */
            if (gen_device_info_timebase_scale(devinfo,
                                               report[1] - start[1]) > 5000000000) {
               continue;
            }

            /* Ignore reports that come after the end marker.
             * (Note: takes care to allow overflow of 32bit timestamps)
             */
            if (gen_device_info_timebase_scale(devinfo,
                                               report[1] - end[1]) <= 5000000000) {
               goto end;
            }
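
            /* Both checks above lean on unsigned 32bit wrap-around:
             * report[1] - start[1] is a modular delta, so a report taken
             * just before the start marker shows up as a huge positive
             * delta. Scaling to nanoseconds and comparing against 5
             * seconds classifies such reports as out of range without
             * needing wider timestamps.
             */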
            /* For Gen8+ since the counters continue while other
             * contexts are running we need to discount any unrelated
             * deltas. The hardware automatically generates a report
             * on context switch which gives us a new reference point
             * to continue adding deltas from.
             *
             * For Haswell we can rely on the HW to stop the progress
             * of OA counters while any other context is active.
             */
            if (devinfo->gen >= 8) {
               if (in_ctx && report[2] != obj->oa.result.hw_id) {
                  DBG("i915 perf: Switch AWAY (observed by ID change)\n");
                  in_ctx = false;
                  out_duration = 0;
               } else if (in_ctx == false && report[2] == obj->oa.result.hw_id) {
                  DBG("i915 perf: Switch TO\n");
                  in_ctx = true;

                  /* From experimentation in IGT, we found that the OA unit
                   * might label some report as "idle" (using an invalid
                   * context ID), right after a report for a given context.
                   * Deltas generated by those reports actually belong to the
                   * previous context, even though they're not labelled as
                   * such.
                   *
                   * We didn't *really* Switch AWAY in the case that we e.g.
                   * saw a single periodic report while idle...
                   */
                  if (out_duration >= 1)
                     add = false;
               } else if (in_ctx) {
                  assert(report[2] == obj->oa.result.hw_id);
                  DBG("i915 perf: Continuation IN\n");
               } else {
                  assert(report[2] != obj->oa.result.hw_id);
                  DBG("i915 perf: Continuation OUT\n");
                  add = false;
                  out_duration++;
               }
            }

            if (add) {
               gen_perf_query_result_accumulate(&obj->oa.result, obj->query,
                                                last, report);
            }

            last = report;

            break;
         }

         case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
            DBG("i915 perf: OA error: all reports lost\n");
            goto error;
         case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
            DBG("i915 perf: OA report lost\n");
            break;
         }
      }
   }

end:

   gen_perf_query_result_accumulate(&obj->oa.result, obj->query,
                                    last, end);

   DBG("Marking %d accumulated - results gathered\n", o->Id);

   obj->oa.results_accumulated = true;
   drop_from_unaccumulated_query_list(brw, obj);
   dec_n_oa_users(brw);

   return;

error:

   discard_all_queries(brw);
}
/******************************************************************************/

static bool
open_i915_perf_oa_stream(struct brw_context *brw,
                         int metrics_set_id,
                         int report_format,
                         int period_exponent,
                         int drm_fd,
                         uint32_t ctx_id)
{
   uint64_t properties[] = {
      /* Single context sampling */
      DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id,

      /* Include OA reports in samples */
      DRM_I915_PERF_PROP_SAMPLE_OA, true,

      /* OA unit configuration */
      DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
      DRM_I915_PERF_PROP_OA_FORMAT, report_format,
      DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent,
   };
   struct drm_i915_perf_open_param param = {
      .flags = I915_PERF_FLAG_FD_CLOEXEC |
               I915_PERF_FLAG_FD_NONBLOCK |
               I915_PERF_FLAG_DISABLED,
      .num_properties = ARRAY_SIZE(properties) / 2,
      .properties_ptr = (uintptr_t) properties,
   };
   int fd = drmIoctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
   if (fd == -1) {
      DBG("Error opening i915 perf OA stream: %m\n");
      return false;
   }

   brw->perf_ctx.oa_stream_fd = fd;

   brw->perf_ctx.current_oa_metrics_set_id = metrics_set_id;
   brw->perf_ctx.current_oa_format = report_format;

   return true;
}
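
/* The properties array above is a flattened list of (key, value) pairs,
 * which is why num_properties is ARRAY_SIZE(properties) / 2: adding another
 * property means appending both a DRM_I915_PERF_PROP_* key and its value.
 */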
static void
close_perf(struct brw_context *brw,
           const struct gen_perf_query_info *query)
{
   if (brw->perf_ctx.oa_stream_fd != -1) {
      close(brw->perf_ctx.oa_stream_fd);
      brw->perf_ctx.oa_stream_fd = -1;
   }
   if (query->kind == GEN_PERF_QUERY_TYPE_RAW) {
      struct gen_perf_query_info *raw_query =
         (struct gen_perf_query_info *) query;
      raw_query->oa_metrics_set_id = 0;
   }
}
static void
capture_frequency_stat_register(struct brw_context *brw,
                                struct brw_bo *bo,
                                uint32_t bo_offset)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (devinfo->gen >= 7 && devinfo->gen <= 8 &&
       !devinfo->is_baytrail && !devinfo->is_cherryview) {
      brw_store_register_mem32(brw, bo, GEN7_RPSTAT1, bo_offset);
   } else if (devinfo->gen >= 9) {
      brw_store_register_mem32(brw, bo, GEN9_RPSTAT0, bo_offset);
   }
}
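
/* The 32bit register snapshot stored here is decoded later by
 * read_gt_frequency(), which extracts the current GT frequency field
 * appropriate to the generation.
 */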
/**
 * Driver hook for glBeginPerfQueryINTEL().
 */
static bool
brw_begin_perf_query(struct gl_context *ctx,
                     struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);
   const struct gen_perf_query_info *query = obj->query;
   struct gen_perf_config *perf_cfg = brw->perf_ctx.perf;

   /* We can assume the frontend hides mistaken attempts to Begin a
    * query object multiple times before its End. Similarly if an
    * application reuses a query object before results have arrived
    * the frontend will wait for prior results so we don't need
    * to support abandoning in-flight results.
    */
   assert(!o->Used || o->Ready); /* no in-flight query to worry about */

   DBG("Begin(%d)\n", o->Id);

   /* XXX: We have to consider that the command parser unit that parses batch
    * buffer commands and is used to capture begin/end counter snapshots isn't
    * implicitly synchronized with what's currently running across other GPU
    * units (such as the EUs running shaders) that the performance counters are
    * associated with.
    *
    * The intention of performance queries is to measure the work associated
    * with commands between the begin/end delimiters and so for that to be the
    * case we need to explicitly synchronize the parsing of commands to capture
    * Begin/End counter snapshots with what's running across other parts of the
    * GPU.
    *
    * When the command parser reaches a Begin marker it effectively needs to
    * drain everything currently running on the GPU until the hardware is idle
    * before capturing the first snapshot of counters - otherwise the results
    * would also be measuring the effects of earlier commands.
    *
    * When the command parser reaches an End marker it needs to stall until
    * everything currently running on the GPU has finished before capturing the
    * end snapshot - otherwise the results won't be a complete representation
    * of the work.
    *
    * Theoretically there could be opportunities to minimize how much of the
    * GPU pipeline is drained, or that we stall for, when we know what specific
    * units the performance counters being queried relate to but we don't
    * currently attempt to be clever here.
    *
    * Note: with our current simple approach here then for back-to-back queries
    * we will redundantly emit duplicate commands to synchronize the command
    * streamer with the rest of the GPU pipeline, but we assume that in HW the
    * second synchronization is effectively a NOOP.
    *
    * N.B. The final results are based on deltas of counters between (inside)
    * Begin/End markers so even though the total wall clock time of the
    * workload is stretched by larger pipeline bubbles the bubbles themselves
    * are generally invisible to the query results. Whether that's a good or a
    * bad thing depends on the use case. For a lower real-time impact while
    * capturing metrics then periodic sampling may be a better choice than
    * INTEL_performance_query.
    *
    * This is our Begin synchronization point to drain current work on the
    * GPU before we capture our first counter snapshot...
    */
   brw_emit_mi_flush(brw);

   switch (query->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW: {

      /* Opening an i915 perf stream implies exclusive access to the OA unit
       * which will generate counter reports for a specific counter set with a
       * specific layout/format so we can't begin any OA based queries that
       * require a different counter set or format unless we get an opportunity
       * to close the stream and open a new one...
       */
      uint64_t metric_id = gen_perf_query_get_metric_id(brw->perf_ctx.perf, query);

      if (brw->perf_ctx.oa_stream_fd != -1 &&
          brw->perf_ctx.current_oa_metrics_set_id != metric_id) {

         if (brw->perf_ctx.n_oa_users != 0) {
            DBG("WARNING: Begin(%d) failed already using perf config=%i/%"PRIu64"\n",
                o->Id, brw->perf_ctx.current_oa_metrics_set_id, metric_id);
            return false;
         } else
            close_perf(brw, query);
      }

      /* If the OA counters aren't already on, enable them. */
      if (brw->perf_ctx.oa_stream_fd == -1) {
         __DRIscreen *screen = brw->screen->driScrnPriv;
         const struct gen_device_info *devinfo = &brw->screen->devinfo;

         /* The period_exponent gives a sampling period as follows:
          *   sample_period = timestamp_period * 2^(period_exponent + 1)
          *
          * The timestamp increments every 80ns (HSW), ~52ns (GEN9LP) or
          * ~83ns (GEN8/9).
          *
          * The counter overflow period is derived from the EuActive counter
          * which reads a counter that increments by the number of clock
          * cycles multiplied by the number of EUs. It can be calculated as:
          *
          * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2)
          *
          * (E.g. 40 EUs @ 1GHz = ~53ms)
          *
          * We select a sampling period shorter than that overflow period to
          * ensure we cannot see more than 1 counter overflow, otherwise we
          * could lose information.
          */

         int a_counter_in_bits = 32;
         if (devinfo->gen >= 8)
            a_counter_in_bits = 40;

         uint64_t overflow_period = pow(2, a_counter_in_bits) /
            (brw->perf_ctx.perf->sys_vars.n_eus *
             /* drop 1GHz freq to have units in nanoseconds */
             2);

         DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
             overflow_period, overflow_period / 1000000ul,
             brw->perf_ctx.perf->sys_vars.n_eus);
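
         /* A worked example of the overflow formula above, assuming the
          * 32bit A counters found before Gen8 and a 40 EU part:
          *
          *   2^32 / (40 * 2) = ~53,687,091 ns ~= 54 ms
          *
          * which matches the ~53ms figure quoted in the comment.
          */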
         int period_exponent = 0;
         uint64_t prev_sample_period, next_sample_period;
         for (int e = 0; e < 30; e++) {
            prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
            next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;

            /* Take the previous sampling period, lower than the overflow
             * period.
             */
            if (prev_sample_period < overflow_period &&
                next_sample_period > overflow_period)
               period_exponent = e + 1;
         }

         if (period_exponent == 0) {
            DBG("WARNING: unable to find a sampling exponent\n");
            return false;
         }

         DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
             prev_sample_period / 1000000ul);
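
         /* For instance, assuming the 80ns HSW timestamp period mentioned
          * above and a ~54ms overflow period, the loop settles on
          * period_exponent = 19: its candidate period 80ns * 2^19 ~= 42ms
          * is the largest power-of-two period still below the overflow
          * period (the next step, ~84ms, would exceed it).
          */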
         if (!open_i915_perf_oa_stream(brw,
                                       metric_id,
                                       query->oa_format,
                                       period_exponent,
                                       screen->fd, /* drm fd */
                                       brw->hw_ctx))
            return false;
      } else {
         assert(brw->perf_ctx.current_oa_metrics_set_id == metric_id &&
                brw->perf_ctx.current_oa_format == query->oa_format);
      }

      if (!inc_n_oa_users(brw)) {
         DBG("WARNING: Error enabling i915 perf stream: %m\n");
         return false;
      }

      if (obj->oa.bo) {
         brw->perf_ctx.perf->vtbl.bo_unreference(obj->oa.bo);
         obj->oa.bo = NULL;
      }

      obj->oa.bo =
         brw->perf_ctx.perf->vtbl.bo_alloc(brw->bufmgr,
                                           "perf. query OA MI_RPC bo",
                                           MI_RPC_BO_SIZE);
#ifdef DEBUG
      /* Pre-filling the BO helps debug whether writes landed. */
      void *map = brw_bo_map(brw, obj->oa.bo, MAP_WRITE);
      memset(map, 0x80, MI_RPC_BO_SIZE);
      brw_bo_unmap(obj->oa.bo);
#endif

      obj->oa.begin_report_id = brw->perf_ctx.next_query_start_report_id;
      brw->perf_ctx.next_query_start_report_id += 2;

      /* We flush the batchbuffer here to minimize the chances that MI_RPC
       * delimiting commands end up in different batchbuffers. If that's the
       * case, the measurement will include the time it takes for the kernel
       * scheduler to load a new request into the hardware. This is manifested
       * in tools like frameretrace by spikes in the "GPU Core Clocks" counter.
       */
      perf_cfg->vtbl.batchbuffer_flush(brw, __FILE__, __LINE__);

      /* Take a starting OA counter snapshot. */
      brw->perf_ctx.perf->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo, 0,
                                                         obj->oa.begin_report_id);
      perf_cfg->vtbl.capture_frequency_stat_register(brw, obj->oa.bo,
                                                     MI_FREQ_START_OFFSET_BYTES);

      ++brw->perf_ctx.n_active_oa_queries;
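
      /* Report IDs come in begin/end pairs: brw_end_perf_query() writes the
       * end snapshot with obj->oa.begin_report_id + 1, which is why
       * next_query_start_report_id advances by 2 per query. The IDs let
       * read_oa_samples_for_query() sanity check that both MI_RPC reports
       * actually landed in the BO.
       */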
      /* No already-buffered samples can possibly be associated with this query
       * so create a marker within the list of sample buffers enabling us to
       * easily ignore earlier samples when processing this query after
       * completion.
       */
      assert(!exec_list_is_empty(&brw->perf_ctx.sample_buffers));
      obj->oa.samples_head = exec_list_get_tail(&brw->perf_ctx.sample_buffers);

      struct oa_sample_buf *buf =
         exec_node_data(struct oa_sample_buf, obj->oa.samples_head, link);

      /* This reference will ensure that future/following sample
       * buffers (that may relate to this query) can't be freed until
       * this drops to zero.
       */
      buf->refcount++;

      gen_perf_query_result_clear(&obj->oa.result);
      obj->oa.results_accumulated = false;

      add_to_unaccumulated_query_list(brw, obj);
      break;
   }

   case GEN_PERF_QUERY_TYPE_PIPELINE:
      if (obj->pipeline_stats.bo) {
         brw->perf_ctx.perf->vtbl.bo_unreference(obj->pipeline_stats.bo);
         obj->pipeline_stats.bo = NULL;
      }

      obj->pipeline_stats.bo =
         brw->perf_ctx.perf->vtbl.bo_alloc(brw->bufmgr,
                                           "perf. query pipeline stats bo",
                                           STATS_BO_SIZE);

      /* Take starting snapshots. */
      snapshot_statistics_registers(brw, obj, 0);

      ++brw->perf_ctx.n_active_pipeline_stats_queries;
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   if (INTEL_DEBUG & DEBUG_PERFMON)
      dump_perf_queries(brw);

   return true;
}
/**
 * Driver hook for glEndPerfQueryINTEL().
 */
static void
brw_end_perf_query(struct gl_context *ctx,
                   struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);
   struct gen_perf_config *perf_cfg = brw->perf_ctx.perf;

   DBG("End(%d)\n", o->Id);

   /* Ensure that the work associated with the queried commands will have
    * finished before taking our query end counter readings.
    *
    * For more details see comment in brw_begin_perf_query for
    * corresponding flush.
    */
   brw_emit_mi_flush(brw);

   switch (obj->query->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:

      /* NB: It's possible that the query will have already been marked
       * as 'accumulated' if an error was seen while reading samples
       * from perf. In this case we mustn't try and emit a closing
       * MI_RPC command in case the OA unit has already been disabled.
       */
      if (!obj->oa.results_accumulated) {
         /* Take an ending OA counter snapshot. */
         perf_cfg->vtbl.capture_frequency_stat_register(brw, obj->oa.bo,
                                                        MI_FREQ_END_OFFSET_BYTES);
         brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo,
                                             MI_RPC_BO_END_OFFSET_BYTES,
                                             obj->oa.begin_report_id + 1);
      }

      --brw->perf_ctx.n_active_oa_queries;

      /* NB: even though the query has now ended, it can't be accumulated
       * until the end MI_REPORT_PERF_COUNT snapshot has been written to
       * query->oa.bo.
       */
      break;

   case GEN_PERF_QUERY_TYPE_PIPELINE:
      snapshot_statistics_registers(brw, obj,
                                    STATS_BO_END_OFFSET_BYTES);
      --brw->perf_ctx.n_active_pipeline_stats_queries;
      break;

   default:
      unreachable("Unknown query type");
      break;
   }
}
static void
brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);
   struct brw_bo *bo = NULL;
   struct gen_perf_config *perf_cfg = brw->perf_ctx.perf;

   assert(!o->Ready);

   switch (obj->query->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:
      bo = obj->oa.bo;
      break;

   case GEN_PERF_QUERY_TYPE_PIPELINE:
      bo = obj->pipeline_stats.bo;
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   if (bo == NULL)
      return;

   /* If the current batch references our results bo then we need to
    * flush first...
    */
   if (brw_batch_references(&brw->batch, bo))
      perf_cfg->vtbl.batchbuffer_flush(brw, __FILE__, __LINE__);

   brw_bo_wait_rendering(bo);

   /* Due to a race condition between the OA unit signaling report
    * availability and the report actually being written into memory,
    * we need to wait for all the reports to come in before we can
    * read them.
    */
   if (obj->query->kind == GEN_PERF_QUERY_TYPE_OA ||
       obj->query->kind == GEN_PERF_QUERY_TYPE_RAW) {
      while (!read_oa_samples_for_query(brw, obj))
         ;
   }
}
static bool
brw_is_perf_query_ready(struct gl_context *ctx,
                        struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);

   if (o->Ready)
      return true;

   switch (obj->query->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:
      return (obj->oa.results_accumulated ||
              (obj->oa.bo &&
               !brw_batch_references(&brw->batch, obj->oa.bo) &&
               !brw_bo_busy(obj->oa.bo) &&
               read_oa_samples_for_query(brw, obj)));
   case GEN_PERF_QUERY_TYPE_PIPELINE:
      return (obj->pipeline_stats.bo &&
              !brw_batch_references(&brw->batch, obj->pipeline_stats.bo) &&
              !brw_bo_busy(obj->pipeline_stats.bo));

   default:
      unreachable("Unknown query type");
      break;
   }

   return false;
}
static void
read_slice_unslice_frequencies(struct brw_context *brw,
                               struct brw_perf_query_object *obj)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint32_t *begin_report = obj->oa.map,
      *end_report = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;

   gen_perf_query_result_read_frequencies(&obj->oa.result,
                                          devinfo, begin_report, end_report);
}
static void
read_gt_frequency(struct brw_context *brw,
                  struct brw_perf_query_object *obj)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint32_t start = *((uint32_t *)(obj->oa.map + MI_FREQ_START_OFFSET_BYTES)),
      end = *((uint32_t *)(obj->oa.map + MI_FREQ_END_OFFSET_BYTES));

   switch (devinfo->gen) {
   case 7:
   case 8:
      obj->oa.gt_frequency[0] = GET_FIELD(start, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
      obj->oa.gt_frequency[1] = GET_FIELD(end, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
      break;
   case 9:
   case 10:
   case 11:
      obj->oa.gt_frequency[0] = GET_FIELD(start, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
      obj->oa.gt_frequency[1] = GET_FIELD(end, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
      break;
   default:
      unreachable("unexpected gen");
   }

   /* Put the numbers into Hz. */
   obj->oa.gt_frequency[0] *= 1000000ULL;
   obj->oa.gt_frequency[1] *= 1000000ULL;
}
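
/* A worked example of the decoding above: on Gen7/8 the RPSTAT field counts
 * in 50MHz units, so a raw field value of 20 becomes 20 * 50 = 1000 (MHz)
 * and then 1,000,000,000 Hz after the final scaling. On Gen9+ the unit is
 * 50/3 MHz, hence the extra divide by 3.
 */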
static int
get_oa_counter_data(struct brw_context *brw,
                    struct brw_perf_query_object *obj,
                    size_t data_size,
                    uint8_t *data)
{
   struct gen_perf_config *perf = brw->perf_ctx.perf;
   const struct gen_perf_query_info *query = obj->query;
   int n_counters = query->n_counters;
   int written = 0;

   for (int i = 0; i < n_counters; i++) {
      const struct gen_perf_query_counter *counter = &query->counters[i];
      uint64_t *out_uint64;
      float *out_float;
      size_t counter_size = gen_perf_query_counter_get_size(counter);

      if (counter_size) {
         switch (counter->data_type) {
         case GEN_PERF_COUNTER_DATA_TYPE_UINT64:
            out_uint64 = (uint64_t *)(data + counter->offset);
            *out_uint64 =
               counter->oa_counter_read_uint64(perf, query,
                                               obj->oa.result.accumulator);
            break;
         case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
            out_float = (float *)(data + counter->offset);
            *out_float =
               counter->oa_counter_read_float(perf, query,
                                              obj->oa.result.accumulator);
            break;
         default:
            /* So far we aren't using uint32, double or bool32... */
            unreachable("unexpected counter data type");
         }
         written = counter->offset + counter_size;
      }
   }

   return written;
}
static int
get_pipeline_stats_data(struct brw_context *brw,
                        struct brw_perf_query_object *obj,
                        size_t data_size,
                        uint8_t *data)
{
   const struct gen_perf_query_info *query = obj->query;
   int n_counters = obj->query->n_counters;
   uint8_t *p = data;

   uint64_t *start = brw_bo_map(brw, obj->pipeline_stats.bo, MAP_READ);
   uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));

   for (int i = 0; i < n_counters; i++) {
      const struct gen_perf_query_counter *counter = &query->counters[i];
      uint64_t value = end[i] - start[i];

      if (counter->pipeline_stat.numerator !=
          counter->pipeline_stat.denominator) {
         value *= counter->pipeline_stat.numerator;
         value /= counter->pipeline_stat.denominator;
      }

      *((uint64_t *)p) = value;
      p += 8;
   }

   brw_bo_unmap(obj->pipeline_stats.bo);

   return p - data;
}
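
/* An example of the numerator/denominator scaling above: on Haswell and
 * Gen8, init_pipeline_statistic_query_registers() registers
 * PS_INVOCATION_COUNT with numerator 1 and denominator 4, so the raw delta
 * read from the BO is divided by 4 before being handed to the application.
 */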
/**
 * Driver hook for glGetPerfQueryDataINTEL().
 */
static void
brw_get_perf_query_data(struct gl_context *ctx,
                        struct gl_perf_query_object *o,
                        GLsizei data_size,
                        GLuint *data,
                        GLuint *bytes_written)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);
   int written = 0;

   assert(brw_is_perf_query_ready(ctx, o));

   DBG("GetData(%d)\n", o->Id);

   if (INTEL_DEBUG & DEBUG_PERFMON)
      dump_perf_queries(brw);

   /* We expect that the frontend only calls this hook when it knows
    * that results are available.
    */
   assert(o->Ready);

   switch (obj->query->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:
      if (!obj->oa.results_accumulated) {
         read_gt_frequency(brw, obj);
         read_slice_unslice_frequencies(brw, obj);
         accumulate_oa_reports(brw, obj);
         assert(obj->oa.results_accumulated);

         brw_bo_unmap(obj->oa.bo);
         obj->oa.map = NULL;
      }
      if (obj->query->kind == GEN_PERF_QUERY_TYPE_OA) {
         written = get_oa_counter_data(brw, obj, data_size, (uint8_t *)data);
      } else {
         const struct gen_device_info *devinfo = &brw->screen->devinfo;

         written = gen_perf_query_result_write_mdapi((uint8_t *)data, data_size,
                                                     devinfo, &obj->oa.result,
                                                     obj->oa.gt_frequency[0],
                                                     obj->oa.gt_frequency[1]);
      }
      break;

   case GEN_PERF_QUERY_TYPE_PIPELINE:
      written = get_pipeline_stats_data(brw, obj, data_size, (uint8_t *)data);
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   if (bytes_written)
      *bytes_written = written;
}
static struct gl_perf_query_object *
brw_new_perf_query_object(struct gl_context *ctx, unsigned query_index)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_perf_query_info *query =
      &brw->perf_ctx.perf->queries[query_index];
   struct brw_perf_query_object *obj =
      calloc(1, sizeof(struct brw_perf_query_object));

   if (!obj)
      return NULL;

   obj->query = query;

   brw->perf_ctx.n_query_instances++;

   return &obj->base;
}
/**
 * Driver hook for glDeletePerfQueryINTEL().
 */
static void
brw_delete_perf_query(struct gl_context *ctx,
                      struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);
   struct gen_perf_config *perf_cfg = brw->perf_ctx.perf;

   /* We can assume that the frontend waits for a query to complete
    * before ever calling into here, so we don't have to worry about
    * deleting an in-flight query object.
    */
   assert(!o->Used || o->Ready);

   DBG("Delete(%d)\n", o->Id);

   switch (obj->query->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:
      if (obj->oa.bo) {
         if (!obj->oa.results_accumulated) {
            drop_from_unaccumulated_query_list(brw, obj);
            dec_n_oa_users(brw);
         }

         perf_cfg->vtbl.bo_unreference(obj->oa.bo);
         obj->oa.bo = NULL;
      }

      obj->oa.results_accumulated = false;
      break;

   case GEN_PERF_QUERY_TYPE_PIPELINE:
      if (obj->pipeline_stats.bo) {
         perf_cfg->vtbl.bo_unreference(obj->pipeline_stats.bo);
         obj->pipeline_stats.bo = NULL;
      }
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   /* As an indication that the INTEL_performance_query extension is no
    * longer in use, it's a good time to free our cache of sample
    * buffers and close any current i915-perf stream.
    */
   if (--brw->perf_ctx.n_query_instances == 0) {
      free_sample_bufs(brw);
      close_perf(brw, obj->query);
   }

   free(obj);
}

/******************************************************************************/
static void
init_pipeline_statistic_query_registers(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gen_perf_config *perf = brw->perf_ctx.perf;
   struct gen_perf_query_info *query =
      gen_perf_query_append_query_info(perf, MAX_STAT_COUNTERS);

   query->kind = GEN_PERF_QUERY_TYPE_PIPELINE;
   query->name = "Pipeline Statistics Registers";

   gen_perf_query_info_add_basic_stat_reg(query, IA_VERTICES_COUNT,
                                          "N vertices submitted");
   gen_perf_query_info_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
                                          "N primitives submitted");
   gen_perf_query_info_add_basic_stat_reg(query, VS_INVOCATION_COUNT,
                                          "N vertex shader invocations");

   if (devinfo->gen == 6) {
      gen_perf_query_info_add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
                                       "SO_PRIM_STORAGE_NEEDED",
                                       "N geometry shader stream-out primitives (total)");
      gen_perf_query_info_add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1,
                                       "SO_NUM_PRIMS_WRITTEN",
                                       "N geometry shader stream-out primitives (written)");
   } else {
      gen_perf_query_info_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
                                       "SO_PRIM_STORAGE_NEEDED (Stream 0)",
                                       "N stream-out (stream 0) primitives (total)");
      gen_perf_query_info_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
                                       "SO_PRIM_STORAGE_NEEDED (Stream 1)",
                                       "N stream-out (stream 1) primitives (total)");
      gen_perf_query_info_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
                                       "SO_PRIM_STORAGE_NEEDED (Stream 2)",
                                       "N stream-out (stream 2) primitives (total)");
      gen_perf_query_info_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
                                       "SO_PRIM_STORAGE_NEEDED (Stream 3)",
                                       "N stream-out (stream 3) primitives (total)");
      gen_perf_query_info_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
                                       "SO_NUM_PRIMS_WRITTEN (Stream 0)",
                                       "N stream-out (stream 0) primitives (written)");
      gen_perf_query_info_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
                                       "SO_NUM_PRIMS_WRITTEN (Stream 1)",
                                       "N stream-out (stream 1) primitives (written)");
      gen_perf_query_info_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
                                       "SO_NUM_PRIMS_WRITTEN (Stream 2)",
                                       "N stream-out (stream 2) primitives (written)");
      gen_perf_query_info_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
                                       "SO_NUM_PRIMS_WRITTEN (Stream 3)",
                                       "N stream-out (stream 3) primitives (written)");
   }

   gen_perf_query_info_add_basic_stat_reg(query, HS_INVOCATION_COUNT,
                                          "N TCS shader invocations");
   gen_perf_query_info_add_basic_stat_reg(query, DS_INVOCATION_COUNT,
                                          "N TES shader invocations");

   gen_perf_query_info_add_basic_stat_reg(query, GS_INVOCATION_COUNT,
                                          "N geometry shader invocations");
   gen_perf_query_info_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
                                          "N geometry shader primitives emitted");

   gen_perf_query_info_add_basic_stat_reg(query, CL_INVOCATION_COUNT,
                                          "N primitives entering clipping");
   gen_perf_query_info_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
                                          "N primitives leaving clipping");

   if (devinfo->is_haswell || devinfo->gen == 8) {
      gen_perf_query_info_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
                                       "N fragment shader invocations",
                                       "N fragment shader invocations");
   } else {
      gen_perf_query_info_add_basic_stat_reg(query, PS_INVOCATION_COUNT,
                                             "N fragment shader invocations");
   }

   gen_perf_query_info_add_basic_stat_reg(query, PS_DEPTH_COUNT,
                                          "N z-pass fragments");

   if (devinfo->gen >= 7) {
      gen_perf_query_info_add_basic_stat_reg(query, CS_INVOCATION_COUNT,
                                             "N compute shader invocations");
   }

   query->data_size = sizeof(uint64_t) * query->n_counters;
}
/* gen_device_info will have incorrect default topology values for
 * unsupported kernels. Verify kernel support to ensure OA metrics are
 * accurate.
 */
static bool
oa_metrics_kernel_support(int fd, const struct gen_device_info *devinfo)
{
   if (devinfo->gen >= 10) {
      /* topology uAPI required for CNL+ (kernel 4.17+) make a call to the api
       * to verify support
       */
      struct drm_i915_query_item item = {
         .query_id = DRM_I915_QUERY_TOPOLOGY_INFO,
      };
      struct drm_i915_query query = {
         .num_items = 1,
         .items_ptr = (uintptr_t) &item,
      };

      /* kernel 4.17+ supports the query */
      return drmIoctl(fd, DRM_IOCTL_I915_QUERY, &query) == 0;
   }

   if (devinfo->gen >= 8) {
      /* 4.13+ api required for gen8 - gen9 */
      int mask;
      struct drm_i915_getparam gp = {
         .param = I915_PARAM_SLICE_MASK,
         .value = &mask,
      };
      /* kernel 4.13+ supports this parameter */
      return drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0;
   }

   if (devinfo->gen == 7)
      /* default topology values are correct for HSW */
      return true;

   /* OA is not supported before Gen7. */
   return false;
}
static void *
brw_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size)
{
   return brw_bo_alloc(bufmgr, name, size, BRW_MEMZONE_OTHER);
}

static void
brw_oa_emit_mi_report_perf_count(void *c,
                                 void *bo,
                                 uint32_t offset_in_bytes,
                                 uint32_t report_id)
{
   struct brw_context *ctx = c;
   ctx->vtbl.emit_mi_report_perf_count(ctx,
                                       bo,
                                       offset_in_bytes,
                                       report_id);
}

typedef void (*bo_unreference_t)(void *);
typedef void (*emit_mi_report_t)(void *, void *, uint32_t, uint32_t);

static void
brw_oa_batchbuffer_flush(void *c, const char *file, int line)
{
   struct brw_context *ctx = c;
   _intel_batchbuffer_flush_fence(ctx, -1, NULL, file, line);
}

typedef void (*capture_frequency_stat_register_t)(void *, void *, uint32_t);
static unsigned
brw_init_perf_query_info(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   __DRIscreen *screen = brw->screen->driScrnPriv;

   struct gen_perf_config *perf_cfg = brw->perf_ctx.perf;
   if (perf_cfg)
      return perf_cfg->n_queries;

   perf_cfg = gen_perf_new(brw);
   brw->perf_ctx.perf = perf_cfg;
   perf_cfg->vtbl.bo_alloc = brw_oa_bo_alloc;
   perf_cfg->vtbl.bo_unreference = (bo_unreference_t)brw_bo_unreference;
   perf_cfg->vtbl.emit_mi_report_perf_count =
      (emit_mi_report_t)brw_oa_emit_mi_report_perf_count;
   perf_cfg->vtbl.batchbuffer_flush = brw_oa_batchbuffer_flush;
   perf_cfg->vtbl.capture_frequency_stat_register =
      (capture_frequency_stat_register_t) capture_frequency_stat_register;

   init_pipeline_statistic_query_registers(brw);
   gen_perf_query_register_mdapi_statistic_query(&brw->screen->devinfo,
                                                 brw->perf_ctx.perf);

   if ((oa_metrics_kernel_support(screen->fd, devinfo)) &&
       (gen_perf_load_oa_metrics(perf_cfg, screen->fd, devinfo)))
      gen_perf_query_register_mdapi_oa_query(&brw->screen->devinfo,
                                             brw->perf_ctx.perf);

   brw->perf_ctx.unaccumulated =
      ralloc_array(brw, struct brw_perf_query_object *, 2);
   brw->perf_ctx.unaccumulated_elements = 0;
   brw->perf_ctx.unaccumulated_array_size = 2;

   exec_list_make_empty(&brw->perf_ctx.sample_buffers);
   exec_list_make_empty(&brw->perf_ctx.free_sample_buffers);

   /* It's convenient to guarantee that this linked list of sample
    * buffers is never empty so we add an empty head so when we
    * Begin an OA query we can always take a reference on a buffer
    * in this list.
    */
   struct oa_sample_buf *buf = get_free_sample_buf(brw);
   exec_list_push_head(&brw->perf_ctx.sample_buffers, &buf->link);

   brw->perf_ctx.oa_stream_fd = -1;

   brw->perf_ctx.next_query_start_report_id = 1000;

   return perf_cfg->n_queries;
}
void
brw_init_performance_queries(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   ctx->Driver.InitPerfQueryInfo = brw_init_perf_query_info;
   ctx->Driver.GetPerfQueryInfo = brw_get_perf_query_info;
   ctx->Driver.GetPerfCounterInfo = brw_get_perf_counter_info;
   ctx->Driver.NewPerfQueryObject = brw_new_perf_query_object;
   ctx->Driver.DeletePerfQuery = brw_delete_perf_query;
   ctx->Driver.BeginPerfQuery = brw_begin_perf_query;
   ctx->Driver.EndPerfQuery = brw_end_perf_query;
   ctx->Driver.WaitPerfQuery = brw_wait_perf_query;
   ctx->Driver.IsPerfQueryReady = brw_is_perf_query_ready;
   ctx->Driver.GetPerfQueryData = brw_get_perf_query_data;
}