- /* XXX: The command parser unit that parses batch buffer commands and is
- * used to capture begin/end counter snapshots isn't implicitly
- * synchronized with what's currently running on the other GPU units (such
- * as the EUs running shaders) that the performance counters are associated
- * with.
- *
- * The intention of performance queries is to measure the work associated
- * with the commands between the begin/end delimiters, so we need to
- * explicitly synchronize the parsing of the commands that capture the
- * Begin/End counter snapshots with what's running across other parts of
- * the GPU.
- *
- * When the command parser reaches a Begin marker, it effectively needs to
- * drain everything currently running on the GPU until the hardware is idle
- * before capturing the first snapshot of counters - otherwise the results
- * would also be measuring the effects of earlier commands.
- *
- * When the command parser reaches an End marker, it needs to stall until
- * everything currently running on the GPU has finished before capturing the
- * end snapshot - otherwise the results won't be a complete representation
- * of the work.
- *
- * Theoretically there could be opportunities to minimize how much of the
- * GPU pipeline is drained, or stalled for, when we know which specific
- * units the queried performance counters relate to, but we don't currently
- * attempt to be clever here.
- *
- * Note: with our current simple approach, back-to-back queries will
- * redundantly emit duplicate commands to synchronize the command streamer
- * with the rest of the GPU pipeline, but we assume that in HW the second
- * synchronization is effectively a NOOP.
- *
- * N.B. The final results are based on deltas of counters between (inside)
- * the Begin/End markers, so even though the total wall clock time of the
- * workload is stretched by larger pipeline bubbles, the bubbles themselves
- * are generally invisible to the query results. Whether that's a good or a
- * bad thing depends on the use case. For a lower real-time impact while
- * capturing metrics, periodic sampling may be a better choice than
- * INTEL_performance_query.
- *
- * This is our Begin synchronization point to drain current work on the
- * GPU before we capture our first counter snapshot...
- */
- brw_emit_mi_flush(brw);
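- /* (brw_emit_mi_flush() stalls and flushes the full GPU pipeline.) */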
-
- switch (query->kind) {
- case OA_COUNTERS:
-
- /* Opening an i915 perf stream implies exclusive access to the OA unit,
- * which will generate counter reports for a specific counter set with a
- * specific layout/format, so we can't begin any OA based queries that
- * require a different counter set or format unless we get an opportunity
- * to close the stream and open a new one...
- */
- if (brw->perfquery.oa_stream_fd != -1 &&
- brw->perfquery.current_oa_metrics_set_id !=
- query->oa_metrics_set_id) {
-
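- /* We can't close and reconfigure the stream while other in-flight
- * queries still depend on its current counter set and format.
- */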
- if (brw->perfquery.n_oa_users != 0)
- return false;
- else
- close_perf(brw);
- }
-
- /* If the OA counters aren't already on, enable them. */
- if (brw->perfquery.oa_stream_fd == -1) {
- __DRIscreen *screen = brw->screen->driScrnPriv;
- uint32_t ctx_id;
- int period_exponent;
-
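- /* The stream will be opened for this GL context's specific hardware
- * context, so we first need its global ID from the kernel.
- */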
- if (drm_intel_gem_context_get_id(brw->hw_ctx, &ctx_id) != 0)
- return false;
-
- /* The timestamp for HSW+ increments every 80ns
- *
- * The period_exponent gives a sampling period as follows:
- * sample_period = 80ns * 2^(period_exponent + 1)
- *
- * The overflow period for Haswell can be calculated as:
- *
- * 2^32 / (n_eus * max_gen_freq * 2)
- * (E.g. 40 EUs @ 1GHz = ~53ms)
- *
- * We currently sample every 42 milliseconds...
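- * (80ns * 2^(18 + 1) = ~41.9ms)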
- */
- period_exponent = 18;
-
- if (!open_i915_perf_oa_stream(brw,
- query->oa_metrics_set_id,
- query->oa_format,
- period_exponent,
- screen->fd, /* drm fd */
- ctx_id))
- return false;
- } else {
- assert(brw->perfquery.current_oa_metrics_set_id ==
- query->oa_metrics_set_id &&
- brw->perfquery.current_oa_format ==
- query->oa_format);
- }
-
- if (!inc_n_oa_users(brw)) {
- DBG("WARNING: Error enabling i915 perf stream: %m\n");
- return false;
- }
-
- if (obj->oa.bo) {
- drm_intel_bo_unreference(obj->oa.bo);
- obj->oa.bo = NULL;
- }
-
- obj->oa.bo =
- drm_intel_bo_alloc(brw->bufmgr, "perf. query OA MI_RPC bo",
- MI_RPC_BO_SIZE, 64);
-#ifdef DEBUG
- /* Pre-filling the BO helps debug whether writes landed. */
- drm_intel_bo_map(obj->oa.bo, true);
- memset((char *) obj->oa.bo->virtual, 0x80, MI_RPC_BO_SIZE);
- drm_intel_bo_unmap(obj->oa.bo);
-#endif
-
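- /* Report IDs are allocated in pairs: the matching End snapshot is
- * expected to use begin_report_id + 1.
- */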
- obj->oa.begin_report_id = brw->perfquery.next_query_start_report_id;
- brw->perfquery.next_query_start_report_id += 2;
-
- /* Take a starting OA counter snapshot. */
- emit_mi_report_perf_count(brw, obj->oa.bo, 0,
- obj->oa.begin_report_id);
- ++brw->perfquery.n_active_oa_queries;
-
- /* No already-buffered samples can possibly be associated with this
- * query, so create a marker within the list of sample buffers enabling
- * us to easily ignore earlier samples when processing this query after
- * completion.
- */
- assert(!exec_list_is_empty(&brw->perfquery.sample_buffers));
- obj->oa.samples_head = exec_list_get_tail(&brw->perfquery.sample_buffers);
-
- struct brw_oa_sample_buf *buf =
- exec_node_data(struct brw_oa_sample_buf, obj->oa.samples_head, link);
-
- /* This reference will ensure that future/following sample
- * buffers (that may relate to this query) can't be freed until
- * this reference count drops to zero.
- */
- buf->refcount++;
-
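- /* Counter deltas between OA reports will be accumulated here while the
- * query is in flight, starting from zero.
- */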
- memset(obj->oa.accumulator, 0, sizeof(obj->oa.accumulator));
- obj->oa.results_accumulated = false;
-
- add_to_unaccumulated_query_list(brw, obj);
- break;
-
- case PIPELINE_STATS:
- if (obj->pipeline_stats.bo) {
- drm_intel_bo_unreference(obj->pipeline_stats.bo);
- obj->pipeline_stats.bo = NULL;
- }
-
- obj->pipeline_stats.bo =
- drm_intel_bo_alloc(brw->bufmgr, "perf. query pipeline stats bo",
- STATS_BO_SIZE, 64);
-
- /* Take starting snapshots. */
- snapshot_statistics_registers(brw, obj, 0);
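- /* (Offset 0 is the begin slot; the end snapshots are expected to land
- * at a later offset in the same BO.)
- */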
-
- ++brw->perfquery.n_active_pipeline_stats_queries;
- break;
- }
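-
- /* For reference, a minimal sketch of the matching End side, assuming the
- * same helpers used above; MI_RPC_BO_END_OFFSET_BYTES and
- * STATS_BO_END_OFFSET are hypothetical names for the second snapshot
- * slots:
- *
- * brw_emit_mi_flush(brw);
- * emit_mi_report_perf_count(brw, obj->oa.bo,
- * MI_RPC_BO_END_OFFSET_BYTES,
- * obj->oa.begin_report_id + 1);
- *
- * with the PIPELINE_STATS case instead calling
- * snapshot_statistics_registers(brw, obj, STATS_BO_END_OFFSET).
- */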