#include <unistd.h>
#include <errno.h>
+#ifndef HAVE_DIRENT_D_TYPE
+#include <limits.h> /* PATH_MAX */
+#endif
+
#include <drm-uapi/i915_drm.h>
#include "common/gen_gem.h"
#include "gen_perf.h"
+#include "gen_perf_regs.h"
#include "perf/gen_perf_mdapi.h"
#include "perf/gen_perf_metrics.h"
#include "dev/gen_debug.h"
#include "dev/gen_device_info.h"
#include "util/bitscan.h"
+#include "util/mesa-sha1.h"
#include "util/u_math.h"
#define FILE_DEBUG_FLAG DEBUG_PERFMON
#define MAP_READ (1 << 0)
#define MAP_WRITE (1 << 1)
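+/* Marker for an OA report context ID that isn't attributable to a specific
+ * GPU context (also used to flag an unset hw_id in query results).
+ */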
+#define OA_REPORT_INVALID_CTX_ID (0xffffffff)
+
/**
* Periodic OA samples are read() into these buffer structures via the
* i915 perf kernel interface and appended to the
uint32_t last_timestamp;
};
+/**
+ * gen representation of a performance query object.
+ *
+ * NB: We want to keep this structure relatively lean considering that
+ * applications may expect to allocate enough objects to be able to
+ * query around all draw calls in a frame.
+ */
+struct gen_perf_query_object
+{
+ const struct gen_perf_query_info *queryinfo;
+
+ /* See query->kind to know which state below is in use... */
+ union {
+ struct {
+
+ /**
+ * BO containing OA counter snapshots at query Begin/End time.
+ */
+ void *bo;
+
+ /**
+ * Address of the mapping of @bo
+ */
+ void *map;
+
+ /**
+ * The MI_REPORT_PERF_COUNT command lets us specify a unique
+ * ID that will be reflected in the resulting OA report
+ * that's written by the GPU. This is the ID we're expecting
+ * in the begin report; the end report should be
+ * @begin_report_id + 1.
+ */
+ int begin_report_id;
+
+ /**
+ * Reference the head of the perf context's sample_buffers
+ * list at the time that the query started (so we only need
+ * to look at nodes after this point when looking for samples
+ * related to this query)
+ *
+ * (See struct oa_sample_buf description for more details)
+ */
+ struct exec_node *samples_head;
+
+ /**
+ * false while in the unaccumulated_elements list, and set to
+ * true when the final, end MI_RPC snapshot has been
+ * accumulated.
+ */
+ bool results_accumulated;
+
+ /**
+ * Frequency of the GT at begin and end of the query.
+ */
+ uint64_t gt_frequency[2];
+
+ /**
+ * Accumulated OA results between begin and end of the query.
+ */
+ struct gen_perf_query_result result;
+ } oa;
+
+ struct {
+ /**
+ * BO containing starting and ending snapshots for the
+ * statistics counters.
+ */
+ void *bo;
+ } pipeline_stats;
+ };
+};
+
+struct gen_perf_context {
+ struct gen_perf_config *perf;
+
+ void *ctx; /* driver context (e.g. brw_context) */
+ void *bufmgr;
+ const struct gen_device_info *devinfo;
+
+ uint32_t hw_ctx;
+ int drm_fd;
+
+ /* The i915 perf stream we open to setup + enable the OA counters */
+ int oa_stream_fd;
+
+ /* An i915 perf stream fd gives exclusive access to the OA unit that will
+ * report counter snapshots for a specific counter set/profile in a
+ * specific layout/format so we can only start OA queries that are
+ * compatible with the currently open fd...
+ */
+ int current_oa_metrics_set_id;
+ int current_oa_format;
+
+ /* List of buffers containing OA reports */
+ struct exec_list sample_buffers;
+
+ /* Cached list of empty sample buffers */
+ struct exec_list free_sample_buffers;
+
+ int n_active_oa_queries;
+ int n_active_pipeline_stats_queries;
+
+ /* The number of queries depending on running OA counters which
+ * extends beyond brw_end_perf_query() since we need to wait until
+ * the last MI_RPC command has been parsed by the GPU.
+ *
+ * Accurate accounting is important here as emitting an
+ * MI_REPORT_PERF_COUNT command while the OA unit is disabled will
+ * effectively hang the gpu.
+ */
+ int n_oa_users;
+
+ /* To help catch a spurious problem with the hardware or perf
+ * forwarding samples, we emit each MI_REPORT_PERF_COUNT command
+ * with a unique ID that we can explicitly check for...
+ */
+ int next_query_start_report_id;
+
+ /**
+ * An array of queries whose results haven't yet been assembled
+ * based on the data in buffer objects.
+ *
+ * These may be active, or have already ended. However, the
+ * results have not been requested.
+ */
+ struct gen_perf_query_object **unaccumulated;
+ int unaccumulated_elements;
+ int unaccumulated_array_size;
+
+ /* The total number of query objects so we can relinquish
+ * our exclusive access to perf if the application deletes
+ * all of its objects. (NB: We only disable perf while
+ * there are no active queries)
+ */
+ int n_query_instances;
+};
+
+const struct gen_perf_query_info*
+gen_perf_query_info(const struct gen_perf_query_object *query)
+{
+ return query->queryinfo;
+}
+
+struct gen_perf_context *
+gen_perf_new_context(void *parent)
+{
+ struct gen_perf_context *ctx = rzalloc(parent, struct gen_perf_context);
+ if (!ctx)
+ fprintf(stderr, "%s: failed to alloc context\n", __func__);
+ return ctx;
+}
+
+struct gen_perf_config *
+gen_perf_config(struct gen_perf_context *ctx)
+{
+ return ctx->perf;
+}
+
struct gen_perf_query_object *
gen_perf_new_query(struct gen_perf_context *perf_ctx, unsigned query_index)
{
return obj;
}
+int
+gen_perf_active_queries(struct gen_perf_context *perf_ctx,
+ const struct gen_perf_query_info *query)
+{
+ assert(perf_ctx->n_active_oa_queries == 0 || perf_ctx->n_active_pipeline_stats_queries == 0);
+
+ switch (query->kind) {
+ case GEN_PERF_QUERY_TYPE_OA:
+ case GEN_PERF_QUERY_TYPE_RAW:
+ return perf_ctx->n_active_oa_queries;
+
+ case GEN_PERF_QUERY_TYPE_PIPELINE:
+ return perf_ctx->n_active_pipeline_stats_queries;
+
+ default:
+ unreachable("Unknown query type");
+ }
+}
+
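+/* i915 uAPI structs carry pointers as u64 fields; this packs a CPU pointer
+ * into that representation.
+ */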
+static inline uint64_t to_user_pointer(void *ptr)
+{
+ return (uintptr_t) ptr;
+}
+
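+/* Returns true if the directory entry is a directory or a symlink, falling
+ * back to lstat() on platforms where dirent doesn't expose d_type.
+ */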
+static bool
+is_dir_or_link(const struct dirent *entry, const char *parent_dir)
+{
+#ifdef HAVE_DIRENT_D_TYPE
+ return entry->d_type == DT_DIR || entry->d_type == DT_LNK;
+#else
+ struct stat st;
+ char path[PATH_MAX + 1];
+ snprintf(path, sizeof(path), "%s/%s", parent_dir, entry->d_name);
+ if (lstat(path, &st) != 0)
+ return false;
+ return S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode);
+#endif
+}
+
static bool
get_sysfs_dev_dir(struct gen_perf_config *perf, int fd)
{
}
while ((drm_entry = readdir(drmdir))) {
- if ((drm_entry->d_type == DT_DIR ||
- drm_entry->d_type == DT_LNK) &&
+ if (is_dir_or_link(drm_entry, perf->sysfs_dev_dir) &&
strncmp(drm_entry->d_name, "card", 4) == 0)
{
len = snprintf(perf->sysfs_dev_dir,
const struct gen_perf_query_info *query,
uint64_t config_id)
{
- struct gen_perf_query_info *registred_query = append_query_info(perf, 0);
+ struct gen_perf_query_info *registered_query = append_query_info(perf, 0);
- *registred_query = *query;
- registred_query->oa_metrics_set_id = config_id;
- DBG("metric set registred: id = %" PRIu64", guid = %s\n",
- registred_query->oa_metrics_set_id, query->guid);
+ *registered_query = *query;
+ registered_query->oa_metrics_set_id = config_id;
+ DBG("metric set registered: id = %" PRIu64", guid = %s\n",
+ registered_query->oa_metrics_set_id, query->guid);
}
static void
while ((metric_entry = readdir(metricsdir))) {
struct hash_entry *entry;
-
- if ((metric_entry->d_type != DT_DIR &&
- metric_entry->d_type != DT_LNK) ||
+ if (!is_dir_or_link(metric_entry, buf) ||
metric_entry->d_name[0] == '.')
continue;
metric_entry->d_name);
if (entry) {
uint64_t id;
-
- len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id",
- perf->sysfs_dev_dir, metric_entry->d_name);
- if (len < 0 || len >= sizeof(buf)) {
- DBG("Failed to concatenate path to sysfs metric id file\n");
- continue;
- }
-
- if (!read_file_uint64(buf, &id)) {
+ if (!gen_perf_load_metric_id(perf, metric_entry->d_name, &id)) {
DBG("Failed to read metric set id from %s: %m", buf);
continue;
}
&invalid_config_id) < 0 && errno == ENOENT;
}
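+/* Thin wrapper around DRM_IOCTL_I915_QUERY for submitting an array of query
+ * items to the kernel.
+ */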
+static int
+i915_query_items(struct gen_perf_config *perf, int fd,
+ struct drm_i915_query_item *items, uint32_t n_items)
+{
+ struct drm_i915_query q = {
+ .num_items = n_items,
+ .items_ptr = to_user_pointer(items),
+ };
+ return gen_ioctl(fd, DRM_IOCTL_I915_QUERY, &q);
+}
+
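+/* Probe whether the kernel supports listing perf configurations through the
+ * i915 query uAPI.
+ */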
+static bool
+i915_query_perf_config_supported(struct gen_perf_config *perf, int fd)
+{
+ struct drm_i915_query_item item = {
+ .query_id = DRM_I915_QUERY_PERF_CONFIG,
+ .flags = DRM_I915_QUERY_PERF_CONFIG_LIST,
+ };
+
+ return i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0;
+}
+
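+/* Fetch the perf configuration registered under @guid. As used here, the
+ * register counts/pointers in @config are both input and output: a first
+ * call with zeroed counts returns the sizes, a second call with allocated
+ * buffers retrieves the register values.
+ */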
static bool
-load_metric_id(struct gen_perf_config *perf, const char *guid,
- uint64_t *metric_id)
+i915_query_perf_config_data(struct gen_perf_config *perf,
+ int fd, const char *guid,
+ struct drm_i915_perf_oa_config *config)
+{
+ struct {
+ struct drm_i915_query_perf_config query;
+ struct drm_i915_perf_oa_config config;
+ } item_data;
+ struct drm_i915_query_item item = {
+ .query_id = DRM_I915_QUERY_PERF_CONFIG,
+ .flags = DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID,
+ .data_ptr = to_user_pointer(&item_data),
+ .length = sizeof(item_data),
+ };
+
+ memset(&item_data, 0, sizeof(item_data));
+ memcpy(item_data.query.uuid, guid, sizeof(item_data.query.uuid));
+ memcpy(&item_data.config, config, sizeof(item_data.config));
+
+ if (!(i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0))
+ return false;
+
+ memcpy(config, &item_data.config, sizeof(item_data.config));
+
+ return true;
+}
+
+bool
+gen_perf_load_metric_id(struct gen_perf_config *perf_cfg,
+ const char *guid,
+ uint64_t *metric_id)
{
char config_path[280];
snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id",
- perf->sysfs_dev_dir, guid);
+ perf_cfg->sysfs_dev_dir, guid);
/* Don't recreate already loaded configs. */
return read_file_uint64(config_path, metric_id);
}
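+/* Register an OA configuration with the kernel; returns the new metric set
+ * id on success, 0 on failure.
+ */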
+static uint64_t
+i915_add_config(struct gen_perf_config *perf, int fd,
+ const struct gen_perf_registers *config,
+ const char *guid)
+{
+ struct drm_i915_perf_oa_config i915_config = { 0, };
+
+ memcpy(i915_config.uuid, guid, sizeof(i915_config.uuid));
+
+ i915_config.n_mux_regs = config->n_mux_regs;
+ i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs);
+
+ i915_config.n_boolean_regs = config->n_b_counter_regs;
+ i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs);
+
+ i915_config.n_flex_regs = config->n_flex_regs;
+ i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs);
+
+ int ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &i915_config);
+ return ret > 0 ? ret : 0;
+}
+
static void
init_oa_configs(struct gen_perf_config *perf, int fd)
{
hash_table_foreach(perf->oa_metrics_table, entry) {
const struct gen_perf_query_info *query = entry->data;
- struct drm_i915_perf_oa_config config;
uint64_t config_id;
- int ret;
- if (load_metric_id(perf, query->guid, &config_id)) {
+ if (gen_perf_load_metric_id(perf, query->guid, &config_id)) {
DBG("metric set: %s (already loaded)\n", query->guid);
register_oa_config(perf, query, config_id);
continue;
}
- memset(&config, 0, sizeof(config));
-
- memcpy(config.uuid, query->guid, sizeof(config.uuid));
-
- config.n_mux_regs = query->n_mux_regs;
- config.mux_regs_ptr = (uintptr_t) query->mux_regs;
-
- config.n_boolean_regs = query->n_b_counter_regs;
- config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs;
-
- config.n_flex_regs = query->n_flex_regs;
- config.flex_regs_ptr = (uintptr_t) query->flex_regs;
-
- ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
+ int ret = i915_add_config(perf, fd, &query->config, query->guid);
if (ret < 0) {
DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
query->name, query->guid, strerror(errno));
}
if (devinfo->is_cannonlake)
return gen_oa_register_queries_cnl;
- if (devinfo->gen == 11)
+ if (devinfo->gen == 11) {
+ if (devinfo->is_elkhartlake)
+ return gen_oa_register_queries_lkf;
return gen_oa_register_queries_icl;
+ }
+ if (devinfo->gen == 12)
+ return gen_oa_register_queries_tgl;
return NULL;
}
bool i915_perf_oa_available = false;
struct stat sb;
+ perf->i915_query_supported = i915_query_perf_config_supported(perf, fd);
+
/* The existence of this sysctl parameter implies the kernel supports
* the i915 perf interface.
*/
return false;
perf->oa_metrics_table =
- _mesa_hash_table_create(perf, _mesa_key_hash_string,
+ _mesa_hash_table_create(perf, _mesa_hash_string,
_mesa_key_string_equal);
/* Index all the metric sets mesa knows about before looking to see what
return true;
}
+struct gen_perf_registers *
+gen_perf_load_configuration(struct gen_perf_config *perf_cfg, int fd, const char *guid)
+{
+ if (!perf_cfg->i915_query_supported)
+ return NULL;
+
+ struct drm_i915_perf_oa_config i915_config = { 0, };
+ if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config))
+ return NULL;
+
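+ /* The query above (with zeroed counts) only reported how many registers
+ * each array holds; allocate arrays of that size, then query again below
+ * to retrieve the actual register offset/value pairs.
+ */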
+ struct gen_perf_registers *config = rzalloc(NULL, struct gen_perf_registers);
+ config->n_flex_regs = i915_config.n_flex_regs;
+ config->flex_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_flex_regs);
+ config->n_mux_regs = i915_config.n_mux_regs;
+ config->mux_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_mux_regs);
+ config->n_b_counter_regs = i915_config.n_boolean_regs;
+ config->b_counter_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_b_counter_regs);
+
+ /*
+ * struct gen_perf_query_register_prog maps exactly to the tuple of
+ * (register offset, register value) returned by the i915.
+ */
+ i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs);
+ i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs);
+ i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs);
+ if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) {
+ ralloc_free(config);
+ return NULL;
+ }
+
+ return config;
+}
+
+uint64_t
+gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd,
+ const struct gen_perf_registers *config,
+ const char *guid)
+{
+ if (guid)
+ return i915_add_config(perf_cfg, fd, config, guid);
+
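+ /* No guid provided: derive a stable one by hashing the register
+ * programming and formatting the digest as a UUID.
+ */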
+ struct mesa_sha1 sha1_ctx;
+ _mesa_sha1_init(&sha1_ctx);
+
+ if (config->flex_regs) {
+ _mesa_sha1_update(&sha1_ctx, config->flex_regs,
+ sizeof(config->flex_regs[0]) *
+ config->n_flex_regs);
+ }
+ if (config->mux_regs) {
+ _mesa_sha1_update(&sha1_ctx, config->mux_regs,
+ sizeof(config->mux_regs[0]) *
+ config->n_mux_regs);
+ }
+ if (config->b_counter_regs) {
+ _mesa_sha1_update(&sha1_ctx, config->b_counter_regs,
+ sizeof(config->b_counter_regs[0]) *
+ config->n_b_counter_regs);
+ }
+
+ uint8_t hash[20];
+ _mesa_sha1_final(&sha1_ctx, hash);
+
+ char formatted_hash[41];
+ _mesa_sha1_format(formatted_hash, hash);
+
+ char generated_guid[37];
+ snprintf(generated_guid, sizeof(generated_guid),
+ "%.8s-%.4s-%.4s-%.4s-%.12s",
+ &formatted_hash[0], &formatted_hash[8],
+ &formatted_hash[8 + 4], &formatted_hash[8 + 4 + 4],
+ &formatted_hash[8 + 4 + 4 + 4]);
+
+ /* Check if already present. */
+ uint64_t id;
+ if (gen_perf_load_metric_id(perf_cfg, generated_guid, &id))
+ return id;
+
+ return i915_add_config(perf_cfg, fd, config, generated_guid);
+}
+
/* Accumulate 32-bit OA counters */
static inline void
accumulate_uint32(const uint32_t *report0,
*unslice_freq_hz = unslice_freq * 16666667ULL;
}
-static void
-query_result_read_frequencies(struct gen_perf_query_result *result,
- const struct gen_device_info *devinfo,
- const uint32_t *start,
- const uint32_t *end)
+void
+gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
+ const struct gen_device_info *devinfo,
+ const uint32_t *start,
+ const uint32_t *end)
{
/* Slice/Unslice frequency is only available in the OA reports when the
* "Disable OA reports due to clock ratio change" field in
&result->unslice_frequency[1]);
}
-static void
-query_result_accumulate(struct gen_perf_query_result *result,
- const struct gen_perf_query_info *query,
- const uint32_t *start,
- const uint32_t *end)
+void
+gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
+ const struct gen_perf_query_info *query,
+ const uint32_t *start,
+ const uint32_t *end)
{
int i, idx = 0;
- result->hw_id = start[2];
+ if (result->hw_id == OA_REPORT_INVALID_CTX_ID &&
+ start[2] != OA_REPORT_INVALID_CTX_ID)
+ result->hw_id = start[2];
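+ /* Dword 1 of an OA report is the report timestamp; latch it from the
+ * first report we accumulate.
+ */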
+ if (result->reports_accumulated == 0)
+ result->begin_timestamp = start[1];
result->reports_accumulated++;
switch (query->oa_format) {
}
-static void
-query_result_clear(struct gen_perf_query_result *result)
+void
+gen_perf_query_result_clear(struct gen_perf_query_result *result)
{
memset(result, 0, sizeof(*result));
- result->hw_id = 0xffffffff; /* invalid */
+ result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */
}
static void
}
struct gen_perf_query_info *raw_query = (struct gen_perf_query_info *)query;
- if (!load_metric_id(perf, query->guid,
- &raw_query->oa_metrics_set_id)) {
+ if (!gen_perf_load_metric_id(perf, query->guid,
+ &raw_query->oa_metrics_set_id)) {
DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid);
raw_query->oa_metrics_set_id = 1ULL;
} else {
exec_node_init(&buf->link);
buf->refcount = 0;
- buf->len = 0;
}
+ buf->len = 0;
return buf;
}
* pipeline statistics for the performance query object.
*/
static void
-snapshot_statistics_registers(void *context,
- struct gen_perf_config *perf,
+snapshot_statistics_registers(struct gen_perf_context *ctx,
struct gen_perf_query_object *obj,
uint32_t offset_in_bytes)
{
+ struct gen_perf_config *perf = ctx->perf;
const struct gen_perf_query_info *query = obj->queryinfo;
const int n_counters = query->n_counters;
assert(counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64);
- perf->vtbl.store_register_mem64(context, obj->pipeline_stats.bo,
- counter->pipeline_stat.reg,
- offset_in_bytes + i * sizeof(uint64_t));
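+ /* Pipeline statistics counters are 64 bits wide (the assert above
+ * checks this), hence the explicit 8-byte store.
+ */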
+ perf->vtbl.store_register_mem(ctx->ctx, obj->pipeline_stats.bo,
+ counter->pipeline_stat.reg, 8,
+ offset_in_bytes + i * sizeof(uint64_t));
}
}
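+/* Snapshot the GT frequency register into the query BO so that begin/end
+ * frequencies can be reported: GEN7_RPSTAT1 on Gen8, GEN9_RPSTAT0 on Gen9+
+ * (Cherryview is skipped).
+ */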
+static void
+snapshot_freq_register(struct gen_perf_context *ctx,
+ struct gen_perf_query_object *query,
+ uint32_t bo_offset)
+{
+ struct gen_perf_config *perf = ctx->perf;
+ const struct gen_device_info *devinfo = ctx->devinfo;
+
+ if (devinfo->gen == 8 && !devinfo->is_cherryview)
+ perf->vtbl.store_register_mem(ctx->ctx, query->oa.bo, GEN7_RPSTAT1, 4, bo_offset);
+ else if (devinfo->gen >= 9)
+ perf->vtbl.store_register_mem(ctx->ctx, query->oa.bo, GEN9_RPSTAT0, 4, bo_offset);
+}
+
static void
gen_perf_close(struct gen_perf_context *perfquery,
const struct gen_perf_query_info *query)
* end snapshot - otherwise the results won't be a complete representation
* of the work.
*
- * Theoretically there could be opportunities to minimize how much of the
- * GPU pipeline is drained, or that we stall for, when we know what specific
- * units the performance counters being queried relate to but we don't
- * currently attempt to be clever here.
- *
- * Note: with our current simple approach here then for back-to-back queries
- * we will redundantly emit duplicate commands to synchronize the command
- * streamer with the rest of the GPU pipeline, but we assume that in HW the
- * second synchronization is effectively a NOOP.
+ * To achieve this, we stall the pipeline at the pixel scoreboard (preventing
+ * any additional work from being processed by the pipeline until all pixels
+ * of the previous draw have completed).
*
* N.B. The final results are based on deltas of counters between (inside)
* Begin/End markers so even though the total wall clock time of the
* This is our Begin synchronization point to drain current work on the
* GPU before we capture our first counter snapshot...
*/
- perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+ perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
switch (queryinfo->kind) {
case GEN_PERF_QUERY_TYPE_OA:
query->oa.begin_report_id = perf_ctx->next_query_start_report_id;
perf_ctx->next_query_start_report_id += 2;
- /* We flush the batchbuffer here to minimize the chances that MI_RPC
- * delimiting commands end up in different batchbuffers. If that's the
- * case, the measurement will include the time it takes for the kernel
- * scheduler to load a new request into the hardware. This is manifested in
- * tools like frameretrace by spikes in the "GPU Core Clocks" counter.
- */
- perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);
-
/* Take a starting OA counter snapshot. */
perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, 0,
query->oa.begin_report_id);
- perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo,
- MI_FREQ_START_OFFSET_BYTES);
+ snapshot_freq_register(perf_ctx, query, MI_FREQ_START_OFFSET_BYTES);
++perf_ctx->n_active_oa_queries;
*/
buf->refcount++;
- query_result_clear(&query->oa.result);
+ gen_perf_query_result_clear(&query->oa.result);
query->oa.results_accumulated = false;
add_to_unaccumulated_query_list(perf_ctx, query);
STATS_BO_SIZE);
/* Take starting snapshots. */
- snapshot_statistics_registers(perf_ctx->ctx , perf_cfg, query, 0);
+ snapshot_statistics_registers(perf_ctx, query, 0);
++perf_ctx->n_active_pipeline_stats_queries;
break;
* For more details see comment in brw_begin_perf_query for
* corresponding stall.
*/
- perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+ perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
switch (query->queryinfo->kind) {
case GEN_PERF_QUERY_TYPE_OA:
*/
if (!query->oa.results_accumulated) {
/* Take an ending OA counter snapshot. */
- perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo,
- MI_FREQ_END_OFFSET_BYTES);
+ snapshot_freq_register(perf_ctx, query, MI_FREQ_END_OFFSET_BYTES);
perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo,
MI_RPC_BO_END_OFFSET_BYTES,
query->oa.begin_report_id + 1);
break;
case GEN_PERF_QUERY_TYPE_PIPELINE:
- snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query,
+ snapshot_statistics_registers(perf_ctx, query,
STATS_BO_END_OFFSET_BYTES);
--perf_ctx->n_active_pipeline_stats_queries;
break;
exec_list_get_tail(&perf_ctx->sample_buffers);
struct oa_sample_buf *tail_buf =
exec_node_data(struct oa_sample_buf, tail_node, link);
- uint32_t last_timestamp = tail_buf->last_timestamp;
+ uint32_t last_timestamp =
+ tail_buf->len == 0 ? start_timestamp : tail_buf->last_timestamp;
while (1) {
struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx);
exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link);
if (len < 0) {
- if (errno == EAGAIN)
- return ((last_timestamp - start_timestamp) >=
+ if (errno == EAGAIN) {
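+ /* Timestamps are 32 bits and may wrap; a delta of INT32_MAX or more
+ * most likely means last_timestamp is behind start_timestamp, so
+ * consider the read unfinished rather than falsely finished.
+ */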
+ return ((last_timestamp - start_timestamp) < INT32_MAX &&
+ (last_timestamp - start_timestamp) >=
(end_timestamp - start_timestamp)) ?
OA_READ_STATUS_FINISHED :
OA_READ_STATUS_UNFINISHED;
- else {
+ } else {
DBG("Error reading i915 perf samples: %m\n");
}
} else
}
}
+/* Checks the bit in dword 0 of an OA report that indicates whether the
+ * context ID field (dword 2) is valid.
+ */
+static bool
+oa_report_ctx_id_valid(const struct gen_device_info *devinfo,
+ const uint32_t *report)
+{
+ assert(devinfo->gen >= 8);
+ if (devinfo->gen == 8)
+ return (report[0] & (1 << 25)) != 0;
+ return (report[0] & (1 << 16)) != 0;
+}
+
/**
* Accumulate raw OA counter values based on deltas between pairs of
* OA reports.
uint32_t *last;
uint32_t *end;
struct exec_node *first_samples_node;
- bool in_ctx = true;
+ bool last_report_ctx_match = true;
int out_duration = 0;
assert(query->oa.map != NULL);
goto error;
}
+ /* On Gen12+ OA reports are sourced from per-context counters, so we don't
+ * ever have to look at the global OA buffer. Yey \o/
+ */
+ if (perf_ctx->devinfo->gen >= 12) {
+ last = start;
+ goto end;
+ }
+
/* See if we have any periodic reports to accumulate too... */
/* N.B. The oa.samples_head was set when the query began and
first_samples_node = query->oa.samples_head->next;
foreach_list_typed_from(struct oa_sample_buf, buf, link,
- &perf_ctx.sample_buffers,
+ &perf_ctx->sample_buffers,
first_samples_node)
{
int offset = 0;
switch (header->type) {
case DRM_I915_PERF_RECORD_SAMPLE: {
uint32_t *report = (uint32_t *)(header + 1);
+ bool report_ctx_match = true;
bool add = true;
/* Ignore reports that come before the start marker.
* of OA counters while any other context is active.
*/
if (devinfo->gen >= 8) {
- if (in_ctx && report[2] != query->oa.result.hw_id) {
- DBG("i915 perf: Switch AWAY (observed by ID change)\n");
- in_ctx = false;
+ /* Consider that the current report matches our context only if
+ * the report says the report ID is valid.
+ */
+ report_ctx_match = oa_report_ctx_id_valid(devinfo, report) &&
+ report[2] == start[2];
+ if (report_ctx_match)
out_duration = 0;
- } else if (in_ctx == false && report[2] == query->oa.result.hw_id) {
- DBG("i915 perf: Switch TO\n");
- in_ctx = true;
-
- /* From experimentation in IGT, we found that the OA unit
- * might label some report as "idle" (using an invalid
- * context ID), right after a report for a given context.
- * Deltas generated by those reports actually belong to the
- * previous context, even though they're not labelled as
- * such.
- *
- * We didn't *really* Switch AWAY in the case that we e.g.
- * saw a single periodic report while idle...
- */
- if (out_duration >= 1)
- add = false;
- } else if (in_ctx) {
- assert(report[2] == query->oa.result.hw_id);
- DBG("i915 perf: Continuation IN\n");
- } else {
- assert(report[2] != query->oa.result.hw_id);
- DBG("i915 perf: Continuation OUT\n");
- add = false;
+ else
out_duration++;
- }
+
+ /* Only add the delta between <last, report> if the last report
+ * was clearly identified as our context, or if we have at most
+ * 1 report without a matching ID.
+ *
+ * The OA unit will sometimes label reports with an invalid
+ * context ID when i915 rewrites the execlist submit register
+ * with the same context as the one currently running. This
+ * happens when i915 wants to notify the HW of ringbuffer tail
+ * register update. We have to consider this report as part of
+ * our context as the 3d pipeline behind the OACS unit is still
+ * processing the operations started at the previous execlist
+ * submission.
+ */
+ add = last_report_ctx_match && out_duration < 2;
}
if (add) {
- query_result_accumulate(&query->oa.result, query->queryinfo,
- last, report);
+ gen_perf_query_result_accumulate(&query->oa.result,
+ query->queryinfo,
+ last, report);
+ } else {
+ /* We're not adding the delta because we've determined it doesn't
+ * belong to the context we're filtering for. In that case the query
+ * is considered split (disjoint).
+ */
+ query->oa.result.query_disjoint = true;
}
last = report;
+ last_report_ctx_match = report_ctx_match;
break;
}
end:
- query_result_accumulate(&query->oa.result, query->queryinfo,
- last, end);
+ gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo,
+ last, end);
query->oa.results_accumulated = true;
drop_from_unaccumulated_query_list(perf_ctx, query);
read_gt_frequency(perf_ctx, query);
uint32_t *begin_report = query->oa.map;
uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
- query_result_read_frequencies(&query->oa.result,
- perf_ctx->devinfo,
- begin_report,
- end_report);
+ gen_perf_query_result_read_frequencies(&query->oa.result,
+ perf_ctx->devinfo,
+ begin_report,
+ end_report);
accumulate_oa_reports(perf_ctx, query);
assert(query->oa.results_accumulated);
if (bytes_written)
*bytes_written = written;
}
+
+void
+gen_perf_dump_query_count(struct gen_perf_context *perf_ctx)
+{
+ DBG("Queries: (Open queries = %d, OA users = %d)\n",
+ perf_ctx->n_active_oa_queries, perf_ctx->n_oa_users);
+}
+
+void
+gen_perf_dump_query(struct gen_perf_context *ctx,
+ struct gen_perf_query_object *obj,
+ void *current_batch)
+{
+ switch (obj->queryinfo->kind) {
+ case GEN_PERF_QUERY_TYPE_OA:
+ case GEN_PERF_QUERY_TYPE_RAW:
+ DBG("BO: %-4s OA data: %-10s %-15s\n",
+ obj->oa.bo ? "yes," : "no,",
+ gen_perf_is_query_ready(ctx, obj, current_batch) ? "ready," : "not ready,",
+ obj->oa.results_accumulated ? "accumulated" : "not accumulated");
+ break;
+ case GEN_PERF_QUERY_TYPE_PIPELINE:
+ DBG("BO: %-4s\n",
+ obj->pipeline_stats.bo ? "yes" : "no");
+ break;
+ default:
+ unreachable("Unknown query type");
+ }
+}