#define GEN_PERF_H
#include <stdio.h>
+#include <stdbool.h>
#include <stdint.h>
#include <string.h>
+#if defined(MAJOR_IN_SYSMACROS)
#include <sys/sysmacros.h>
+#elif defined(MAJOR_IN_MKDEV)
+#include <sys/mkdev.h>
+#endif
#include "util/hash_table.h"
#include "compiler/glsl/list.h"
#include "util/ralloc.h"
+#include "drm-uapi/i915_drm.h"
+
struct gen_device_info;
struct gen_perf_config;
GEN_PERF_COUNTER_DATA_TYPE_DOUBLE,
};
+enum gen_perf_counter_units {
+ /* size */
+ GEN_PERF_COUNTER_UNITS_BYTES,
+
+ /* frequency */
+ GEN_PERF_COUNTER_UNITS_HZ,
+
+ /* time */
+ GEN_PERF_COUNTER_UNITS_NS,
+ GEN_PERF_COUNTER_UNITS_US,
+
+ /**/
+ GEN_PERF_COUNTER_UNITS_PIXELS,
+ GEN_PERF_COUNTER_UNITS_TEXELS,
+ GEN_PERF_COUNTER_UNITS_THREADS,
+ GEN_PERF_COUNTER_UNITS_PERCENT,
+
+ /* events */
+ GEN_PERF_COUNTER_UNITS_MESSAGES,
+ GEN_PERF_COUNTER_UNITS_NUMBER,
+ GEN_PERF_COUNTER_UNITS_CYCLES,
+ GEN_PERF_COUNTER_UNITS_EVENTS,
+ GEN_PERF_COUNTER_UNITS_UTILIZATION,
+
+ /**/
+ GEN_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES,
+ GEN_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES,
+ GEN_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES,
+ GEN_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE,
+
+ GEN_PERF_COUNTER_UNITS_MAX
+};
+
struct gen_pipeline_stat {
uint32_t reg;
uint32_t numerator;
*/
#define MAX_OA_REPORT_COUNTERS 62
-#define IA_VERTICES_COUNT 0x2310
-#define IA_PRIMITIVES_COUNT 0x2318
-#define VS_INVOCATION_COUNT 0x2320
-#define HS_INVOCATION_COUNT 0x2300
-#define DS_INVOCATION_COUNT 0x2308
-#define GS_INVOCATION_COUNT 0x2328
-#define GS_PRIMITIVES_COUNT 0x2330
-#define CL_INVOCATION_COUNT 0x2338
-#define CL_PRIMITIVES_COUNT 0x2340
-#define PS_INVOCATION_COUNT 0x2348
-#define CS_INVOCATION_COUNT 0x2290
-#define PS_DEPTH_COUNT 0x2350
-
/*
 * We currently allocate only one page for pipeline statistics queries. Here
 * we derive the maximum number of counters for that amount.
* query.
*/
uint64_t unslice_frequency[2];
+
+ /**
+ * Timestamp of the beginning of the query.
+ */
+ uint64_t begin_timestamp;
+
+ /**
+ * Whether the query was interrupted by another workload (aka preemption).
+ */
+ bool query_disjoint;
};
struct gen_perf_query_counter {
const char *name;
const char *desc;
+ const char *symbol_name;
+ const char *category;
enum gen_perf_counter_type type;
enum gen_perf_counter_data_type data_type;
+ enum gen_perf_counter_units units;
uint64_t raw_max;
size_t offset;
uint32_t val;
};
+/* Register programming for a given query */
+struct gen_perf_registers {
+ const struct gen_perf_query_register_prog *flex_regs;
+ uint32_t n_flex_regs;
+
+ const struct gen_perf_query_register_prog *mux_regs;
+ uint32_t n_mux_regs;
+
+ const struct gen_perf_query_register_prog *b_counter_regs;
+ uint32_t n_b_counter_regs;
+};
+
struct gen_perf_query_info {
enum gen_perf_query_type {
GEN_PERF_QUERY_TYPE_OA,
GEN_PERF_QUERY_TYPE_PIPELINE,
} kind;
const char *name;
+ const char *symbol_name;
const char *guid;
struct gen_perf_query_counter *counters;
int n_counters;
int b_offset;
int c_offset;
- /* Register programming for a given query */
- struct gen_perf_query_register_prog *flex_regs;
- uint32_t n_flex_regs;
+ struct gen_perf_registers config;
+};
- struct gen_perf_query_register_prog *mux_regs;
- uint32_t n_mux_regs;
+struct gen_perf_query_counter_info {
+ struct gen_perf_query_counter *counter;
- struct gen_perf_query_register_prog *b_counter_regs;
- uint32_t n_b_counter_regs;
+ uint64_t query_mask;
+
+ /**
+ * Each counter can be a part of many groups, each time at a different index.
+ * This struct stores one of those locations.
+ */
+ struct {
+ int group_idx; /* query/group number */
+ int counter_idx; /* index inside of query/group */
+ } location;
};
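+
+/* Illustrative only: assuming query_mask is a bitmask of the queries (by
+ * index into gen_perf_config::queries, declared below) that expose this
+ * counter, the stored location lets a consumer map a counter_info back to
+ * the counter inside one of its groups:
+ *
+ *    const struct gen_perf_query_counter_info *info = &perf->counter_infos[i];
+ *    const struct gen_perf_query_info *group =
+ *       &perf->queries[info->location.group_idx];
+ *    assert(info->counter == &group->counters[info->location.counter_idx]);
+ */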
struct gen_perf_config {
+ /* Whether i915 has DRM_I915_QUERY_PERF_CONFIG support. */
+ bool i915_query_supported;
+
+ /* Version of the i915-perf subsystem, refer to i915_drm.h. */
+ int i915_perf_version;
+
+ /* Powergating configuration for running the query. */
+ struct drm_i915_gem_context_param_sseu sseu;
+
struct gen_perf_query_info *queries;
int n_queries;
+ struct gen_perf_query_counter_info *counter_infos;
+ int n_counters;
+
 /* Variables referenced in the XML metadata for OA performance
 * counters, e.g. in the normalization equations.
*
*/
struct hash_table *oa_metrics_table;
+ /* When MDAPI hasn't configured the metric we need to use by the time the
+ * query begins, this OA metric is used as a fallback.
+ */
+ uint64_t fallback_raw_oa_metric;
+
+ /* Whether we have support for this platform. If true && n_queries == 0,
+ * this means we will not be able to use i915-perf because it is in
+ * paranoid mode.
+ */
+ bool platform_supported;
+
/* Location of the device's sysfs entry. */
char sysfs_dev_dir[256];
void (*bo_unreference)(void *bo);
void *(*bo_map)(void *ctx, void *bo, unsigned flags);
void (*bo_unmap)(void *bo);
- void (*emit_mi_flush)(void *ctx);
+ bool (*batch_references)(void *batch, void *bo);
+ void (*bo_wait_rendering)(void *bo);
+ int (*bo_busy)(void *bo);
+ void (*emit_stall_at_pixel_scoreboard)(void *ctx);
void (*emit_mi_report_perf_count)(void *ctx,
void *bo,
uint32_t offset_in_bytes,
uint32_t report_id);
void (*batchbuffer_flush)(void *ctx,
const char *file, int line);
- void (*capture_frequency_stat_register)(void *ctx, void *bo,
- uint32_t bo_offset);
- void (*store_register_mem64)(void *ctx, void *bo, uint32_t reg, uint32_t offset);
+ void (*store_register_mem)(void *ctx, void *bo, uint32_t reg, uint32_t reg_size, uint32_t offset);
} vtbl;
};
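+
+/* Illustrative only: before any query machinery is used, a driver is expected
+ * to point the vtbl callbacks at its own buffer-object and command-stream
+ * helpers. Given a struct gen_perf_config *perf (e.g. from gen_perf_new()),
+ * and with the my_* names as placeholders that are not part of this header:
+ *
+ *    perf->vtbl.bo_map = my_bo_map;
+ *    perf->vtbl.bo_unmap = my_bo_unmap;
+ *    perf->vtbl.emit_mi_report_perf_count = my_emit_mi_report_perf_count;
+ *    perf->vtbl.store_register_mem = my_store_register_mem;
+ */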
-/**
- * Periodic OA samples are read() into these buffer structures via the
- * i915 perf kernel interface and appended to the
- * brw->perfquery.sample_buffers linked list. When we process the
- * results of an OA metrics query we need to consider all the periodic
- * samples between the Begin and End MI_REPORT_PERF_COUNT command
- * markers.
- *
- * 'Periodic' is a simplification as there are other automatic reports
- * written by the hardware also buffered here.
- *
- * Considering three queries, A, B and C:
- *
- * Time ---->
- * ________________A_________________
- * | |
- * | ________B_________ _____C___________
- * | | | | | |
- *
- * And an illustration of sample buffers read over this time frame:
- * [HEAD ][ ][ ][ ][ ][ ][ ][ ][TAIL ]
- *
- * These nodes may hold samples for query A:
- * [ ][ ][ A ][ A ][ A ][ A ][ A ][ ][ ]
- *
- * These nodes may hold samples for query B:
- * [ ][ ][ B ][ B ][ B ][ ][ ][ ][ ]
- *
- * These nodes may hold samples for query C:
- * [ ][ ][ ][ ][ ][ C ][ C ][ C ][ ]
- *
- * The illustration assumes we have an even distribution of periodic
- * samples so all nodes have the same size plotted against time:
- *
- * Note, to simplify code, the list is never empty.
- *
- * With overlapping queries we can see that periodic OA reports may
- * relate to multiple queries and care needs to be take to keep
- * track of sample buffers until there are no queries that might
- * depend on their contents.
- *
- * We use a node ref counting system where a reference ensures that a
- * node and all following nodes can't be freed/recycled until the
- * reference drops to zero.
- *
- * E.g. with a ref of one here:
- * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
- *
- * These nodes could be freed or recycled ("reaped"):
- * [ 0 ][ 0 ]
- *
- * These must be preserved until the leading ref drops to zero:
- * [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
- *
- * When a query starts we take a reference on the current tail of
- * the list, knowing that no already-buffered samples can possibly
- * relate to the newly-started query. A pointer to this node is
- * also saved in the query object's ->oa.samples_head.
- *
- * E.g. starting query A while there are two nodes in .sample_buffers:
- * ________________A________
- * |
- *
- * [ 0 ][ 1 ]
- * ^_______ Add a reference and store pointer to node in
- * A->oa.samples_head
- *
- * Moving forward to when the B query starts with no new buffer nodes:
- * (for reference, i915 perf reads() are only done when queries finish)
- * ________________A_______
- * | ________B___
- * | |
- *
- * [ 0 ][ 2 ]
- * ^_______ Add a reference and store pointer to
- * node in B->oa.samples_head
- *
- * Once a query is finished, after an OA query has become 'Ready',
- * once the End OA report has landed and after we we have processed
- * all the intermediate periodic samples then we drop the
- * ->oa.samples_head reference we took at the start.
- *
- * So when the B query has finished we have:
- * ________________A________
- * | ______B___________
- * | | |
- * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ]
- * ^_______ Drop B->oa.samples_head reference
- *
- * We still can't free these due to the A->oa.samples_head ref:
- * [ 1 ][ 0 ][ 0 ][ 0 ]
- *
- * When the A query finishes: (note there's a new ref for C's samples_head)
- * ________________A_________________
- * | |
- * | _____C_________
- * | | |
- * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ]
- * ^_______ Drop A->oa.samples_head reference
- *
- * And we can now reap these nodes up to the C->oa.samples_head:
- * [ X ][ X ][ X ][ X ]
- * keeping -> [ 1 ][ 0 ][ 0 ]
- *
- * We reap old sample buffers each time we finish processing an OA
- * query by iterating the sample_buffers list from the head until we
- * find a referenced node and stop.
- *
- * Reaped buffers move to a perfquery.free_sample_buffers list and
- * when we come to read() we first look to recycle a buffer from the
- * free_sample_buffers list before allocating a new buffer.
- */
-struct oa_sample_buf {
- struct exec_node link;
- int refcount;
- int len;
- uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
- uint32_t last_timestamp;
-};
-
-/**
- * gen representation of a performance query object.
- *
- * NB: We want to keep this structure relatively lean considering that
- * applications may expect to allocate enough objects to be able to
- * query around all draw calls in a frame.
- */
-struct gen_perf_query_object
-{
- const struct gen_perf_query_info *queryinfo;
-
- /* See query->kind to know which state below is in use... */
- union {
- struct {
-
- /**
- * BO containing OA counter snapshots at query Begin/End time.
- */
- void *bo;
-
- /**
- * Address of mapped of @bo
- */
- void *map;
-
- /**
- * The MI_REPORT_PERF_COUNT command lets us specify a unique
- * ID that will be reflected in the resulting OA report
- * that's written by the GPU. This is the ID we're expecting
- * in the begin report and the the end report should be
- * @begin_report_id + 1.
- */
- int begin_report_id;
-
- /**
- * Reference the head of the brw->perfquery.sample_buffers
- * list at the time that the query started (so we only need
- * to look at nodes after this point when looking for samples
- * related to this query)
- *
- * (See struct brw_oa_sample_buf description for more details)
- */
- struct exec_node *samples_head;
-
- /**
- * false while in the unaccumulated_elements list, and set to
- * true when the final, end MI_RPC snapshot has been
- * accumulated.
- */
- bool results_accumulated;
-
- /**
- * Frequency of the GT at begin and end of the query.
- */
- uint64_t gt_frequency[2];
-
- /**
- * Accumulated OA results between begin and end of the query.
- */
- struct gen_perf_query_result result;
- } oa;
-
- struct {
- /**
- * BO containing starting and ending snapshots for the
- * statistics counters.
- */
- void *bo;
- } pipeline_stats;
- };
+struct gen_perf_counter_pass {
+ struct gen_perf_query_info *query;
+ struct gen_perf_query_counter *counter;
+ uint32_t pass;
};
-struct gen_perf_context {
- struct gen_perf_config *perf;
-
- void * ctx; /* driver context (eg, brw_context) */
- void * bufmgr;
- const struct gen_device_info *devinfo;
-
- uint32_t hw_ctx;
- int drm_fd;
-
- /* The i915 perf stream we open to setup + enable the OA counters */
- int oa_stream_fd;
-
- /* An i915 perf stream fd gives exclusive access to the OA unit that will
- * report counter snapshots for a specific counter set/profile in a
- * specific layout/format so we can only start OA queries that are
- * compatible with the currently open fd...
- */
- int current_oa_metrics_set_id;
- int current_oa_format;
-
- /* List of buffers containing OA reports */
- struct exec_list sample_buffers;
-
- /* Cached list of empty sample buffers */
- struct exec_list free_sample_buffers;
-
- int n_active_oa_queries;
- int n_active_pipeline_stats_queries;
+void gen_perf_init_metrics(struct gen_perf_config *perf_cfg,
+ const struct gen_device_info *devinfo,
+ int drm_fd,
+ bool include_pipeline_statistics);
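+
+/* A hedged usage sketch: the config is typically created with gen_perf_new()
+ * (declared further down) and the metric sets are then loaded once device
+ * info and a DRM fd are available; mem_ctx, devinfo and drm_fd are
+ * placeholders for driver-owned state:
+ *
+ *    struct gen_perf_config *perf = gen_perf_new(mem_ctx);
+ *    gen_perf_init_metrics(perf, devinfo, drm_fd,
+ *                          true);   // include_pipeline_statistics
+ */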
- /* The number of queries depending on running OA counters which
- * extends beyond brw_end_perf_query() since we need to wait until
- * the last MI_RPC command has parsed by the GPU.
- *
- * Accurate accounting is important here as emitting an
- * MI_REPORT_PERF_COUNT command while the OA unit is disabled will
- * effectively hang the gpu.
- */
- int n_oa_users;
+/** Query i915 for a metric id using a guid.
+ */
+bool gen_perf_load_metric_id(struct gen_perf_config *perf_cfg,
+ const char *guid,
+ uint64_t *metric_id);
- /* To help catch an spurious problem with the hardware or perf
- * forwarding samples, we emit each MI_REPORT_PERF_COUNT command
- * with a unique ID that we can explicitly check for...
- */
- int next_query_start_report_id;
+/** Load a configuration's content from i915 using a guid.
+ */
+struct gen_perf_registers *gen_perf_load_configuration(struct gen_perf_config *perf_cfg,
+ int fd, const char *guid);
- /**
- * An array of queries whose results haven't yet been assembled
- * based on the data in buffer objects.
- *
- * These may be active, or have already ended. However, the
- * results have not been requested.
- */
- struct gen_perf_query_object **unaccumulated;
- int unaccumulated_elements;
- int unaccumulated_array_size;
-
- /* The total number of query objects so we can relinquish
- * our exclusive access to perf if the application deletes
- * all of its objects. (NB: We only disable perf while
- * there are no active queries)
- */
- int n_query_instances;
-};
+/** Store a configuration into i915 using a guid and return a new metric id.
+ *
+ * If guid is NULL, then a generated one will be provided by hashing the
+ * content of the configuration.
+ */
+uint64_t gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd,
+ const struct gen_perf_registers *config,
+ const char *guid);
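+
+/* A minimal sketch (variable names are placeholders) of how the two entry
+ * points above can be combined: read a configuration back from i915 by guid,
+ * then re-register it to obtain the metric id used to open the perf stream:
+ *
+ *    struct gen_perf_registers *regs =
+ *       gen_perf_load_configuration(perf, drm_fd, guid);
+ *    if (regs) {
+ *       uint64_t metric_id =
+ *          gen_perf_store_configuration(perf, drm_fd, regs, guid);
+ *    }
+ */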
-void gen_perf_init_context(struct gen_perf_context *perf_ctx,
- struct gen_perf_config *perf_cfg,
- void * ctx, /* driver context (eg, brw_context) */
- void * bufmgr, /* eg brw_bufmgr */
- const struct gen_device_info *devinfo,
- uint32_t hw_ctx,
- int drm_fd);
+/** Read the slice/unslice frequencies from 2 OA reports and store them into
+ * result.
+ */
+void gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
+ const struct gen_device_info *devinfo,
+ const uint32_t *start,
+ const uint32_t *end);
+/** Accumulate the delta between 2 OA reports into result for a given query.
+ */
+void gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
+ const struct gen_perf_query_info *query,
+ const uint32_t *start,
+ const uint32_t *end);
+void gen_perf_query_result_clear(struct gen_perf_query_result *result);
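+
+/* A hedged sketch of the expected sequence when resolving an OA query;
+ * begin_report/end_report stand for the two MI_REPORT_PERF_COUNT snapshots
+ * mapped from the query's BO (illustrative names, not part of this header):
+ *
+ *    struct gen_perf_query_result result;
+ *    gen_perf_query_result_clear(&result);
+ *    gen_perf_query_result_read_frequencies(&result, devinfo,
+ *                                           begin_report, end_report);
+ *    gen_perf_query_result_accumulate(&result, queryinfo,
+ *                                     begin_report, end_report);
+ */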
static inline size_t
gen_perf_query_counter_get_size(const struct gen_perf_query_counter *counter)
}
}
-static inline struct gen_perf_query_info *
-gen_perf_query_append_query_info(struct gen_perf_config *perf, int max_counters)
-{
- struct gen_perf_query_info *query;
-
- perf->queries = reralloc(perf, perf->queries,
- struct gen_perf_query_info,
- ++perf->n_queries);
- query = &perf->queries[perf->n_queries - 1];
- memset(query, 0, sizeof(*query));
-
- if (max_counters > 0) {
- query->max_counters = max_counters;
- query->counters =
- rzalloc_array(perf, struct gen_perf_query_counter, max_counters);
- }
-
- return query;
-}
-
-static inline void
-gen_perf_query_info_add_stat_reg(struct gen_perf_query_info *query,
- uint32_t reg,
- uint32_t numerator,
- uint32_t denominator,
- const char *name,
- const char *description)
-{
- struct gen_perf_query_counter *counter;
-
- assert(query->n_counters < query->max_counters);
-
- counter = &query->counters[query->n_counters];
- counter->name = name;
- counter->desc = description;
- counter->type = GEN_PERF_COUNTER_TYPE_RAW;
- counter->data_type = GEN_PERF_COUNTER_DATA_TYPE_UINT64;
- counter->offset = sizeof(uint64_t) * query->n_counters;
- counter->pipeline_stat.reg = reg;
- counter->pipeline_stat.numerator = numerator;
- counter->pipeline_stat.denominator = denominator;
-
- query->n_counters++;
-}
-
-static inline void
-gen_perf_query_info_add_basic_stat_reg(struct gen_perf_query_info *query,
- uint32_t reg, const char *name)
-{
- gen_perf_query_info_add_stat_reg(query, reg, 1, 1, name, name);
-}
-
static inline struct gen_perf_config *
gen_perf_new(void *ctx)
{
return perf;
}
-bool gen_perf_load_oa_metrics(struct gen_perf_config *perf, int fd,
- const struct gen_device_info *devinfo);
-bool gen_perf_load_metric_id(struct gen_perf_config *perf, const char *guid,
- uint64_t *metric_id);
-
-void gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
- const struct gen_device_info *devinfo,
- const uint32_t *start,
- const uint32_t *end);
-void gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
- const struct gen_perf_query_info *query,
- const uint32_t *start,
- const uint32_t *end);
-void gen_perf_query_result_clear(struct gen_perf_query_result *result);
-void gen_perf_query_register_mdapi_statistic_query(const struct gen_device_info *devinfo,
- struct gen_perf_config *perf);
-void gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo,
- struct gen_perf_config *perf);
-uint64_t gen_perf_query_get_metric_id(struct gen_perf_config *perf,
- const struct gen_perf_query_info *query);
-struct oa_sample_buf * gen_perf_get_free_sample_buf(struct gen_perf_context *perf);
-void gen_perf_reap_old_sample_buffers(struct gen_perf_context *perf_ctx);
-void gen_perf_free_sample_bufs(struct gen_perf_context *perf_ctx);
-
-void gen_perf_snapshot_statistics_registers(void *context,
- struct gen_perf_config *perf,
- struct gen_perf_query_object *obj,
- uint32_t offset_in_bytes);
-
-void gen_perf_close(struct gen_perf_context *perfquery,
- const struct gen_perf_query_info *query);
-bool gen_perf_open(struct gen_perf_context *perfquery,
- int metrics_set_id,
- int report_format,
- int period_exponent,
- int drm_fd,
- uint32_t ctx_id);
-
-bool gen_perf_inc_n_users(struct gen_perf_context *perfquery);
-void gen_perf_dec_n_users(struct gen_perf_context *perfquery);
-
-bool gen_perf_begin_query(struct gen_perf_context *perf_ctx,
- struct gen_perf_query_object *query);
-void gen_perf_end_query(struct gen_perf_context *perf_ctx,
- struct gen_perf_query_object *query);
+uint32_t gen_perf_get_n_passes(struct gen_perf_config *perf,
+ const uint32_t *counter_indices,
+ uint32_t counter_indices_count,
+ struct gen_perf_query_info **pass_queries);
+void gen_perf_get_counters_passes(struct gen_perf_config *perf,
+ const uint32_t *counter_indices,
+ uint32_t counter_indices_count,
+ struct gen_perf_counter_pass *counter_pass);
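+
+/* A hedged sketch of combining the two pass helpers above, assuming
+ * counter_indices[] holds indices into gen_perf_config::counter_infos and
+ * that pass_queries may be NULL when only the pass count is wanted (names
+ * are placeholders):
+ *
+ *    uint32_t n_passes =
+ *       gen_perf_get_n_passes(perf, counter_indices, n_indices, NULL);
+ *    struct gen_perf_query_info **pass_queries =
+ *       calloc(n_passes, sizeof(*pass_queries));
+ *    gen_perf_get_n_passes(perf, counter_indices, n_indices, pass_queries);
+ *
+ *    struct gen_perf_counter_pass *counter_pass =
+ *       calloc(n_indices, sizeof(*counter_pass));
+ *    gen_perf_get_counters_passes(perf, counter_indices, n_indices,
+ *                                 counter_pass);
+ */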
#endif /* GEN_PERF_H */