diff --git a/src/intel/perf/gen_perf.c b/src/intel/perf/gen_perf.c
index 5fb6044a859..7e3847d073d 100644
--- a/src/intel/perf/gen_perf.c
+++ b/src/intel/perf/gen_perf.c
@@ -33,12 +33,14 @@
 
 #include "common/gen_gem.h"
 #include "gen_perf.h"
+#include "gen_perf_regs.h"
 #include "perf/gen_perf_mdapi.h"
 #include "perf/gen_perf_metrics.h"
 
 #include "dev/gen_debug.h"
 #include "dev/gen_device_info.h"
 #include "util/bitscan.h"
+#include "util/mesa-sha1.h"
 #include "util/u_math.h"
 
 #define FILE_DEBUG_FLAG DEBUG_PERFMON
@@ -61,9 +63,338 @@
 #define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT  0
 #define GEN9_RPSTAT0_PREV_GT_FREQ_MASK   INTEL_MASK(8, 0)
 
+#define GEN6_SO_PRIM_STORAGE_NEEDED        0x2280
+#define GEN7_SO_PRIM_STORAGE_NEEDED(n)     (0x5240 + (n) * 8)
+#define GEN6_SO_NUM_PRIMS_WRITTEN          0x2288
+#define GEN7_SO_NUM_PRIMS_WRITTEN(n)       (0x5200 + (n) * 8)
+
 #define MAP_READ  (1 << 0)
 #define MAP_WRITE (1 << 1)
 
+#define OA_REPORT_INVALID_CTX_ID (0xffffffff)
+
+/**
+ * Periodic OA samples are read() into these buffer structures via the
+ * i915 perf kernel interface and appended to the
+ * perf_ctx->sample_buffers linked list. When we process the
+ * results of an OA metrics query we need to consider all the periodic
+ * samples between the Begin and End MI_REPORT_PERF_COUNT command
+ * markers.
+ *
+ * 'Periodic' is a simplification as there are other automatic reports
+ * written by the hardware that are also buffered here.
+ *
+ * Considering three queries, A, B and C:
+ *
+ *  Time ---->
+ *                ________________A_________________
+ *               |                                  |
+ *               | ________B_________ _____C___________
+ *               | |                | |           |   |
+ *
+ * And an illustration of sample buffers read over this time frame:
+ * [HEAD ][     ][     ][     ][     ][     ][     ][     ][TAIL ]
+ *
+ * These nodes may hold samples for query A:
+ * [     ][     ][  A  ][  A  ][  A  ][  A  ][  A  ][     ][     ]
+ *
+ * These nodes may hold samples for query B:
+ * [     ][     ][  B  ][  B  ][  B  ][     ][     ][     ][     ]
+ *
+ * These nodes may hold samples for query C:
+ * [     ][     ][     ][     ][     ][  C  ][  C  ][  C  ][     ]
+ *
+ * The illustration assumes we have an even distribution of periodic
+ * samples so all nodes have the same size plotted against time:
+ *
+ * Note, to simplify code, the list is never empty.
+ *
+ * With overlapping queries we can see that periodic OA reports may
+ * relate to multiple queries and care needs to be taken to keep
+ * track of sample buffers until there are no queries that might
+ * depend on their contents.
+ *
+ * We use a node ref counting system where a reference ensures that a
+ * node and all following nodes can't be freed/recycled until the
+ * reference drops to zero.
+ *
+ * E.g. with a ref of one here:
+ * [  0 ][  0 ][  1 ][  0 ][  0 ][  0 ][  0 ][  0 ][  0 ]
+ *
+ * These nodes could be freed or recycled ("reaped"):
+ * [  0 ][  0 ]
+ *
+ * These must be preserved until the leading ref drops to zero:
+ *               [  1 ][  0 ][  0 ][  0 ][  0 ][  0 ][  0 ]
+ *
+ * When a query starts we take a reference on the current tail of
+ * the list, knowing that no already-buffered samples can possibly
+ * relate to the newly-started query. A pointer to this node is
+ * also saved in the query object's ->oa.samples_head.
+ *
+ * E.g. 
starting query A while there are two nodes in .sample_buffers:
+ *                ________________A________
+ *               |
+ *
+ * [  0 ][  1 ]
+ *           ^_______ Add a reference and store pointer to node in
+ *                      A->oa.samples_head
+ *
+ * Moving forward to when the B query starts with no new buffer nodes:
+ * (for reference, i915 perf reads() are only done when queries finish)
+ *                ________________A_______
+ *               | ________B___
+ *               | |
+ *
+ * [  0 ][  2 ]
+ *           ^_______ Add a reference and store pointer to
+ *                      node in B->oa.samples_head
+ *
+ * Once a query is finished, after an OA query has become 'Ready',
+ * once the End OA report has landed and after we have processed
+ * all the intermediate periodic samples, then we drop the
+ * ->oa.samples_head reference we took at the start.
+ *
+ * So when the B query has finished we have:
+ *                ________________A________
+ *               |       ______B___________
+ *               |      |                  |
+ * [  0 ][  1 ][  0 ][  0 ][  0 ]
+ *           ^_______ Drop B->oa.samples_head reference
+ *
+ * We still can't free these due to the A->oa.samples_head ref:
+ *        [  1 ][  0 ][  0 ][  0 ]
+ *
+ * When the A query finishes: (note there's a new ref for C's samples_head)
+ *                ________________A_________________
+ *               |                                  |
+ *               |                    _____C_________
+ *               |                   |               |
+ * [  0 ][  0 ][  0 ][  0 ][  1 ][  0 ][  0 ]
+ *  ^_______ Drop A->oa.samples_head reference
+ *
+ * And we can now reap these nodes up to the C->oa.samples_head:
+ * [  X ][  X ][  X ][  X ]
+ *                  keeping -> [  1 ][  0 ][  0 ]
+ *
+ * We reap old sample buffers each time we finish processing an OA
+ * query by iterating the sample_buffers list from the head until we
+ * find a referenced node and stop.
+ *
+ * Reaped buffers move to a perfquery.free_sample_buffers list and
+ * when we come to read() we first look to recycle a buffer from the
+ * free_sample_buffers list before allocating a new buffer.
+ */
+struct oa_sample_buf {
+   struct exec_node link;
+   int refcount;
+   int len;
+   uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
+   uint32_t last_timestamp;
+};
+
+/**
+ * gen representation of a performance query object.
+ *
+ * NB: We want to keep this structure relatively lean considering that
+ * applications may expect to allocate enough objects to be able to
+ * query around all draw calls in a frame.
+ */
+struct gen_perf_query_object
+{
+   const struct gen_perf_query_info *queryinfo;
+
+   /* See query->kind to know which state below is in use... */
+   union {
+      struct {
+
+         /**
+          * BO containing OA counter snapshots at query Begin/End time.
+          */
+         void *bo;
+
+         /**
+          * Address of the mapping of @bo.
+          */
+         void *map;
+
+         /**
+          * The MI_REPORT_PERF_COUNT command lets us specify a unique
+          * ID that will be reflected in the resulting OA report
+          * that's written by the GPU. This is the ID we're expecting
+          * in the begin report, and the end report should be
+          * @begin_report_id + 1.
+          */
+         int begin_report_id;
+
+         /**
+          * Reference the head of the brw->perfquery.sample_buffers
+          * list at the time that the query started (so we only need
+          * to look at nodes after this point when looking for samples
+          * related to this query).
+          *
+          * (See struct oa_sample_buf description for more details)
+          */
+         struct exec_node *samples_head;
+
+         /**
+          * false while in the unaccumulated_elements list, and set to
+          * true when the final, end MI_RPC snapshot has been
+          * accumulated.
+          */
+         bool results_accumulated;
+
+         /**
+          * Frequency of the GT at begin and end of the query.
+          */
+         uint64_t gt_frequency[2];
+
+         /**
+          * Accumulated OA results between begin and end of the query.
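+          *
+          * (Built up incrementally: each gen_perf_query_result_accumulate()
+          * call adds the deltas between two consecutive OA reports into
+          * this struct, so once the end report has been processed it
+          * holds the totals for the entire query window.)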
+          */
+         struct gen_perf_query_result result;
+      } oa;
+
+      struct {
+         /**
+          * BO containing starting and ending snapshots for the
+          * statistics counters.
+          */
+         void *bo;
+      } pipeline_stats;
+   };
+};
+
+struct gen_perf_context {
+   struct gen_perf_config *perf;
+
+   void * ctx;  /* driver context (e.g., brw_context) */
+   void * bufmgr;
+   const struct gen_device_info *devinfo;
+
+   uint32_t hw_ctx;
+   int drm_fd;
+
+   /* The i915 perf stream we open to setup + enable the OA counters */
+   int oa_stream_fd;
+
+   /* An i915 perf stream fd gives exclusive access to the OA unit that will
+    * report counter snapshots for a specific counter set/profile in a
+    * specific layout/format, so we can only start OA queries that are
+    * compatible with the currently open fd...
+    */
+   int current_oa_metrics_set_id;
+   int current_oa_format;
+
+   /* List of buffers containing OA reports */
+   struct exec_list sample_buffers;
+
+   /* Cached list of empty sample buffers */
+   struct exec_list free_sample_buffers;
+
+   int n_active_oa_queries;
+   int n_active_pipeline_stats_queries;
+
+   /* The number of queries depending on running OA counters which
+    * extends beyond brw_end_perf_query() since we need to wait until
+    * the last MI_RPC command has been parsed by the GPU.
+    *
+    * Accurate accounting is important here as emitting an
+    * MI_REPORT_PERF_COUNT command while the OA unit is disabled will
+    * effectively hang the gpu.
+    */
+   int n_oa_users;
+
+   /* To help catch a spurious problem with the hardware or perf
+    * forwarding samples, we emit each MI_REPORT_PERF_COUNT command
+    * with a unique ID that we can explicitly check for...
+    */
+   int next_query_start_report_id;
+
+   /**
+    * An array of queries whose results haven't yet been assembled
+    * based on the data in buffer objects.
+    *
+    * These may be active, or have already ended; either way their
+    * results have not yet been requested.
+    */
+   struct gen_perf_query_object **unaccumulated;
+   int unaccumulated_elements;
+   int unaccumulated_array_size;
+
+   /* The total number of query objects so we can relinquish
+    * our exclusive access to perf if the application deletes
+    * all of its objects. (NB: We only disable perf while
+    * there are no active queries)
+    */
+   int n_query_instances;
+};
+
+const struct gen_perf_query_info*
+gen_perf_query_info(const struct gen_perf_query_object *query)
+{
+   return query->queryinfo;
+}
+
+struct gen_perf_context *
+gen_perf_new_context(void *parent)
+{
+   struct gen_perf_context *ctx = rzalloc(parent, struct gen_perf_context);
+   if (!
ctx) + fprintf(stderr, "%s: failed to alloc context\n", __func__); + return ctx; +} + +struct gen_perf_config * +gen_perf_config(struct gen_perf_context *ctx) +{ + return ctx->perf; +} + +struct gen_perf_query_object * +gen_perf_new_query(struct gen_perf_context *perf_ctx, unsigned query_index) +{ + const struct gen_perf_query_info *query = + &perf_ctx->perf->queries[query_index]; + struct gen_perf_query_object *obj = + calloc(1, sizeof(struct gen_perf_query_object)); + + if (!obj) + return NULL; + + obj->queryinfo = query; + + perf_ctx->n_query_instances++; + return obj; +} + +int +gen_perf_active_queries(struct gen_perf_context *perf_ctx, + const struct gen_perf_query_info *query) +{ + assert(perf_ctx->n_active_oa_queries == 0 || perf_ctx->n_active_pipeline_stats_queries == 0); + + switch (query->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + return perf_ctx->n_active_oa_queries; + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + return perf_ctx->n_active_pipeline_stats_queries; + break; + + default: + unreachable("Unknown query type"); + break; + } +} + +static inline uint64_t to_user_pointer(void *ptr) +{ + return (uintptr_t) ptr; +} + static bool get_sysfs_dev_dir(struct gen_perf_config *perf, int fd) { @@ -165,18 +496,37 @@ read_sysfs_drm_device_file_uint64(struct gen_perf_config *perf, return read_file_uint64(buf, value); } +static inline struct gen_perf_query_info * +append_query_info(struct gen_perf_config *perf, int max_counters) +{ + struct gen_perf_query_info *query; + + perf->queries = reralloc(perf, perf->queries, + struct gen_perf_query_info, + ++perf->n_queries); + query = &perf->queries[perf->n_queries - 1]; + memset(query, 0, sizeof(*query)); + + if (max_counters > 0) { + query->max_counters = max_counters; + query->counters = + rzalloc_array(perf, struct gen_perf_query_counter, max_counters); + } + + return query; +} + static void register_oa_config(struct gen_perf_config *perf, const struct gen_perf_query_info *query, uint64_t config_id) { - struct gen_perf_query_info *registred_query = - gen_perf_query_append_query_info(perf, 0); + struct gen_perf_query_info *registered_query = append_query_info(perf, 0); - *registred_query = *query; - registred_query->oa_metrics_set_id = config_id; - DBG("metric set registred: id = %" PRIu64", guid = %s\n", - registred_query->oa_metrics_set_id, query->guid); + *registered_query = *query; + registered_query->oa_metrics_set_id = config_id; + DBG("metric set registered: id = %" PRIu64", guid = %s\n", + registered_query->oa_metrics_set_id, query->guid); } static void @@ -212,15 +562,7 @@ enumerate_sysfs_metrics(struct gen_perf_config *perf) metric_entry->d_name); if (entry) { uint64_t id; - - len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id", - perf->sysfs_dev_dir, metric_entry->d_name); - if (len < 0 || len >= sizeof(buf)) { - DBG("Failed to concatenate path to sysfs metric id file\n"); - continue; - } - - if (!read_file_uint64(buf, &id)) { + if (!gen_perf_load_metric_id(perf, metric_entry->d_name, &id)) { DBG("Failed to read metric set id from %s: %m", buf); continue; } @@ -242,27 +584,98 @@ kernel_has_dynamic_config_support(struct gen_perf_config *perf, int fd) &invalid_config_id) < 0 && errno == ENOENT; } +static int +i915_query_items(struct gen_perf_config *perf, int fd, + struct drm_i915_query_item *items, uint32_t n_items) +{ + struct drm_i915_query q = { + .num_items = n_items, + .items_ptr = to_user_pointer(items), + }; + return gen_ioctl(fd, DRM_IOCTL_I915_QUERY, &q); +} + +static bool 
+i915_query_perf_config_supported(struct gen_perf_config *perf, int fd) +{ + struct drm_i915_query_item item = { + .query_id = DRM_I915_QUERY_PERF_CONFIG, + .flags = DRM_I915_QUERY_PERF_CONFIG_LIST, + }; + + return i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0; +} + +static bool +i915_query_perf_config_data(struct gen_perf_config *perf, + int fd, const char *guid, + struct drm_i915_perf_oa_config *config) +{ + struct { + struct drm_i915_query_perf_config query; + struct drm_i915_perf_oa_config config; + } item_data; + struct drm_i915_query_item item = { + .query_id = DRM_I915_QUERY_PERF_CONFIG, + .flags = DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID, + .data_ptr = to_user_pointer(&item_data), + .length = sizeof(item_data), + }; + + memset(&item_data, 0, sizeof(item_data)); + memcpy(item_data.query.uuid, guid, sizeof(item_data.query.uuid)); + memcpy(&item_data.config, config, sizeof(item_data.config)); + + if (!(i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0)) + return false; + + memcpy(config, &item_data.config, sizeof(item_data.config)); + + return true; +} + bool -gen_perf_load_metric_id(struct gen_perf_config *perf, const char *guid, +gen_perf_load_metric_id(struct gen_perf_config *perf_cfg, + const char *guid, uint64_t *metric_id) { char config_path[280]; snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id", - perf->sysfs_dev_dir, guid); + perf_cfg->sysfs_dev_dir, guid); /* Don't recreate already loaded configs. */ return read_file_uint64(config_path, metric_id); } +static uint64_t +i915_add_config(struct gen_perf_config *perf, int fd, + const struct gen_perf_registers *config, + const char *guid) +{ + struct drm_i915_perf_oa_config i915_config = { 0, }; + + memcpy(i915_config.uuid, guid, sizeof(i915_config.uuid)); + + i915_config.n_mux_regs = config->n_mux_regs; + i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs); + + i915_config.n_boolean_regs = config->n_b_counter_regs; + i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs); + + i915_config.n_flex_regs = config->n_flex_regs; + i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs); + + int ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &i915_config); + return ret > 0 ? 
ret : 0; +} + static void init_oa_configs(struct gen_perf_config *perf, int fd) { hash_table_foreach(perf->oa_metrics_table, entry) { const struct gen_perf_query_info *query = entry->data; - struct drm_i915_perf_oa_config config; uint64_t config_id; - int ret; if (gen_perf_load_metric_id(perf, query->guid, &config_id)) { DBG("metric set: %s (already loaded)\n", query->guid); @@ -270,20 +683,7 @@ init_oa_configs(struct gen_perf_config *perf, int fd) continue; } - memset(&config, 0, sizeof(config)); - - memcpy(config.uuid, query->guid, sizeof(config.uuid)); - - config.n_mux_regs = query->n_mux_regs; - config.mux_regs_ptr = (uintptr_t) query->mux_regs; - - config.n_boolean_regs = query->n_b_counter_regs; - config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs; - - config.n_flex_regs = query->n_flex_regs; - config.flex_regs_ptr = (uintptr_t) query->flex_regs; - - ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config); + int ret = i915_add_config(perf, fd, &query->config, query->guid); if (ret < 0) { DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n", query->name, query->guid, strerror(errno)); @@ -389,20 +789,142 @@ get_register_queries_function(const struct gen_device_info *devinfo) } if (devinfo->is_cannonlake) return gen_oa_register_queries_cnl; - if (devinfo->gen == 11) + if (devinfo->gen == 11) { + if (devinfo->is_elkhartlake) + return gen_oa_register_queries_lkf; return gen_oa_register_queries_icl; + } + if (devinfo->gen == 12) + return gen_oa_register_queries_tgl; return NULL; } -bool -gen_perf_load_oa_metrics(struct gen_perf_config *perf, int fd, +static inline void +add_stat_reg(struct gen_perf_query_info *query, uint32_t reg, + uint32_t numerator, uint32_t denominator, + const char *name, const char *description) +{ + struct gen_perf_query_counter *counter; + + assert(query->n_counters < query->max_counters); + + counter = &query->counters[query->n_counters]; + counter->name = name; + counter->desc = description; + counter->type = GEN_PERF_COUNTER_TYPE_RAW; + counter->data_type = GEN_PERF_COUNTER_DATA_TYPE_UINT64; + counter->offset = sizeof(uint64_t) * query->n_counters; + counter->pipeline_stat.reg = reg; + counter->pipeline_stat.numerator = numerator; + counter->pipeline_stat.denominator = denominator; + + query->n_counters++; +} + +static inline void +add_basic_stat_reg(struct gen_perf_query_info *query, + uint32_t reg, const char *name) +{ + add_stat_reg(query, reg, 1, 1, name, name); +} + +static void +load_pipeline_statistic_metrics(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo) +{ + struct gen_perf_query_info *query = + append_query_info(perf_cfg, MAX_STAT_COUNTERS); + + query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; + query->name = "Pipeline Statistics Registers"; + + add_basic_stat_reg(query, IA_VERTICES_COUNT, + "N vertices submitted"); + add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, + "N primitives submitted"); + add_basic_stat_reg(query, VS_INVOCATION_COUNT, + "N vertex shader invocations"); + + if (devinfo->gen == 6) { + add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1, + "SO_PRIM_STORAGE_NEEDED", + "N geometry shader stream-out primitives (total)"); + add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1, + "SO_NUM_PRIMS_WRITTEN", + "N geometry shader stream-out primitives (written)"); + } else { + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 0)", + "N stream-out (stream 0) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1, + 
"SO_PRIM_STORAGE_NEEDED (Stream 1)", + "N stream-out (stream 1) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 2)", + "N stream-out (stream 2) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 3)", + "N stream-out (stream 3) primitives (total)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 0)", + "N stream-out (stream 0) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 1)", + "N stream-out (stream 1) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 2)", + "N stream-out (stream 2) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 3)", + "N stream-out (stream 3) primitives (written)"); + } + + add_basic_stat_reg(query, HS_INVOCATION_COUNT, + "N TCS shader invocations"); + add_basic_stat_reg(query, DS_INVOCATION_COUNT, + "N TES shader invocations"); + + add_basic_stat_reg(query, GS_INVOCATION_COUNT, + "N geometry shader invocations"); + add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, + "N geometry shader primitives emitted"); + + add_basic_stat_reg(query, CL_INVOCATION_COUNT, + "N primitives entering clipping"); + add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, + "N primitives leaving clipping"); + + if (devinfo->is_haswell || devinfo->gen == 8) { + add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, + "N fragment shader invocations", + "N fragment shader invocations"); + } else { + add_basic_stat_reg(query, PS_INVOCATION_COUNT, + "N fragment shader invocations"); + } + + add_basic_stat_reg(query, PS_DEPTH_COUNT, + "N z-pass fragments"); + + if (devinfo->gen >= 7) { + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "N compute shader invocations"); + } + + query->data_size = sizeof(uint64_t) * query->n_counters; +} + +static bool +load_oa_metrics(struct gen_perf_config *perf, int fd, const struct gen_device_info *devinfo) { perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo); bool i915_perf_oa_available = false; struct stat sb; + perf->i915_query_supported = i915_query_perf_config_supported(perf, fd); + /* The existence of this sysctl parameter implies the kernel supports * the i915 perf interface. */ @@ -447,6 +969,87 @@ gen_perf_load_oa_metrics(struct gen_perf_config *perf, int fd, return true; } +struct gen_perf_registers * +gen_perf_load_configuration(struct gen_perf_config *perf_cfg, int fd, const char *guid) +{ + if (!perf_cfg->i915_query_supported) + return NULL; + + struct drm_i915_perf_oa_config i915_config = { 0, }; + if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) + return NULL; + + struct gen_perf_registers *config = rzalloc(NULL, struct gen_perf_registers); + config->n_flex_regs = i915_config.n_flex_regs; + config->flex_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_flex_regs); + config->n_mux_regs = i915_config.n_mux_regs; + config->mux_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_mux_regs); + config->n_b_counter_regs = i915_config.n_boolean_regs; + config->b_counter_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_b_counter_regs); + + /* + * struct gen_perf_query_register_prog maps exactly to the tuple of + * (register offset, register value) returned by the i915. 
+ */ + i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs); + i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs); + i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs); + if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) { + ralloc_free(config); + return NULL; + } + + return config; +} + +uint64_t +gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd, + const struct gen_perf_registers *config, + const char *guid) +{ + if (guid) + return i915_add_config(perf_cfg, fd, config, guid); + + struct mesa_sha1 sha1_ctx; + _mesa_sha1_init(&sha1_ctx); + + if (config->flex_regs) { + _mesa_sha1_update(&sha1_ctx, config->flex_regs, + sizeof(config->flex_regs[0]) * + config->n_flex_regs); + } + if (config->mux_regs) { + _mesa_sha1_update(&sha1_ctx, config->mux_regs, + sizeof(config->mux_regs[0]) * + config->n_mux_regs); + } + if (config->b_counter_regs) { + _mesa_sha1_update(&sha1_ctx, config->b_counter_regs, + sizeof(config->b_counter_regs[0]) * + config->n_b_counter_regs); + } + + uint8_t hash[20]; + _mesa_sha1_final(&sha1_ctx, hash); + + char formatted_hash[41]; + _mesa_sha1_format(formatted_hash, hash); + + char generated_guid[37]; + snprintf(generated_guid, sizeof(generated_guid), + "%.8s-%.4s-%.4s-%.4s-%.12s", + &formatted_hash[0], &formatted_hash[8], + &formatted_hash[8 + 4], &formatted_hash[8 + 4 + 4], + &formatted_hash[8 + 4 + 4 + 4]); + + /* Check if already present. */ + uint64_t id; + if (gen_perf_load_metric_id(perf_cfg, generated_guid, &id)) + return id; + + return i915_add_config(perf_cfg, fd, config, generated_guid); +} + /* Accumulate 32bits OA counters */ static inline void accumulate_uint32(const uint32_t *report0, @@ -541,7 +1144,9 @@ gen_perf_query_result_accumulate(struct gen_perf_query_result *result, { int i, idx = 0; - result->hw_id = start[2]; + if (result->hw_id == OA_REPORT_INVALID_CTX_ID && + start[2] != OA_REPORT_INVALID_CTX_ID) + result->hw_id = start[2]; result->reports_accumulated++; switch (query->oa_format) { @@ -579,7 +1184,63 @@ void gen_perf_query_result_clear(struct gen_perf_query_result *result) { memset(result, 0, sizeof(*result)); - result->hw_id = 0xffffffff; /* invalid */ + result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */ +} + +static void +register_mdapi_statistic_query(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo) +{ + if (!(devinfo->gen >= 7 && devinfo->gen <= 11)) + return; + + struct gen_perf_query_info *query = + append_query_info(perf_cfg, MAX_STAT_COUNTERS); + + query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; + query->name = "Intel_Raw_Pipeline_Statistics_Query"; + + /* The order has to match mdapi_pipeline_metrics. 
*/ + add_basic_stat_reg(query, IA_VERTICES_COUNT, + "N vertices submitted"); + add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, + "N primitives submitted"); + add_basic_stat_reg(query, VS_INVOCATION_COUNT, + "N vertex shader invocations"); + add_basic_stat_reg(query, GS_INVOCATION_COUNT, + "N geometry shader invocations"); + add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, + "N geometry shader primitives emitted"); + add_basic_stat_reg(query, CL_INVOCATION_COUNT, + "N primitives entering clipping"); + add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, + "N primitives leaving clipping"); + if (devinfo->is_haswell || devinfo->gen == 8) { + add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, + "N fragment shader invocations", + "N fragment shader invocations"); + } else { + add_basic_stat_reg(query, PS_INVOCATION_COUNT, + "N fragment shader invocations"); + } + add_basic_stat_reg(query, HS_INVOCATION_COUNT, + "N TCS shader invocations"); + add_basic_stat_reg(query, DS_INVOCATION_COUNT, + "N TES shader invocations"); + if (devinfo->gen >= 7) { + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "N compute shader invocations"); + } + + if (devinfo->gen >= 10) { + /* Reuse existing CS invocation register until we can expose this new + * one. + */ + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "Reserved1"); + } + + query->data_size = sizeof(uint64_t) * query->n_counters; } static void @@ -618,9 +1279,9 @@ fill_mdapi_perf_query_counter(struct gen_perf_query_info *query, sizeof(struct_name.field_name[0]), \ GEN_PERF_COUNTER_DATA_TYPE_##type_name) -void -gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo, - struct gen_perf_config *perf) +static void +register_mdapi_oa_query(const struct gen_device_info *devinfo, + struct gen_perf_config *perf) { struct gen_perf_query_info *query = NULL; @@ -632,7 +1293,7 @@ gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo, switch (devinfo->gen) { case 7: { - query = gen_perf_query_append_query_info(perf, 1 + 45 + 16 + 7); + query = append_query_info(perf, 1 + 45 + 16 + 7); query->oa_format = I915_OA_FORMAT_A45_B8_C8; struct gen7_mdapi_metrics metric_data; @@ -657,7 +1318,7 @@ gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo, break; } case 8: { - query = gen_perf_query_append_query_info(perf, 2 + 36 + 16 + 16); + query = append_query_info(perf, 2 + 36 + 16 + 16); query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; struct gen8_mdapi_metrics metric_data; @@ -694,7 +1355,7 @@ gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo, case 9: case 10: case 11: { - query = gen_perf_query_append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2); + query = append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2); query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; struct gen9_mdapi_metrics metric_data; @@ -756,65 +1417,9 @@ gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo, } } -void -gen_perf_query_register_mdapi_statistic_query(const struct gen_device_info *devinfo, - struct gen_perf_config *perf) -{ - if (!(devinfo->gen >= 7 && devinfo->gen <= 11)) - return; - - struct gen_perf_query_info *query = - gen_perf_query_append_query_info(perf, MAX_STAT_COUNTERS); - - query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; - query->name = "Intel_Raw_Pipeline_Statistics_Query"; - - /* The order has to match mdapi_pipeline_metrics. 
*/ - gen_perf_query_info_add_basic_stat_reg(query, IA_VERTICES_COUNT, - "N vertices submitted"); - gen_perf_query_info_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, - "N primitives submitted"); - gen_perf_query_info_add_basic_stat_reg(query, VS_INVOCATION_COUNT, - "N vertex shader invocations"); - gen_perf_query_info_add_basic_stat_reg(query, GS_INVOCATION_COUNT, - "N geometry shader invocations"); - gen_perf_query_info_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, - "N geometry shader primitives emitted"); - gen_perf_query_info_add_basic_stat_reg(query, CL_INVOCATION_COUNT, - "N primitives entering clipping"); - gen_perf_query_info_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, - "N primitives leaving clipping"); - if (devinfo->is_haswell || devinfo->gen == 8) { - gen_perf_query_info_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, - "N fragment shader invocations", - "N fragment shader invocations"); - } else { - gen_perf_query_info_add_basic_stat_reg(query, PS_INVOCATION_COUNT, - "N fragment shader invocations"); - } - gen_perf_query_info_add_basic_stat_reg(query, HS_INVOCATION_COUNT, - "N TCS shader invocations"); - gen_perf_query_info_add_basic_stat_reg(query, DS_INVOCATION_COUNT, - "N TES shader invocations"); - if (devinfo->gen >= 7) { - gen_perf_query_info_add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "N compute shader invocations"); - } - - if (devinfo->gen >= 10) { - /* Reuse existing CS invocation register until we can expose this new - * one. - */ - gen_perf_query_info_add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "Reserved1"); - } - - query->data_size = sizeof(uint64_t) * query->n_counters; -} - -uint64_t -gen_perf_query_get_metric_id(struct gen_perf_config *perf, - const struct gen_perf_query_info *query) +static uint64_t +get_metric_id(struct gen_perf_config *perf, + const struct gen_perf_query_info *query) { /* These queries are know not to ever change, their config ID has been * loaded upon the first query creation. No need to look them up again. @@ -847,8 +1452,8 @@ gen_perf_query_get_metric_id(struct gen_perf_config *perf, return query->oa_metrics_set_id; } -struct oa_sample_buf * -gen_perf_get_free_sample_buf(struct gen_perf_context *perf_ctx) +static struct oa_sample_buf * +get_free_sample_buf(struct gen_perf_context *perf_ctx) { struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers); struct oa_sample_buf *buf; @@ -866,8 +1471,8 @@ gen_perf_get_free_sample_buf(struct gen_perf_context *perf_ctx) return buf; } -void -gen_perf_reap_old_sample_buffers(struct gen_perf_context *perf_ctx) +static void +reap_old_sample_buffers(struct gen_perf_context *perf_ctx) { struct exec_node *tail_node = exec_list_get_tail(&perf_ctx->sample_buffers); @@ -890,8 +1495,8 @@ gen_perf_reap_old_sample_buffers(struct gen_perf_context *perf_ctx) } } -void -gen_perf_free_sample_bufs(struct gen_perf_context *perf_ctx) +static void +free_sample_bufs(struct gen_perf_context *perf_ctx) { foreach_list_typed_safe(struct oa_sample_buf, buf, link, &perf_ctx->free_sample_buffers) @@ -906,11 +1511,11 @@ gen_perf_free_sample_bufs(struct gen_perf_context *perf_ctx) * Emit MI_STORE_REGISTER_MEM commands to capture all of the * pipeline statistics for the performance query object. 
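 *
 * (The BO layout written here is assumed to follow the counter->offset
 * values assigned by add_stat_reg(): counter i is a uint64_t stored at
 * offset_in_bytes + i * sizeof(uint64_t).)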
*/ -void -gen_perf_snapshot_statistics_registers(void *context, - struct gen_perf_config *perf, - struct gen_perf_query_object *obj, - uint32_t offset_in_bytes) +static void +snapshot_statistics_registers(void *context, + struct gen_perf_config *perf, + struct gen_perf_query_object *obj, + uint32_t offset_in_bytes) { const struct gen_perf_query_info *query = obj->queryinfo; const int n_counters = query->n_counters; @@ -926,7 +1531,7 @@ gen_perf_snapshot_statistics_registers(void *context, } } -void +static void gen_perf_close(struct gen_perf_context *perfquery, const struct gen_perf_query_info *query) { @@ -941,7 +1546,7 @@ gen_perf_close(struct gen_perf_context *perfquery, } } -bool +static bool gen_perf_open(struct gen_perf_context *perf_ctx, int metrics_set_id, int report_format, @@ -982,8 +1587,8 @@ gen_perf_open(struct gen_perf_context *perf_ctx, return true; } -bool -gen_perf_inc_n_users(struct gen_perf_context *perf_ctx) +static bool +inc_n_users(struct gen_perf_context *perf_ctx) { if (perf_ctx->n_oa_users == 0 && gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0) @@ -995,8 +1600,8 @@ gen_perf_inc_n_users(struct gen_perf_context *perf_ctx) return true; } -void -gen_perf_dec_n_users(struct gen_perf_context *perf_ctx) +static void +dec_n_users(struct gen_perf_context *perf_ctx) { /* Disabling the i915 perf stream will effectively disable the OA * counters. Note it's important to be sure there are no outstanding @@ -1011,6 +1616,17 @@ gen_perf_dec_n_users(struct gen_perf_context *perf_ctx) } } +void +gen_perf_init_metrics(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo, + int drm_fd) +{ + load_pipeline_statistic_metrics(perf_cfg, devinfo); + register_mdapi_statistic_query(perf_cfg, devinfo); + if (load_oa_metrics(perf_cfg, drm_fd, devinfo)) + register_mdapi_oa_query(devinfo, perf_cfg); +} + void gen_perf_init_context(struct gen_perf_context *perf_ctx, struct gen_perf_config *perf_cfg, @@ -1040,7 +1656,7 @@ gen_perf_init_context(struct gen_perf_context *perf_ctx, * Begin an OA query we can always take a reference on a buffer * in this list. */ - struct oa_sample_buf *buf = gen_perf_get_free_sample_buf(perf_ctx); + struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); exec_list_push_head(&perf_ctx->sample_buffers, &buf->link); perf_ctx->oa_stream_fd = -1; @@ -1134,7 +1750,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx, * require a different counter set or format unless we get an opportunity * to close the stream and open a new one... */ - uint64_t metric_id = gen_perf_query_get_metric_id(perf_ctx->perf, queryinfo); + uint64_t metric_id = get_metric_id(perf_ctx->perf, queryinfo); if (perf_ctx->oa_stream_fd != -1 && perf_ctx->current_oa_metrics_set_id != metric_id) { @@ -1212,7 +1828,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx, perf_ctx->current_oa_format == queryinfo->oa_format); } - if (!gen_perf_inc_n_users(perf_ctx)) { + if (!inc_n_users(perf_ctx)) { DBG("WARNING: Error enabling i915 perf stream: %m\n"); return false; } @@ -1287,7 +1903,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx, STATS_BO_SIZE); /* Take starting snapshots. 
*/ - gen_perf_snapshot_statistics_registers(perf_ctx->ctx , perf_cfg, query, 0); + snapshot_statistics_registers(perf_ctx->ctx , perf_cfg, query, 0); ++perf_ctx->n_active_pipeline_stats_queries; break; @@ -1341,8 +1957,8 @@ gen_perf_end_query(struct gen_perf_context *perf_ctx, break; case GEN_PERF_QUERY_TYPE_PIPELINE: - gen_perf_snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query, - STATS_BO_END_OFFSET_BYTES); + snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query, + STATS_BO_END_OFFSET_BYTES); --perf_ctx->n_active_pipeline_stats_queries; break; @@ -1370,7 +1986,7 @@ read_oa_samples_until(struct gen_perf_context *perf_ctx, uint32_t last_timestamp = tail_buf->last_timestamp; while (1) { - struct oa_sample_buf *buf = gen_perf_get_free_sample_buf(perf_ctx); + struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); uint32_t offset; int len; @@ -1582,7 +2198,7 @@ drop_from_unaccumulated_query_list(struct gen_perf_context *perf_ctx, query->oa.samples_head = NULL; - gen_perf_reap_old_sample_buffers(perf_ctx); + reap_old_sample_buffers(perf_ctx); } /* In general if we see anything spurious while accumulating results, @@ -1599,7 +2215,7 @@ discard_all_queries(struct gen_perf_context *perf_ctx) query->oa.results_accumulated = true; drop_from_unaccumulated_query_list(perf_ctx, query); - gen_perf_dec_n_users(perf_ctx); + dec_n_users(perf_ctx); } } @@ -1647,6 +2263,14 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx, goto error; } + /* On Gen12+ OA reports are sourced from per context counters, so we don't + * ever have to look at the global OA buffer. Yey \o/ + */ + if (perf_ctx->devinfo->gen >= 12) { + last = start; + goto end; + } + /* See if we have any periodic reports to accumulate too... */ /* N.B. The oa.samples_head was set when the query began and @@ -1736,7 +2360,8 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx, } if (add) { - gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo, + gen_perf_query_result_accumulate(&query->oa.result, + query->queryinfo, last, report); } @@ -1762,7 +2387,7 @@ end: query->oa.results_accumulated = true; drop_from_unaccumulated_query_list(perf_ctx, query); - gen_perf_dec_n_users(perf_ctx); + dec_n_users(perf_ctx); return; @@ -1787,7 +2412,7 @@ gen_perf_delete_query(struct gen_perf_context *perf_ctx, if (query->oa.bo) { if (!query->oa.results_accumulated) { drop_from_unaccumulated_query_list(perf_ctx, query); - gen_perf_dec_n_users(perf_ctx); + dec_n_users(perf_ctx); } perf_cfg->vtbl.bo_unreference(query->oa.bo); @@ -1814,7 +2439,7 @@ gen_perf_delete_query(struct gen_perf_context *perf_ctx, * buffers and close any current i915-perf stream. */ if (--perf_ctx->n_query_instances == 0) { - gen_perf_free_sample_bufs(perf_ctx); + free_sample_bufs(perf_ctx); gen_perf_close(perf_ctx, query->queryinfo); } @@ -1979,3 +2604,33 @@ gen_perf_get_query_data(struct gen_perf_context *perf_ctx, if (bytes_written) *bytes_written = written; } + +void +gen_perf_dump_query_count(struct gen_perf_context *perf_ctx) +{ + DBG("Queries: (Open queries = %d, OA users = %d)\n", + perf_ctx->n_active_oa_queries, perf_ctx->n_oa_users); +} + +void +gen_perf_dump_query(struct gen_perf_context *ctx, + struct gen_perf_query_object *obj, + void *current_batch) +{ + switch (obj->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + DBG("BO: %-4s OA data: %-10s %-15s\n", + obj->oa.bo ? "yes," : "no,", + gen_perf_is_query_ready(ctx, obj, current_batch) ? "ready," : "not ready,", + obj->oa.results_accumulated ? 
"accumulated" : "not accumulated"); + break; + case GEN_PERF_QUERY_TYPE_PIPELINE: + DBG("BO: %-4s\n", + obj->pipeline_stats.bo ? "yes" : "no"); + break; + default: + unreachable("Unknown query type"); + break; + } +}