X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fintel%2Fperf%2Fgen_perf.h;h=790719ccbadad3960acea2d5f1d7e1af2d3f39ec;hp=6e69278ccdfd36d53d0008ab3996b338297587e9;hb=ec1fa1d51ff614c19d08c949482b40c060de48c9;hpb=f57c8a6dc110e7701233f159ce166c63ea75ff5f

diff --git a/src/intel/perf/gen_perf.h b/src/intel/perf/gen_perf.h
index 6e69278ccdf..790719ccbad 100644
--- a/src/intel/perf/gen_perf.h
+++ b/src/intel/perf/gen_perf.h
@@ -25,15 +25,22 @@
 #define GEN_PERF_H
 
 #include <stdio.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <string.h>
+#if defined(MAJOR_IN_SYSMACROS)
 #include <sys/sysmacros.h>
+#elif defined(MAJOR_IN_MKDEV)
+#include <sys/mkdev.h>
+#endif
 
 #include "util/hash_table.h"
 #include "compiler/glsl/list.h"
 #include "util/ralloc.h"
 
+#include "drm-uapi/i915_drm.h"
+
 struct gen_device_info;
 
 struct gen_perf_config;
@@ -56,6 +63,39 @@ enum gen_perf_counter_data_type {
    GEN_PERF_COUNTER_DATA_TYPE_DOUBLE,
 };
 
+enum gen_perf_counter_units {
+   /* size */
+   GEN_PERF_COUNTER_UNITS_BYTES,
+
+   /* frequency */
+   GEN_PERF_COUNTER_UNITS_HZ,
+
+   /* time */
+   GEN_PERF_COUNTER_UNITS_NS,
+   GEN_PERF_COUNTER_UNITS_US,
+
+   /* quantities */
+   GEN_PERF_COUNTER_UNITS_PIXELS,
+   GEN_PERF_COUNTER_UNITS_TEXELS,
+   GEN_PERF_COUNTER_UNITS_THREADS,
+   GEN_PERF_COUNTER_UNITS_PERCENT,
+
+   /* events */
+   GEN_PERF_COUNTER_UNITS_MESSAGES,
+   GEN_PERF_COUNTER_UNITS_NUMBER,
+   GEN_PERF_COUNTER_UNITS_CYCLES,
+   GEN_PERF_COUNTER_UNITS_EVENTS,
+   GEN_PERF_COUNTER_UNITS_UTILIZATION,
+
+   /* EU to L3 cache traffic */
+   GEN_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES,
+   GEN_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES,
+   GEN_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES,
+   GEN_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE,
+
+   GEN_PERF_COUNTER_UNITS_MAX
+};
+
 struct gen_pipeline_stat {
    uint32_t reg;
    uint32_t numerator;
@@ -71,19 +111,6 @@ struct gen_pipeline_stat {
  */
 #define MAX_OA_REPORT_COUNTERS 62
 
-#define IA_VERTICES_COUNT   0x2310
-#define IA_PRIMITIVES_COUNT 0x2318
-#define VS_INVOCATION_COUNT 0x2320
-#define HS_INVOCATION_COUNT 0x2300
-#define DS_INVOCATION_COUNT 0x2308
-#define GS_INVOCATION_COUNT 0x2328
-#define GS_PRIMITIVES_COUNT 0x2330
-#define CL_INVOCATION_COUNT 0x2338
-#define CL_PRIMITIVES_COUNT 0x2340
-#define PS_INVOCATION_COUNT 0x2348
-#define CS_INVOCATION_COUNT 0x2290
-#define PS_DEPTH_COUNT      0x2350
-
 /*
  * We currently allocate only one page for pipeline statistics queries; here
  * we derive the maximum number of counters for that amount.
@@ -122,13 +149,26 @@ struct gen_perf_query_result {
     * query.
     */
    uint64_t unslice_frequency[2];
+
+   /**
+    * Timestamp of the query.
+    */
+   uint64_t begin_timestamp;
+
+   /**
+    * Whether the query was interrupted by another workload (aka preemption).
+    */
+   bool query_disjoint;
 };
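The two new fields give clients the query's start time and a disjointness flag. For illustration only, a minimal sketch of a consumer of these fields; print_result_metadata() is a hypothetical helper and assumes the result was filled in by the accumulation helpers declared later in this header:

#include <inttypes.h>
#include <stdio.h>

/* Hypothetical consumer of the fields added above; "result" is assumed to
 * have been filled by gen_perf_query_result_accumulate(). */
static void
print_result_metadata(const struct gen_perf_query_result *result)
{
   printf("begin timestamp: %" PRIu64 "\n", result->begin_timestamp);
   printf("unslice freq (begin/end): %" PRIu64 " / %" PRIu64 "\n",
          result->unslice_frequency[0], result->unslice_frequency[1]);
   /* A disjoint query was preempted by another workload, so values
    * derived from it should be treated with suspicion. */
   if (result->query_disjoint)
      printf("warning: query was preempted\n");
}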
 
 struct gen_perf_query_counter {
    const char *name;
    const char *desc;
+   const char *symbol_name;
+   const char *category;
    enum gen_perf_counter_type type;
    enum gen_perf_counter_data_type data_type;
+   enum gen_perf_counter_units units;
    uint64_t raw_max;
    size_t offset;
@@ -148,6 +188,18 @@ struct gen_perf_query_register_prog {
    uint32_t val;
 };
 
+/* Register programming for a given query */
+struct gen_perf_registers {
+   const struct gen_perf_query_register_prog *flex_regs;
+   uint32_t n_flex_regs;
+
+   const struct gen_perf_query_register_prog *mux_regs;
+   uint32_t n_mux_regs;
+
+   const struct gen_perf_query_register_prog *b_counter_regs;
+   uint32_t n_b_counter_regs;
+};
+
 struct gen_perf_query_info {
    enum gen_perf_query_type {
       GEN_PERF_QUERY_TYPE_OA,
@@ -155,6 +207,7 @@ struct gen_perf_query_info {
       GEN_PERF_QUERY_TYPE_PIPELINE,
    } kind;
    const char *name;
+   const char *symbol_name;
    const char *guid;
    struct gen_perf_query_counter *counters;
    int n_counters;
@@ -172,21 +225,40 @@ struct gen_perf_query_info {
    int b_offset;
    int c_offset;
 
-   /* Register programming for a given query */
-   struct gen_perf_query_register_prog *flex_regs;
-   uint32_t n_flex_regs;
+   struct gen_perf_registers config;
+};
 
-   struct gen_perf_query_register_prog *mux_regs;
-   uint32_t n_mux_regs;
+struct gen_perf_query_counter_info {
+   struct gen_perf_query_counter *counter;
 
-   struct gen_perf_query_register_prog *b_counter_regs;
-   uint32_t n_b_counter_regs;
+   uint64_t query_mask;
+
+   /**
+    * Each counter can be part of many groups, each time at a different index.
+    * This struct stores one of those locations.
+    */
+   struct {
+      int group_idx;   /* query/group number */
+      int counter_idx; /* index within the query/group */
+   } location;
 };
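query_mask and location make counter-to-query lookups cheap without walking every group. A sketch of the bitmask use, assuming bit i of query_mask corresponds to the i-th query of the gen_perf_config defined just below (first_query_index_for_counter() is a hypothetical helper, not part of this header):

/* Hypothetical helper: index of the first query/group that exposes this
 * counter, assuming bit i of query_mask maps to queries[i]; -1 if none. */
static int
first_query_index_for_counter(const struct gen_perf_query_counter_info *info)
{
   for (int i = 0; i < 64; i++) {
      if (info->query_mask & (1ULL << i))
         return i;
   }
   return -1;
}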
 
 struct gen_perf_config {
+   /* Whether i915 has DRM_I915_QUERY_PERF_CONFIG support. */
+   bool i915_query_supported;
+
+   /* Version of the i915-perf subsystem, refer to i915_drm.h. */
+   int i915_perf_version;
+
+   /* Powergating configuration for running the query. */
+   struct drm_i915_gem_context_param_sseu sseu;
+
    struct gen_perf_query_info *queries;
    int n_queries;
 
+   struct gen_perf_query_counter_info *counter_infos;
+   int n_counters;
+
    /* Variables referenced in the XML meta data for OA performance
     * counters, e.g. in the normalization equations.
     *
@@ -211,275 +283,85 @@ struct gen_perf_config {
     */
    struct hash_table *oa_metrics_table;
 
+   /* When MDAPI hasn't configured the metric we need to use by the time the
+    * query begins, this OA metric is used as a fallback.
+    */
+   uint64_t fallback_raw_oa_metric;
+
+   /* Whether we have support for this platform. If true && n_queries == 0,
+    * this means we will not be able to use i915-perf because it is in
+    * paranoid mode.
+    */
+   bool platform_supported;
+
    /* Location of the device's sysfs entry.
     */
    char sysfs_dev_dir[256];
 
    struct {
       void *(*bo_alloc)(void *bufmgr, const char *name, uint64_t size);
       void (*bo_unreference)(void *bo);
-      void (*emit_mi_flush)(void *ctx);
+      void *(*bo_map)(void *ctx, void *bo, unsigned flags);
+      void (*bo_unmap)(void *bo);
+      bool (*batch_references)(void *batch, void *bo);
+      void (*bo_wait_rendering)(void *bo);
+      int (*bo_busy)(void *bo);
+      void (*emit_stall_at_pixel_scoreboard)(void *ctx);
       void (*emit_mi_report_perf_count)(void *ctx,
                                         void *bo,
                                         uint32_t offset_in_bytes,
                                         uint32_t report_id);
       void (*batchbuffer_flush)(void *ctx,
                                 const char *file, int line);
-      void (*capture_frequency_stat_register)(void *ctx, void *bo,
-                                              uint32_t bo_offset);
-      void (*store_register_mem64)(void *ctx, void *bo, uint32_t reg, uint32_t offset);
+      void (*store_register_mem)(void *ctx, void *bo, uint32_t reg, uint32_t reg_size, uint32_t offset);
    } vtbl;
 };
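The vtbl is what keeps this library driver-agnostic: i965 and iris each supply their own buffer-object and batch hooks instead of gen_perf linking against either driver. A sketch of the wiring, with malloc-backed stand-ins (stub_bo_alloc/stub_bo_unreference are hypothetical; a real driver forwards these to its buffer manager):

#include <stdlib.h>

/* Hypothetical malloc-backed stand-ins for the first two hooks. */
static void *
stub_bo_alloc(void *bufmgr, const char *name, uint64_t size)
{
   (void) bufmgr; (void) name;
   return calloc(1, size);
}

static void
stub_bo_unreference(void *bo)
{
   free(bo);
}

static void
wire_up_vtbl(struct gen_perf_config *perf)
{
   perf->vtbl.bo_alloc = stub_bo_alloc;
   perf->vtbl.bo_unreference = stub_bo_unreference;
   /* The remaining hooks (bo_map, bo_busy, emit_mi_report_perf_count,
    * store_register_mem, ...) would be wired to the driver's batch and
    * BO layer in the same way. */
}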
 
-/**
- * Periodic OA samples are read() into these buffer structures via the
- * i915 perf kernel interface and appended to the
- * brw->perfquery.sample_buffers linked list. When we process the
- * results of an OA metrics query we need to consider all the periodic
- * samples between the Begin and End MI_REPORT_PERF_COUNT command
- * markers.
- *
- * 'Periodic' is a simplification as there are other automatic reports
- * written by the hardware also buffered here.
- *
- * Considering three queries, A, B and C:
- *
- *  Time ---->
- *                ________________A_________________
- *                |                                |
- *                | ________B_________ _____C___________
- *                | |                | |                |
- *
- * And an illustration of sample buffers read over this time frame:
- * [HEAD ][     ][     ][     ][     ][     ][     ][     ][TAIL ]
- *
- * These nodes may hold samples for query A:
- * [     ][     ][  A  ][  A  ][  A  ][  A  ][  A  ][     ][     ]
- *
- * These nodes may hold samples for query B:
- * [     ][     ][  B  ][  B  ][  B  ][     ][     ][     ][     ]
- *
- * These nodes may hold samples for query C:
- * [     ][     ][     ][     ][     ][  C  ][  C  ][  C  ][     ]
- *
- * The illustration assumes we have an even distribution of periodic
- * samples so all nodes have the same size plotted against time:
- *
- * Note, to simplify code, the list is never empty.
- *
- * With overlapping queries we can see that periodic OA reports may
- * relate to multiple queries and care needs to be take to keep
- * track of sample buffers until there are no queries that might
- * depend on their contents.
- *
- * We use a node ref counting system where a reference ensures that a
- * node and all following nodes can't be freed/recycled until the
- * reference drops to zero.
- *
- * E.g. with a ref of one here:
- * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
- *
- * These nodes could be freed or recycled ("reaped"):
- * [ 0 ][ 0 ]
- *
- * These must be preserved until the leading ref drops to zero:
- *           [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
- *
- * When a query starts we take a reference on the current tail of
- * the list, knowing that no already-buffered samples can possibly
- * relate to the newly-started query. A pointer to this node is
- * also saved in the query object's ->oa.samples_head.
- *
- * E.g. starting query A while there are two nodes in .sample_buffers:
- *                ________________A________
- *                |
- *
- * [ 0 ][ 1 ]
- *        ^_______ Add a reference and store pointer to node in
- *                 A->oa.samples_head
- *
- * Moving forward to when the B query starts with no new buffer nodes:
- * (for reference, i915 perf reads() are only done when queries finish)
- *                ________________A_______
- *                | ________B___
- *                | |
- *
- * [ 0 ][ 2 ]
- *        ^_______ Add a reference and store pointer to
- *                 node in B->oa.samples_head
- *
- * Once a query is finished, after an OA query has become 'Ready',
- * once the End OA report has landed and after we we have processed
- * all the intermediate periodic samples then we drop the
- * ->oa.samples_head reference we took at the start.
- *
- * So when the B query has finished we have:
- *                ________________A________
- *                | ______B___________
- *                | |                |
- * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ]
- *                       ^_______ Drop B->oa.samples_head reference
- *
- * We still can't free these due to the A->oa.samples_head ref:
- *        [ 1 ][ 0 ][ 0 ][ 0 ]
- *
- * When the A query finishes: (note there's a new ref for C's samples_head)
- *                ________________A_________________
- *                |                                |
- *                |                    _____C_________
- *                |                    |             |
- * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ]
- *            ^_______ Drop A->oa.samples_head reference
- *
- * And we can now reap these nodes up to the C->oa.samples_head:
- * [ X ][ X ][ X ][ X ]
- *                   keeping -> [ 1 ][ 0 ][ 0 ]
- *
- * We reap old sample buffers each time we finish processing an OA
- * query by iterating the sample_buffers list from the head until we
- * find a referenced node and stop.
- *
- * Reaped buffers move to a perfquery.free_sample_buffers list and
- * when we come to read() we first look to recycle a buffer from the
- * free_sample_buffers list before allocating a new buffer.
- */
-struct oa_sample_buf {
-   struct exec_node link;
-   int refcount;
-   int len;
-   uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
-   uint32_t last_timestamp;
-};
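The comment removed above describes a take/drop reference protocol on the sample-buffer list. A condensed sketch of that protocol, for reference while reading the diff; the function names are illustrative rather than the actual implementation, and it assumes the exec_list_get_tail()/exec_node_data() helpers from compiler/glsl/list.h:

#include <assert.h>

/* Illustrative only: pin the current tail node when a query begins... */
static struct exec_node *
take_samples_head_ref(struct exec_list *sample_buffers)
{
   struct exec_node *tail = exec_list_get_tail(sample_buffers);
   struct oa_sample_buf *buf =
      exec_node_data(struct oa_sample_buf, tail, link);
   buf->refcount++; /* pins this node and every node after it */
   return tail;
}

/* ...and release it once the query's end report has been processed. */
static void
drop_samples_head_ref(struct exec_node *samples_head)
{
   struct oa_sample_buf *buf =
      exec_node_data(struct oa_sample_buf, samples_head, link);
   assert(buf->refcount > 0);
   buf->refcount--; /* reaping may now advance past this node */
}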
-
-/**
- * gen representation of a performance query object.
- *
- * NB: We want to keep this structure relatively lean considering that
- * applications may expect to allocate enough objects to be able to
- * query around all draw calls in a frame.
- */
-struct gen_perf_query_object
-{
-   const struct gen_perf_query_info *queryinfo;
-
-   /* See query->kind to know which state below is in use... */
-   union {
-      struct {
-
-         /**
-          * BO containing OA counter snapshots at query Begin/End time.
-          */
-         void *bo;
-
-         /**
-          * Address of mapped of @bo
-          */
-         void *map;
-
-         /**
-          * The MI_REPORT_PERF_COUNT command lets us specify a unique
-          * ID that will be reflected in the resulting OA report
-          * that's written by the GPU. This is the ID we're expecting
-          * in the begin report and the the end report should be
-          * @begin_report_id + 1.
-          */
-         int begin_report_id;
-
-         /**
-          * Reference the head of the brw->perfquery.sample_buffers
-          * list at the time that the query started (so we only need
-          * to look at nodes after this point when looking for samples
-          * related to this query)
-          *
-          * (See struct brw_oa_sample_buf description for more details)
-          */
-         struct exec_node *samples_head;
-
-         /**
-          * false while in the unaccumulated_elements list, and set to
-          * true when the final, end MI_RPC snapshot has been
-          * accumulated.
-          */
-         bool results_accumulated;
-
-         /**
-          * Frequency of the GT at begin and end of the query.
-          */
-         uint64_t gt_frequency[2];
-
-         /**
-          * Accumulated OA results between begin and end of the query.
-          */
-         struct gen_perf_query_result result;
-      } oa;
-
-      struct {
-         /**
-          * BO containing starting and ending snapshots for the
-          * statistics counters.
-          */
-         void *bo;
-      } pipeline_stats;
-   };
+struct gen_perf_counter_pass {
+   struct gen_perf_query_info *query;
+   struct gen_perf_query_counter *counter;
+   uint32_t pass;
 };
 
-struct gen_perf_context {
-   struct gen_perf_config *perf;
+void gen_perf_init_metrics(struct gen_perf_config *perf_cfg,
+                           const struct gen_device_info *devinfo,
+                           int drm_fd,
+                           bool include_pipeline_statistics);
 
-   /* The i915 perf stream we open to setup + enable the OA counters */
-   int oa_stream_fd;
-
-   /* An i915 perf stream fd gives exclusive access to the OA unit that will
-    * report counter snapshots for a specific counter set/profile in a
-    * specific layout/format so we can only start OA queries that are
-    * compatible with the currently open fd...
-    */
-   int current_oa_metrics_set_id;
-   int current_oa_format;
-
-   /* List of buffers containing OA reports */
-   struct exec_list sample_buffers;
-
-   /* Cached list of empty sample buffers */
-   struct exec_list free_sample_buffers;
-
-   int n_active_oa_queries;
-   int n_active_pipeline_stats_queries;
+/** Query i915 for a metric id using guid.
+ */
+bool gen_perf_load_metric_id(struct gen_perf_config *perf_cfg,
+                             const char *guid,
+                             uint64_t *metric_id);
 
-   /* The number of queries depending on running OA counters which
-    * extends beyond brw_end_perf_query() since we need to wait until
-    * the last MI_RPC command has parsed by the GPU.
-    *
-    * Accurate accounting is important here as emitting an
-    * MI_REPORT_PERF_COUNT command while the OA unit is disabled will
-    * effectively hang the gpu.
-    */
-   int n_oa_users;
+/** Load a configuration's content from i915 using a guid.
+ */
+struct gen_perf_registers *gen_perf_load_configuration(struct gen_perf_config *perf_cfg,
+                                                       int fd, const char *guid);
 
-   /* To help catch an spurious problem with the hardware or perf
-    * forwarding samples, we emit each MI_REPORT_PERF_COUNT command
-    * with a unique ID that we can explicitly check for...
-    */
-   int next_query_start_report_id;
+/** Store a configuration into i915 using guid and return a new metric id.
+ *
+ * If guid is NULL, then a generated one will be provided by hashing the
+ * content of the configuration.
+ */
+uint64_t gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd,
+                                      const struct gen_perf_registers *config,
+                                      const char *guid);
 
-   /**
-    * An array of queries whose results haven't yet been assembled
-    * based on the data in buffer objects.
-    *
-    * These may be active, or have already ended. However, the
-    * results have not been requested.
-    */
-   struct gen_perf_query_object **unaccumulated;
-   int unaccumulated_elements;
-   int unaccumulated_array_size;
-
-   /* The total number of query objects so we can relinquish
-    * our exclusive access to perf if the application deletes
-    * all of its objects. (NB: We only disable perf while
-    * there are no active queries)
-    */
-   int n_query_instances;
-};
+/** Read the slice/unslice frequency from 2 OA reports and store them into
+ *  result.
+ */
+void gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
+                                            const struct gen_device_info *devinfo,
+                                            const uint32_t *start,
+                                            const uint32_t *end);
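Taken together, gen_perf_load_metric_id() and gen_perf_store_configuration() support a look-up-or-register flow against i915. A sketch under the assumption that perf_cfg and the DRM fd are already initialized (resolve_metric_id() is a hypothetical helper):

/* Hypothetical helper: resolve a metric set id for a guid, registering the
 * register programming with i915 if the kernel does not know it yet. */
static uint64_t
resolve_metric_id(struct gen_perf_config *perf_cfg, int drm_fd,
                  const char *guid, const struct gen_perf_registers *config)
{
   uint64_t metric_id;

   if (gen_perf_load_metric_id(perf_cfg, guid, &metric_id))
      return metric_id; /* already registered with i915 */

   /* Unknown guid: upload the configuration and get a fresh id. */
   return gen_perf_store_configuration(perf_cfg, drm_fd, config, guid);
}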
+/** Accumulate the delta between 2 OA reports into result for a given query.
+ */
+void gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
+                                      const struct gen_perf_query_info *query,
+                                      const uint32_t *start,
+                                      const uint32_t *end);
+void gen_perf_query_result_clear(struct gen_perf_query_result *result);
 
 static inline size_t
 gen_perf_query_counter_get_size(const struct gen_perf_query_counter *counter)
@@ -500,58 +382,6 @@ gen_perf_query_counter_get_size(const struct gen_perf_query_counter *counter)
    }
 }
 
-static inline struct gen_perf_query_info *
-gen_perf_query_append_query_info(struct gen_perf_config *perf, int max_counters)
-{
-   struct gen_perf_query_info *query;
-
-   perf->queries = reralloc(perf, perf->queries,
-                            struct gen_perf_query_info,
-                            ++perf->n_queries);
-   query = &perf->queries[perf->n_queries - 1];
-   memset(query, 0, sizeof(*query));
-
-   if (max_counters > 0) {
-      query->max_counters = max_counters;
-      query->counters =
-         rzalloc_array(perf, struct gen_perf_query_counter, max_counters);
-   }
-
-   return query;
-}
-
-static inline void
-gen_perf_query_info_add_stat_reg(struct gen_perf_query_info *query,
-                                 uint32_t reg,
-                                 uint32_t numerator,
-                                 uint32_t denominator,
-                                 const char *name,
-                                 const char *description)
-{
-   struct gen_perf_query_counter *counter;
-
-   assert(query->n_counters < query->max_counters);
-
-   counter = &query->counters[query->n_counters];
-   counter->name = name;
-   counter->desc = description;
-   counter->type = GEN_PERF_COUNTER_TYPE_RAW;
-   counter->data_type = GEN_PERF_COUNTER_DATA_TYPE_UINT64;
-   counter->offset = sizeof(uint64_t) * query->n_counters;
-   counter->pipeline_stat.reg = reg;
-   counter->pipeline_stat.numerator = numerator;
-   counter->pipeline_stat.denominator = denominator;
-
-   query->n_counters++;
-}
-
-static inline void
-gen_perf_query_info_add_basic_stat_reg(struct gen_perf_query_info *query,
-                                       uint32_t reg, const char *name)
-{
-   gen_perf_query_info_add_stat_reg(query, reg, 1, 1, name, name);
-}
-
 static inline struct gen_perf_config *
 gen_perf_new(void *ctx)
 {
@@ -559,33 +389,13 @@ gen_perf_new(void *ctx)
    return perf;
 }
 
-bool gen_perf_load_oa_metrics(struct gen_perf_config *perf, int fd,
-                              const struct gen_device_info *devinfo);
-bool gen_perf_load_metric_id(struct gen_perf_config *perf, const char *guid,
-                             uint64_t *metric_id);
-
-void gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
-                                            const struct gen_device_info *devinfo,
-                                            const uint32_t *start,
-                                            const uint32_t *end);
-void gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
-                                      const struct gen_perf_query_info *query,
-                                      const uint32_t *start,
-                                      const uint32_t *end);
-void gen_perf_query_result_clear(struct gen_perf_query_result *result);
-void gen_perf_query_register_mdapi_statistic_query(const struct gen_device_info *devinfo,
-                                                   struct gen_perf_config *perf);
-void gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo,
-                                            struct gen_perf_config *perf);
-uint64_t gen_perf_query_get_metric_id(struct gen_perf_config *perf,
-                                      const struct gen_perf_query_info *query);
-struct oa_sample_buf * gen_perf_get_free_sample_buf(struct gen_perf_context *perf);
-void gen_perf_reap_old_sample_buffers(struct gen_perf_context *perf_ctx);
-void gen_perf_free_sample_bufs(struct gen_perf_context *perf_ctx);
-
-void gen_perf_snapshot_statistics_registers(void *context,
-                                            struct gen_perf_config *perf,
-                                            struct gen_perf_query_object *obj,
-                                            uint32_t offset_in_bytes);
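gen_perf_query_counter_get_size() above is what makes tightly packed result buffers possible: summing it over a query's counters yields the buffer size the counters' offset fields were assigned against. A sketch (query_results_size() is a hypothetical helper, not part of this header):

/* Hypothetical helper: total size of a packed results buffer for "query",
 * mirroring how each counter's "offset" field is assigned. */
static size_t
query_results_size(const struct gen_perf_query_info *query)
{
   size_t size = 0;
   for (int i = 0; i < query->n_counters; i++)
      size += gen_perf_query_counter_get_size(&query->counters[i]);
   return size;
}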
+uint32_t gen_perf_get_n_passes(struct gen_perf_config *perf,
+                               const uint32_t *counter_indices,
+                               uint32_t counter_indices_count,
+                               struct gen_perf_query_info **pass_queries);
+void gen_perf_get_counters_passes(struct gen_perf_config *perf,
+                                  const uint32_t *counter_indices,
+                                  uint32_t counter_indices_count,
+                                  struct gen_perf_counter_pass *counter_pass);
 
 #endif /* GEN_PERF_H */
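The two pass helpers added above answer a pair of questions: how many OA queries are needed to cover a counter selection, and on which pass each counter lands. A usage sketch, assuming no selection can ever need more passes than there are queries (plan_passes() is a hypothetical caller, not part of this header):

#include <stdlib.h>

/* Hypothetical caller: plan the passes for a counter selection. Indices
 * refer to perf->counter_infos[]; counter_pass must hold "count" entries. */
static uint32_t
plan_passes(struct gen_perf_config *perf,
            const uint32_t *counter_indices, uint32_t count,
            struct gen_perf_counter_pass *counter_pass)
{
   /* There cannot be more passes than available queries. */
   struct gen_perf_query_info **pass_queries =
      calloc(perf->n_queries, sizeof(*pass_queries));
   uint32_t n_passes =
      gen_perf_get_n_passes(perf, counter_indices, count, pass_queries);

   /* counter_pass[i].pass then records on which of those queries
    * counter i will actually be sampled. */
   gen_perf_get_counters_passes(perf, counter_indices, count, counter_pass);

   free(pass_queries);
   return n_passes;
}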