X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fperf%2Fgen_perf.c;h=4ef28c42d8fc06c338a1836ac98013f6a9db4e40;hb=61c54a88785f394f37605702053e738c790ed025;hp=fe5bbabe3c81e41ea87c06ed1a1586b5c763b11d;hpb=8c9eac12345fb6ca7a6ae108a0451cbbcfff47ed;p=mesa.git diff --git a/src/intel/perf/gen_perf.c b/src/intel/perf/gen_perf.c index fe5bbabe3c8..4ef28c42d8f 100644 --- a/src/intel/perf/gen_perf.c +++ b/src/intel/perf/gen_perf.c @@ -47,9 +47,345 @@ #define MI_RPC_BO_END_OFFSET_BYTES (MI_RPC_BO_SIZE / 2) #define MI_FREQ_END_OFFSET_BYTES (3076) +#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low)) + +#define GEN7_RPSTAT1 0xA01C +#define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7 +#define GEN7_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(13, 7) +#define GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT 0 +#define GEN7_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(6, 0) + +#define GEN9_RPSTAT0 0xA01C +#define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT 23 +#define GEN9_RPSTAT0_CURR_GT_FREQ_MASK INTEL_MASK(31, 23) +#define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT 0 +#define GEN9_RPSTAT0_PREV_GT_FREQ_MASK INTEL_MASK(8, 0) + +#define GEN6_SO_PRIM_STORAGE_NEEDED 0x2280 +#define GEN7_SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8) +#define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288 +#define GEN7_SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8) + #define MAP_READ (1 << 0) #define MAP_WRITE (1 << 1) +/** + * Periodic OA samples are read() into these buffer structures via the + * i915 perf kernel interface and appended to the + * perf_ctx->sample_buffers linked list. When we process the + * results of an OA metrics query we need to consider all the periodic + * samples between the Begin and End MI_REPORT_PERF_COUNT command + * markers. + * + * 'Periodic' is a simplification as there are other automatic reports + * written by the hardware also buffered here. + * + * Considering three queries, A, B and C: + * + * Time ----> + * ________________A_________________ + * | | + * | ________B_________ _____C___________ + * | | | | | | + * + * And an illustration of sample buffers read over this time frame: + * [HEAD ][ ][ ][ ][ ][ ][ ][ ][TAIL ] + * + * These nodes may hold samples for query A: + * [ ][ ][ A ][ A ][ A ][ A ][ A ][ ][ ] + * + * These nodes may hold samples for query B: + * [ ][ ][ B ][ B ][ B ][ ][ ][ ][ ] + * + * These nodes may hold samples for query C: + * [ ][ ][ ][ ][ ][ C ][ C ][ C ][ ] + * + * The illustration assumes we have an even distribution of periodic + * samples so all nodes have the same size plotted against time: + * + * Note, to simplify code, the list is never empty. + * + * With overlapping queries we can see that periodic OA reports may + * relate to multiple queries and care needs to be take to keep + * track of sample buffers until there are no queries that might + * depend on their contents. + * + * We use a node ref counting system where a reference ensures that a + * node and all following nodes can't be freed/recycled until the + * reference drops to zero. + * + * E.g. with a ref of one here: + * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ] + * + * These nodes could be freed or recycled ("reaped"): + * [ 0 ][ 0 ] + * + * These must be preserved until the leading ref drops to zero: + * [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ] + * + * When a query starts we take a reference on the current tail of + * the list, knowing that no already-buffered samples can possibly + * relate to the newly-started query. A pointer to this node is + * also saved in the query object's ->oa.samples_head. + * + * E.g. 
starting query A while there are two nodes in .sample_buffers: + * ________________A________ + * | + * + * [ 0 ][ 1 ] + * ^_______ Add a reference and store pointer to node in + * A->oa.samples_head + * + * Moving forward to when the B query starts with no new buffer nodes: + * (for reference, i915 perf reads() are only done when queries finish) + * ________________A_______ + * | ________B___ + * | | + * + * [ 0 ][ 2 ] + * ^_______ Add a reference and store pointer to + * node in B->oa.samples_head + * + * Once a query is finished, after an OA query has become 'Ready', + * once the End OA report has landed and after we we have processed + * all the intermediate periodic samples then we drop the + * ->oa.samples_head reference we took at the start. + * + * So when the B query has finished we have: + * ________________A________ + * | ______B___________ + * | | | + * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ] + * ^_______ Drop B->oa.samples_head reference + * + * We still can't free these due to the A->oa.samples_head ref: + * [ 1 ][ 0 ][ 0 ][ 0 ] + * + * When the A query finishes: (note there's a new ref for C's samples_head) + * ________________A_________________ + * | | + * | _____C_________ + * | | | + * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ] + * ^_______ Drop A->oa.samples_head reference + * + * And we can now reap these nodes up to the C->oa.samples_head: + * [ X ][ X ][ X ][ X ] + * keeping -> [ 1 ][ 0 ][ 0 ] + * + * We reap old sample buffers each time we finish processing an OA + * query by iterating the sample_buffers list from the head until we + * find a referenced node and stop. + * + * Reaped buffers move to a perfquery.free_sample_buffers list and + * when we come to read() we first look to recycle a buffer from the + * free_sample_buffers list before allocating a new buffer. + */ +struct oa_sample_buf { + struct exec_node link; + int refcount; + int len; + uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10]; + uint32_t last_timestamp; +}; + +/** + * gen representation of a performance query object. + * + * NB: We want to keep this structure relatively lean considering that + * applications may expect to allocate enough objects to be able to + * query around all draw calls in a frame. + */ +struct gen_perf_query_object +{ + const struct gen_perf_query_info *queryinfo; + + /* See query->kind to know which state below is in use... */ + union { + struct { + + /** + * BO containing OA counter snapshots at query Begin/End time. + */ + void *bo; + + /** + * Address of mapped of @bo + */ + void *map; + + /** + * The MI_REPORT_PERF_COUNT command lets us specify a unique + * ID that will be reflected in the resulting OA report + * that's written by the GPU. This is the ID we're expecting + * in the begin report and the the end report should be + * @begin_report_id + 1. + */ + int begin_report_id; + + /** + * Reference the head of the brw->perfquery.sample_buffers + * list at the time that the query started (so we only need + * to look at nodes after this point when looking for samples + * related to this query) + * + * (See struct brw_oa_sample_buf description for more details) + */ + struct exec_node *samples_head; + + /** + * false while in the unaccumulated_elements list, and set to + * true when the final, end MI_RPC snapshot has been + * accumulated. + */ + bool results_accumulated; + + /** + * Frequency of the GT at begin and end of the query. + */ + uint64_t gt_frequency[2]; + + /** + * Accumulated OA results between begin and end of the query. 
+ */ + struct gen_perf_query_result result; + } oa; + + struct { + /** + * BO containing starting and ending snapshots for the + * statistics counters. + */ + void *bo; + } pipeline_stats; + }; +}; + +struct gen_perf_context { + struct gen_perf_config *perf; + + void * ctx; /* driver context (eg, brw_context) */ + void * bufmgr; + const struct gen_device_info *devinfo; + + uint32_t hw_ctx; + int drm_fd; + + /* The i915 perf stream we open to setup + enable the OA counters */ + int oa_stream_fd; + + /* An i915 perf stream fd gives exclusive access to the OA unit that will + * report counter snapshots for a specific counter set/profile in a + * specific layout/format so we can only start OA queries that are + * compatible with the currently open fd... + */ + int current_oa_metrics_set_id; + int current_oa_format; + + /* List of buffers containing OA reports */ + struct exec_list sample_buffers; + + /* Cached list of empty sample buffers */ + struct exec_list free_sample_buffers; + + int n_active_oa_queries; + int n_active_pipeline_stats_queries; + + /* The number of queries depending on running OA counters which + * extends beyond brw_end_perf_query() since we need to wait until + * the last MI_RPC command has parsed by the GPU. + * + * Accurate accounting is important here as emitting an + * MI_REPORT_PERF_COUNT command while the OA unit is disabled will + * effectively hang the gpu. + */ + int n_oa_users; + + /* To help catch an spurious problem with the hardware or perf + * forwarding samples, we emit each MI_REPORT_PERF_COUNT command + * with a unique ID that we can explicitly check for... + */ + int next_query_start_report_id; + + /** + * An array of queries whose results haven't yet been assembled + * based on the data in buffer objects. + * + * These may be active, or have already ended. However, the + * results have not been requested. + */ + struct gen_perf_query_object **unaccumulated; + int unaccumulated_elements; + int unaccumulated_array_size; + + /* The total number of query objects so we can relinquish + * our exclusive access to perf if the application deletes + * all of its objects. (NB: We only disable perf while + * there are no active queries) + */ + int n_query_instances; +}; + +const struct gen_perf_query_info* +gen_perf_query_info(const struct gen_perf_query_object *query) +{ + return query->queryinfo; +} + +struct gen_perf_context * +gen_perf_new_context(void *parent) +{ + struct gen_perf_context *ctx = rzalloc(parent, struct gen_perf_context); + if (! 
ctx) + fprintf(stderr, "%s: failed to alloc context\n", __func__); + return ctx; +} + +struct gen_perf_config * +gen_perf_config(struct gen_perf_context *ctx) +{ + return ctx->perf; +} + +struct gen_perf_query_object * +gen_perf_new_query(struct gen_perf_context *perf_ctx, unsigned query_index) +{ + const struct gen_perf_query_info *query = + &perf_ctx->perf->queries[query_index]; + struct gen_perf_query_object *obj = + calloc(1, sizeof(struct gen_perf_query_object)); + + if (!obj) + return NULL; + + obj->queryinfo = query; + + perf_ctx->n_query_instances++; + return obj; +} + +int +gen_perf_active_queries(struct gen_perf_context *perf_ctx, + const struct gen_perf_query_info *query) +{ + assert(perf_ctx->n_active_oa_queries == 0 || perf_ctx->n_active_pipeline_stats_queries == 0); + + switch (query->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + return perf_ctx->n_active_oa_queries; + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + return perf_ctx->n_active_pipeline_stats_queries; + break; + + default: + unreachable("Unknown query type"); + break; + } +} + static bool get_sysfs_dev_dir(struct gen_perf_config *perf, int fd) { @@ -151,18 +487,37 @@ read_sysfs_drm_device_file_uint64(struct gen_perf_config *perf, return read_file_uint64(buf, value); } +static inline struct gen_perf_query_info * +append_query_info(struct gen_perf_config *perf, int max_counters) +{ + struct gen_perf_query_info *query; + + perf->queries = reralloc(perf, perf->queries, + struct gen_perf_query_info, + ++perf->n_queries); + query = &perf->queries[perf->n_queries - 1]; + memset(query, 0, sizeof(*query)); + + if (max_counters > 0) { + query->max_counters = max_counters; + query->counters = + rzalloc_array(perf, struct gen_perf_query_counter, max_counters); + } + + return query; +} + static void register_oa_config(struct gen_perf_config *perf, const struct gen_perf_query_info *query, uint64_t config_id) { - struct gen_perf_query_info *registred_query = - gen_perf_query_append_query_info(perf, 0); + struct gen_perf_query_info *registered_query = append_query_info(perf, 0); - *registred_query = *query; - registred_query->oa_metrics_set_id = config_id; - DBG("metric set registred: id = %" PRIu64", guid = %s\n", - registred_query->oa_metrics_set_id, query->guid); + *registered_query = *query; + registered_query->oa_metrics_set_id = config_id; + DBG("metric set registered: id = %" PRIu64", guid = %s\n", + registered_query->oa_metrics_set_id, query->guid); } static void @@ -228,9 +583,9 @@ kernel_has_dynamic_config_support(struct gen_perf_config *perf, int fd) &invalid_config_id) < 0 && errno == ENOENT; } -bool -gen_perf_load_metric_id(struct gen_perf_config *perf, const char *guid, - uint64_t *metric_id) +static bool +load_metric_id(struct gen_perf_config *perf, const char *guid, + uint64_t *metric_id) { char config_path[280]; @@ -250,7 +605,7 @@ init_oa_configs(struct gen_perf_config *perf, int fd) uint64_t config_id; int ret; - if (gen_perf_load_metric_id(perf, query->guid, &config_id)) { + if (load_metric_id(perf, query->guid, &config_id)) { DBG("metric set: %s (already loaded)\n", query->guid); register_oa_config(perf, query, config_id); continue; @@ -381,8 +736,123 @@ get_register_queries_function(const struct gen_device_info *devinfo) return NULL; } -bool -gen_perf_load_oa_metrics(struct gen_perf_config *perf, int fd, +static inline void +add_stat_reg(struct gen_perf_query_info *query, uint32_t reg, + uint32_t numerator, uint32_t denominator, + const char *name, const char *description) +{ + 
struct gen_perf_query_counter *counter; + + assert(query->n_counters < query->max_counters); + + counter = &query->counters[query->n_counters]; + counter->name = name; + counter->desc = description; + counter->type = GEN_PERF_COUNTER_TYPE_RAW; + counter->data_type = GEN_PERF_COUNTER_DATA_TYPE_UINT64; + counter->offset = sizeof(uint64_t) * query->n_counters; + counter->pipeline_stat.reg = reg; + counter->pipeline_stat.numerator = numerator; + counter->pipeline_stat.denominator = denominator; + + query->n_counters++; +} + +static inline void +add_basic_stat_reg(struct gen_perf_query_info *query, + uint32_t reg, const char *name) +{ + add_stat_reg(query, reg, 1, 1, name, name); +} + +static void +load_pipeline_statistic_metrics(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo) +{ + struct gen_perf_query_info *query = + append_query_info(perf_cfg, MAX_STAT_COUNTERS); + + query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; + query->name = "Pipeline Statistics Registers"; + + add_basic_stat_reg(query, IA_VERTICES_COUNT, + "N vertices submitted"); + add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, + "N primitives submitted"); + add_basic_stat_reg(query, VS_INVOCATION_COUNT, + "N vertex shader invocations"); + + if (devinfo->gen == 6) { + add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1, + "SO_PRIM_STORAGE_NEEDED", + "N geometry shader stream-out primitives (total)"); + add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1, + "SO_NUM_PRIMS_WRITTEN", + "N geometry shader stream-out primitives (written)"); + } else { + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 0)", + "N stream-out (stream 0) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 1)", + "N stream-out (stream 1) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 2)", + "N stream-out (stream 2) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 3)", + "N stream-out (stream 3) primitives (total)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 0)", + "N stream-out (stream 0) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 1)", + "N stream-out (stream 1) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 2)", + "N stream-out (stream 2) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 3)", + "N stream-out (stream 3) primitives (written)"); + } + + add_basic_stat_reg(query, HS_INVOCATION_COUNT, + "N TCS shader invocations"); + add_basic_stat_reg(query, DS_INVOCATION_COUNT, + "N TES shader invocations"); + + add_basic_stat_reg(query, GS_INVOCATION_COUNT, + "N geometry shader invocations"); + add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, + "N geometry shader primitives emitted"); + + add_basic_stat_reg(query, CL_INVOCATION_COUNT, + "N primitives entering clipping"); + add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, + "N primitives leaving clipping"); + + if (devinfo->is_haswell || devinfo->gen == 8) { + add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, + "N fragment shader invocations", + "N fragment shader invocations"); + } else { + add_basic_stat_reg(query, PS_INVOCATION_COUNT, + "N fragment shader invocations"); + } + + add_basic_stat_reg(query, 
PS_DEPTH_COUNT, + "N z-pass fragments"); + + if (devinfo->gen >= 7) { + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "N compute shader invocations"); + } + + query->data_size = sizeof(uint64_t) * query->n_counters; +} + +static bool +load_oa_metrics(struct gen_perf_config *perf, int fd, const struct gen_device_info *devinfo) { perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo); @@ -494,11 +964,11 @@ gen8_read_report_clock_ratios(const uint32_t *report, *unslice_freq_hz = unslice_freq * 16666667ULL; } -void -gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result, - const struct gen_device_info *devinfo, - const uint32_t *start, - const uint32_t *end) +static void +query_result_read_frequencies(struct gen_perf_query_result *result, + const struct gen_device_info *devinfo, + const uint32_t *start, + const uint32_t *end) { /* Slice/Unslice frequency is only available in the OA reports when the * "Disable OA reports due to clock ratio change" field in @@ -519,11 +989,11 @@ gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result, &result->unslice_frequency[1]); } -void -gen_perf_query_result_accumulate(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const uint32_t *start, - const uint32_t *end) +static void +query_result_accumulate(struct gen_perf_query_result *result, + const struct gen_perf_query_info *query, + const uint32_t *start, + const uint32_t *end) { int i, idx = 0; @@ -561,13 +1031,69 @@ gen_perf_query_result_accumulate(struct gen_perf_query_result *result, } -void -gen_perf_query_result_clear(struct gen_perf_query_result *result) +static void +query_result_clear(struct gen_perf_query_result *result) { memset(result, 0, sizeof(*result)); result->hw_id = 0xffffffff; /* invalid */ } +static void +register_mdapi_statistic_query(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo) +{ + if (!(devinfo->gen >= 7 && devinfo->gen <= 11)) + return; + + struct gen_perf_query_info *query = + append_query_info(perf_cfg, MAX_STAT_COUNTERS); + + query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; + query->name = "Intel_Raw_Pipeline_Statistics_Query"; + + /* The order has to match mdapi_pipeline_metrics. */ + add_basic_stat_reg(query, IA_VERTICES_COUNT, + "N vertices submitted"); + add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, + "N primitives submitted"); + add_basic_stat_reg(query, VS_INVOCATION_COUNT, + "N vertex shader invocations"); + add_basic_stat_reg(query, GS_INVOCATION_COUNT, + "N geometry shader invocations"); + add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, + "N geometry shader primitives emitted"); + add_basic_stat_reg(query, CL_INVOCATION_COUNT, + "N primitives entering clipping"); + add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, + "N primitives leaving clipping"); + if (devinfo->is_haswell || devinfo->gen == 8) { + add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, + "N fragment shader invocations", + "N fragment shader invocations"); + } else { + add_basic_stat_reg(query, PS_INVOCATION_COUNT, + "N fragment shader invocations"); + } + add_basic_stat_reg(query, HS_INVOCATION_COUNT, + "N TCS shader invocations"); + add_basic_stat_reg(query, DS_INVOCATION_COUNT, + "N TES shader invocations"); + if (devinfo->gen >= 7) { + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "N compute shader invocations"); + } + + if (devinfo->gen >= 10) { + /* Reuse existing CS invocation register until we can expose this new + * one. 
+ */ + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "Reserved1"); + } + + query->data_size = sizeof(uint64_t) * query->n_counters; +} + static void fill_mdapi_perf_query_counter(struct gen_perf_query_info *query, const char *name, @@ -604,9 +1130,9 @@ fill_mdapi_perf_query_counter(struct gen_perf_query_info *query, sizeof(struct_name.field_name[0]), \ GEN_PERF_COUNTER_DATA_TYPE_##type_name) -void -gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo, - struct gen_perf_config *perf) +static void +register_mdapi_oa_query(const struct gen_device_info *devinfo, + struct gen_perf_config *perf) { struct gen_perf_query_info *query = NULL; @@ -618,7 +1144,7 @@ gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo, switch (devinfo->gen) { case 7: { - query = gen_perf_query_append_query_info(perf, 1 + 45 + 16 + 7); + query = append_query_info(perf, 1 + 45 + 16 + 7); query->oa_format = I915_OA_FORMAT_A45_B8_C8; struct gen7_mdapi_metrics metric_data; @@ -643,7 +1169,7 @@ gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo, break; } case 8: { - query = gen_perf_query_append_query_info(perf, 2 + 36 + 16 + 16); + query = append_query_info(perf, 2 + 36 + 16 + 16); query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; struct gen8_mdapi_metrics metric_data; @@ -680,7 +1206,7 @@ gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo, case 9: case 10: case 11: { - query = gen_perf_query_append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2); + query = append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2); query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; struct gen9_mdapi_metrics metric_data; @@ -742,65 +1268,9 @@ gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo, } } -void -gen_perf_query_register_mdapi_statistic_query(const struct gen_device_info *devinfo, - struct gen_perf_config *perf) -{ - if (!(devinfo->gen >= 7 && devinfo->gen <= 11)) - return; - - struct gen_perf_query_info *query = - gen_perf_query_append_query_info(perf, MAX_STAT_COUNTERS); - - query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; - query->name = "Intel_Raw_Pipeline_Statistics_Query"; - - /* The order has to match mdapi_pipeline_metrics. 
*/ - gen_perf_query_info_add_basic_stat_reg(query, IA_VERTICES_COUNT, - "N vertices submitted"); - gen_perf_query_info_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, - "N primitives submitted"); - gen_perf_query_info_add_basic_stat_reg(query, VS_INVOCATION_COUNT, - "N vertex shader invocations"); - gen_perf_query_info_add_basic_stat_reg(query, GS_INVOCATION_COUNT, - "N geometry shader invocations"); - gen_perf_query_info_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, - "N geometry shader primitives emitted"); - gen_perf_query_info_add_basic_stat_reg(query, CL_INVOCATION_COUNT, - "N primitives entering clipping"); - gen_perf_query_info_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, - "N primitives leaving clipping"); - if (devinfo->is_haswell || devinfo->gen == 8) { - gen_perf_query_info_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, - "N fragment shader invocations", - "N fragment shader invocations"); - } else { - gen_perf_query_info_add_basic_stat_reg(query, PS_INVOCATION_COUNT, - "N fragment shader invocations"); - } - gen_perf_query_info_add_basic_stat_reg(query, HS_INVOCATION_COUNT, - "N TCS shader invocations"); - gen_perf_query_info_add_basic_stat_reg(query, DS_INVOCATION_COUNT, - "N TES shader invocations"); - if (devinfo->gen >= 7) { - gen_perf_query_info_add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "N compute shader invocations"); - } - - if (devinfo->gen >= 10) { - /* Reuse existing CS invocation register until we can expose this new - * one. - */ - gen_perf_query_info_add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "Reserved1"); - } - - query->data_size = sizeof(uint64_t) * query->n_counters; -} - -uint64_t -gen_perf_query_get_metric_id(struct gen_perf_config *perf, - const struct gen_perf_query_info *query) +static uint64_t +get_metric_id(struct gen_perf_config *perf, + const struct gen_perf_query_info *query) { /* These queries are know not to ever change, their config ID has been * loaded upon the first query creation. No need to look them up again. 
@@ -822,8 +1292,8 @@ gen_perf_query_get_metric_id(struct gen_perf_config *perf, } struct gen_perf_query_info *raw_query = (struct gen_perf_query_info *)query; - if (!gen_perf_load_metric_id(perf, query->guid, - &raw_query->oa_metrics_set_id)) { + if (!load_metric_id(perf, query->guid, + &raw_query->oa_metrics_set_id)) { DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid); raw_query->oa_metrics_set_id = 1ULL; } else { @@ -833,8 +1303,8 @@ gen_perf_query_get_metric_id(struct gen_perf_config *perf, return query->oa_metrics_set_id; } -struct oa_sample_buf * -gen_perf_get_free_sample_buf(struct gen_perf_context *perf_ctx) +static struct oa_sample_buf * +get_free_sample_buf(struct gen_perf_context *perf_ctx) { struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers); struct oa_sample_buf *buf; @@ -852,8 +1322,8 @@ gen_perf_get_free_sample_buf(struct gen_perf_context *perf_ctx) return buf; } -void -gen_perf_reap_old_sample_buffers(struct gen_perf_context *perf_ctx) +static void +reap_old_sample_buffers(struct gen_perf_context *perf_ctx) { struct exec_node *tail_node = exec_list_get_tail(&perf_ctx->sample_buffers); @@ -876,8 +1346,8 @@ gen_perf_reap_old_sample_buffers(struct gen_perf_context *perf_ctx) } } -void -gen_perf_free_sample_bufs(struct gen_perf_context *perf_ctx) +static void +free_sample_bufs(struct gen_perf_context *perf_ctx) { foreach_list_typed_safe(struct oa_sample_buf, buf, link, &perf_ctx->free_sample_buffers) @@ -892,11 +1362,11 @@ gen_perf_free_sample_bufs(struct gen_perf_context *perf_ctx) * Emit MI_STORE_REGISTER_MEM commands to capture all of the * pipeline statistics for the performance query object. */ -void -gen_perf_snapshot_statistics_registers(void *context, - struct gen_perf_config *perf, - struct gen_perf_query_object *obj, - uint32_t offset_in_bytes) +static void +snapshot_statistics_registers(void *context, + struct gen_perf_config *perf, + struct gen_perf_query_object *obj, + uint32_t offset_in_bytes) { const struct gen_perf_query_info *query = obj->queryinfo; const int n_counters = query->n_counters; @@ -912,7 +1382,7 @@ gen_perf_snapshot_statistics_registers(void *context, } } -void +static void gen_perf_close(struct gen_perf_context *perfquery, const struct gen_perf_query_info *query) { @@ -927,7 +1397,7 @@ gen_perf_close(struct gen_perf_context *perfquery, } } -bool +static bool gen_perf_open(struct gen_perf_context *perf_ctx, int metrics_set_id, int report_format, @@ -968,8 +1438,8 @@ gen_perf_open(struct gen_perf_context *perf_ctx, return true; } -bool -gen_perf_inc_n_users(struct gen_perf_context *perf_ctx) +static bool +inc_n_users(struct gen_perf_context *perf_ctx) { if (perf_ctx->n_oa_users == 0 && gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0) @@ -981,8 +1451,8 @@ gen_perf_inc_n_users(struct gen_perf_context *perf_ctx) return true; } -void -gen_perf_dec_n_users(struct gen_perf_context *perf_ctx) +static void +dec_n_users(struct gen_perf_context *perf_ctx) { /* Disabling the i915 perf stream will effectively disable the OA * counters. 
Note it's important to be sure there are no outstanding @@ -997,6 +1467,17 @@ gen_perf_dec_n_users(struct gen_perf_context *perf_ctx) } } +void +gen_perf_init_metrics(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo, + int drm_fd) +{ + load_pipeline_statistic_metrics(perf_cfg, devinfo); + register_mdapi_statistic_query(perf_cfg, devinfo); + if (load_oa_metrics(perf_cfg, drm_fd, devinfo)) + register_mdapi_oa_query(devinfo, perf_cfg); +} + void gen_perf_init_context(struct gen_perf_context *perf_ctx, struct gen_perf_config *perf_cfg, @@ -1026,7 +1507,7 @@ gen_perf_init_context(struct gen_perf_context *perf_ctx, * Begin an OA query we can always take a reference on a buffer * in this list. */ - struct oa_sample_buf *buf = gen_perf_get_free_sample_buf(perf_ctx); + struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); exec_list_push_head(&perf_ctx->sample_buffers, &buf->link); perf_ctx->oa_stream_fd = -1; @@ -1120,7 +1601,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx, * require a different counter set or format unless we get an opportunity * to close the stream and open a new one... */ - uint64_t metric_id = gen_perf_query_get_metric_id(perf_ctx->perf, queryinfo); + uint64_t metric_id = get_metric_id(perf_ctx->perf, queryinfo); if (perf_ctx->oa_stream_fd != -1 && perf_ctx->current_oa_metrics_set_id != metric_id) { @@ -1198,7 +1679,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx, perf_ctx->current_oa_format == queryinfo->oa_format); } - if (!gen_perf_inc_n_users(perf_ctx)) { + if (!inc_n_users(perf_ctx)) { DBG("WARNING: Error enabling i915 perf stream: %m\n"); return false; } @@ -1254,7 +1735,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx, */ buf->refcount++; - gen_perf_query_result_clear(&query->oa.result); + query_result_clear(&query->oa.result); query->oa.results_accumulated = false; add_to_unaccumulated_query_list(perf_ctx, query); @@ -1273,7 +1754,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx, STATS_BO_SIZE); /* Take starting snapshots. */ - gen_perf_snapshot_statistics_registers(perf_ctx->ctx , perf_cfg, query, 0); + snapshot_statistics_registers(perf_ctx->ctx , perf_cfg, query, 0); ++perf_ctx->n_active_pipeline_stats_queries; break; @@ -1327,8 +1808,8 @@ gen_perf_end_query(struct gen_perf_context *perf_ctx, break; case GEN_PERF_QUERY_TYPE_PIPELINE: - gen_perf_snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query, - STATS_BO_END_OFFSET_BYTES); + snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query, + STATS_BO_END_OFFSET_BYTES); --perf_ctx->n_active_pipeline_stats_queries; break; @@ -1356,7 +1837,7 @@ read_oa_samples_until(struct gen_perf_context *perf_ctx, uint32_t last_timestamp = tail_buf->last_timestamp; while (1) { - struct oa_sample_buf *buf = gen_perf_get_free_sample_buf(perf_ctx); + struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); uint32_t offset; int len; @@ -1529,3 +2010,469 @@ gen_perf_is_query_ready(struct gen_perf_context *perf_ctx, return false; } + +/** + * Remove a query from the global list of unaccumulated queries once + * after successfully accumulating the OA reports associated with the + * query in accumulate_oa_reports() or when discarding unwanted query + * results. 
+ */ +static void +drop_from_unaccumulated_query_list(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) +{ + for (int i = 0; i < perf_ctx->unaccumulated_elements; i++) { + if (perf_ctx->unaccumulated[i] == query) { + int last_elt = --perf_ctx->unaccumulated_elements; + + if (i == last_elt) + perf_ctx->unaccumulated[i] = NULL; + else { + perf_ctx->unaccumulated[i] = + perf_ctx->unaccumulated[last_elt]; + } + + break; + } + } + + /* Drop our samples_head reference so that associated periodic + * sample data buffers can potentially be reaped if they aren't + * referenced by any other queries... + */ + + struct oa_sample_buf *buf = + exec_node_data(struct oa_sample_buf, query->oa.samples_head, link); + + assert(buf->refcount > 0); + buf->refcount--; + + query->oa.samples_head = NULL; + + reap_old_sample_buffers(perf_ctx); +} + +/* In general if we see anything spurious while accumulating results, + * we don't try and continue accumulating the current query, hoping + * for the best, we scrap anything outstanding, and then hope for the + * best with new queries. + */ +static void +discard_all_queries(struct gen_perf_context *perf_ctx) +{ + while (perf_ctx->unaccumulated_elements) { + struct gen_perf_query_object *query = perf_ctx->unaccumulated[0]; + + query->oa.results_accumulated = true; + drop_from_unaccumulated_query_list(perf_ctx, query); + + dec_n_users(perf_ctx); + } +} + +/** + * Accumulate raw OA counter values based on deltas between pairs of + * OA reports. + * + * Accumulation starts from the first report captured via + * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the + * last MI_RPC report requested by brw_end_perf_query(). Between these + * two reports there may also some number of periodically sampled OA + * reports collected via the i915 perf interface - depending on the + * duration of the query. + * + * These periodic snapshots help to ensure we handle counter overflow + * correctly by being frequent enough to ensure we don't miss multiple + * overflows of a counter between snapshots. For Gen8+ the i915 perf + * snapshots provide the extra context-switch reports that let us + * subtract out the progress of counters associated with other + * contexts running on the system. + */ +static void +accumulate_oa_reports(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) +{ + const struct gen_device_info *devinfo = perf_ctx->devinfo; + uint32_t *start; + uint32_t *last; + uint32_t *end; + struct exec_node *first_samples_node; + bool in_ctx = true; + int out_duration = 0; + + assert(query->oa.map != NULL); + + start = last = query->oa.map; + end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; + + if (start[0] != query->oa.begin_report_id) { + DBG("Spurious start report id=%"PRIu32"\n", start[0]); + goto error; + } + if (end[0] != (query->oa.begin_report_id + 1)) { + DBG("Spurious end report id=%"PRIu32"\n", end[0]); + goto error; + } + + /* See if we have any periodic reports to accumulate too... */ + + /* N.B. The oa.samples_head was set when the query began and + * pointed to the tail of the perf_ctx->sample_buffers list at + * the time the query started. Since the buffer existed before the + * first MI_REPORT_PERF_COUNT command was emitted we therefore know + * that no data in this particular node's buffer can possibly be + * associated with the query - so skip ahead one... 
+    */
+   first_samples_node = query->oa.samples_head->next;
+
+   foreach_list_typed_from(struct oa_sample_buf, buf, link,
+                           &perf_ctx->sample_buffers,
+                           first_samples_node)
+   {
+      int offset = 0;
+
+      while (offset < buf->len) {
+         const struct drm_i915_perf_record_header *header =
+            (const struct drm_i915_perf_record_header *)(buf->buf + offset);
+
+         assert(header->size != 0);
+         assert(header->size <= buf->len);
+
+         offset += header->size;
+
+         switch (header->type) {
+         case DRM_I915_PERF_RECORD_SAMPLE: {
+            uint32_t *report = (uint32_t *)(header + 1);
+            bool add = true;
+
+            /* Ignore reports that come before the start marker.
+             * (Note: takes care to allow overflow of 32bit timestamps)
+             */
+            if (gen_device_info_timebase_scale(devinfo,
+                                               report[1] - start[1]) > 5000000000) {
+               continue;
+            }
+
+            /* Ignore reports that come after the end marker.
+             * (Note: takes care to allow overflow of 32bit timestamps)
+             */
+            if (gen_device_info_timebase_scale(devinfo,
+                                               report[1] - end[1]) <= 5000000000) {
+               goto end;
+            }
+
+            /* For Gen8+ since the counters continue while other
+             * contexts are running we need to discount any unrelated
+             * deltas. The hardware automatically generates a report
+             * on context switch which gives us a new reference point
+             * to continue adding deltas from.
+             *
+             * For Haswell we can rely on the HW to stop the progress
+             * of OA counters while any other context is active.
+             */
+            if (devinfo->gen >= 8) {
+               if (in_ctx && report[2] != query->oa.result.hw_id) {
+                  DBG("i915 perf: Switch AWAY (observed by ID change)\n");
+                  in_ctx = false;
+                  out_duration = 0;
+               } else if (in_ctx == false && report[2] == query->oa.result.hw_id) {
+                  DBG("i915 perf: Switch TO\n");
+                  in_ctx = true;
+
+                  /* From experimentation in IGT, we found that the OA unit
+                   * might label some report as "idle" (using an invalid
+                   * context ID), right after a report for a given context.
+                   * Deltas generated by those reports actually belong to the
+                   * previous context, even though they're not labelled as
+                   * such.
+                   *
+                   * We didn't *really* Switch AWAY in the case that we e.g.
+                   * saw a single periodic report while idle...
+                   */
+                  if (out_duration >= 1)
+                     add = false;
+               } else if (in_ctx) {
+                  assert(report[2] == query->oa.result.hw_id);
+                  DBG("i915 perf: Continuation IN\n");
+               } else {
+                  assert(report[2] != query->oa.result.hw_id);
+                  DBG("i915 perf: Continuation OUT\n");
+                  add = false;
+                  out_duration++;
+               }
+            }
+
+            if (add) {
+               query_result_accumulate(&query->oa.result, query->queryinfo,
+                                       last, report);
+            }
+
+            last = report;
+
+            break;
+         }
+
+         case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
+            DBG("i915 perf: OA error: all reports lost\n");
+            goto error;
+         case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
+            DBG("i915 perf: OA report lost\n");
+            break;
+         }
+      }
+   }
+
+end:
+
+   query_result_accumulate(&query->oa.result, query->queryinfo,
+                           last, end);
+
+   query->oa.results_accumulated = true;
+   drop_from_unaccumulated_query_list(perf_ctx, query);
+   dec_n_users(perf_ctx);
+
+   return;
+
+error:
+
+   discard_all_queries(perf_ctx);
+}
+
+void
+gen_perf_delete_query(struct gen_perf_context *perf_ctx,
+                      struct gen_perf_query_object *query)
+{
+   struct gen_perf_config *perf_cfg = perf_ctx->perf;
+
+   /* We can assume that the frontend waits for a query to complete
+    * before ever calling into here, so we don't have to worry about
+    * deleting an in-flight query object.
+ */ + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + if (query->oa.bo) { + if (!query->oa.results_accumulated) { + drop_from_unaccumulated_query_list(perf_ctx, query); + dec_n_users(perf_ctx); + } + + perf_cfg->vtbl.bo_unreference(query->oa.bo); + query->oa.bo = NULL; + } + + query->oa.results_accumulated = false; + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + if (query->pipeline_stats.bo) { + perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo); + query->pipeline_stats.bo = NULL; + } + break; + + default: + unreachable("Unknown query type"); + break; + } + + /* As an indication that the INTEL_performance_query extension is no + * longer in use, it's a good time to free our cache of sample + * buffers and close any current i915-perf stream. + */ + if (--perf_ctx->n_query_instances == 0) { + free_sample_bufs(perf_ctx); + gen_perf_close(perf_ctx, query->queryinfo); + } + + free(query); +} + +#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT) + +static void +read_gt_frequency(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *obj) +{ + const struct gen_device_info *devinfo = perf_ctx->devinfo; + uint32_t start = *((uint32_t *)(obj->oa.map + MI_FREQ_START_OFFSET_BYTES)), + end = *((uint32_t *)(obj->oa.map + MI_FREQ_END_OFFSET_BYTES)); + + switch (devinfo->gen) { + case 7: + case 8: + obj->oa.gt_frequency[0] = GET_FIELD(start, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL; + obj->oa.gt_frequency[1] = GET_FIELD(end, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL; + break; + case 9: + case 10: + case 11: + obj->oa.gt_frequency[0] = GET_FIELD(start, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; + obj->oa.gt_frequency[1] = GET_FIELD(end, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; + break; + default: + unreachable("unexpected gen"); + } + + /* Put the numbers into Hz. */ + obj->oa.gt_frequency[0] *= 1000000ULL; + obj->oa.gt_frequency[1] *= 1000000ULL; +} + +static int +get_oa_counter_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + size_t data_size, + uint8_t *data) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + const struct gen_perf_query_info *queryinfo = query->queryinfo; + int n_counters = queryinfo->n_counters; + int written = 0; + + for (int i = 0; i < n_counters; i++) { + const struct gen_perf_query_counter *counter = &queryinfo->counters[i]; + uint64_t *out_uint64; + float *out_float; + size_t counter_size = gen_perf_query_counter_get_size(counter); + + if (counter_size) { + switch (counter->data_type) { + case GEN_PERF_COUNTER_DATA_TYPE_UINT64: + out_uint64 = (uint64_t *)(data + counter->offset); + *out_uint64 = + counter->oa_counter_read_uint64(perf_cfg, queryinfo, + query->oa.result.accumulator); + break; + case GEN_PERF_COUNTER_DATA_TYPE_FLOAT: + out_float = (float *)(data + counter->offset); + *out_float = + counter->oa_counter_read_float(perf_cfg, queryinfo, + query->oa.result.accumulator); + break; + default: + /* So far we aren't using uint32, double or bool32... 
*/ + unreachable("unexpected counter data type"); + } + written = counter->offset + counter_size; + } + } + + return written; +} + +static int +get_pipeline_stats_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + size_t data_size, + uint8_t *data) + +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + const struct gen_perf_query_info *queryinfo = query->queryinfo; + int n_counters = queryinfo->n_counters; + uint8_t *p = data; + + uint64_t *start = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->pipeline_stats.bo, MAP_READ); + uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t)); + + for (int i = 0; i < n_counters; i++) { + const struct gen_perf_query_counter *counter = &queryinfo->counters[i]; + uint64_t value = end[i] - start[i]; + + if (counter->pipeline_stat.numerator != + counter->pipeline_stat.denominator) { + value *= counter->pipeline_stat.numerator; + value /= counter->pipeline_stat.denominator; + } + + *((uint64_t *)p) = value; + p += 8; + } + + perf_cfg->vtbl.bo_unmap(query->pipeline_stats.bo); + + return p - data; +} + +void +gen_perf_get_query_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + int data_size, + unsigned *data, + unsigned *bytes_written) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + int written = 0; + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + if (!query->oa.results_accumulated) { + read_gt_frequency(perf_ctx, query); + uint32_t *begin_report = query->oa.map; + uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; + query_result_read_frequencies(&query->oa.result, + perf_ctx->devinfo, + begin_report, + end_report); + accumulate_oa_reports(perf_ctx, query); + assert(query->oa.results_accumulated); + + perf_cfg->vtbl.bo_unmap(query->oa.bo); + query->oa.map = NULL; + } + if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA) { + written = get_oa_counter_data(perf_ctx, query, data_size, (uint8_t *)data); + } else { + const struct gen_device_info *devinfo = perf_ctx->devinfo; + + written = gen_perf_query_result_write_mdapi((uint8_t *)data, data_size, + devinfo, &query->oa.result, + query->oa.gt_frequency[0], + query->oa.gt_frequency[1]); + } + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + written = get_pipeline_stats_data(perf_ctx, query, data_size, (uint8_t *)data); + break; + + default: + unreachable("Unknown query type"); + break; + } + + if (bytes_written) + *bytes_written = written; +} + +void +gen_perf_dump_query_count(struct gen_perf_context *perf_ctx) +{ + DBG("Queries: (Open queries = %d, OA users = %d)\n", + perf_ctx->n_active_oa_queries, perf_ctx->n_oa_users); +} + +void +gen_perf_dump_query(struct gen_perf_context *ctx, + struct gen_perf_query_object *obj, + void *current_batch) +{ + switch (obj->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + DBG("BO: %-4s OA data: %-10s %-15s\n", + obj->oa.bo ? "yes," : "no,", + gen_perf_is_query_ready(ctx, obj, current_batch) ? "ready," : "not ready,", + obj->oa.results_accumulated ? "accumulated" : "not accumulated"); + break; + case GEN_PERF_QUERY_TYPE_PIPELINE: + DBG("BO: %-4s\n", + obj->pipeline_stats.bo ? "yes" : "no"); + break; + default: + unreachable("Unknown query type"); + break; + } +}
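
The reference-counting scheme described in the oa_sample_buf comment above can be boiled down to a few lines. The following is a minimal, self-contained sketch (plain singly-linked list with malloc/free) rather than the exec_list-based implementation in this patch: a query pins the tail buffer when it begins, and reaping walks from the head and stops at the first buffer that still carries a reference.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-in for struct oa_sample_buf: a non-zero refcount pins this
 * node and every node after it until the count drops back to zero. */
struct buf {
   struct buf *next;
   int refcount;
   int id;
};

static struct buf *head, *tail;

static struct buf *
append_buf(int id)
{
   struct buf *b = calloc(1, sizeof(*b));
   b->id = id;
   if (tail)
      tail->next = b;
   else
      head = b;
   tail = b;
   return b;
}

/* Query begin: pin the current tail; no already-buffered samples can
 * possibly relate to a query that starts now. */
static struct buf *
query_begin(void)
{
   tail->refcount++;
   return tail;
}

/* Query end: drop the pin and reap unreferenced buffers from the head,
 * always keeping the tail so the list is never empty. */
static void
query_end(struct buf *samples_head)
{
   assert(samples_head->refcount > 0);
   samples_head->refcount--;

   while (head != tail && head->refcount == 0) {
      struct buf *reaped = head;
      head = head->next;
      printf("reaped buffer %d\n", reaped->id);
      free(reaped);   /* the patch recycles onto free_sample_buffers instead */
   }
}

int
main(void)
{
   append_buf(0);                    /* list is never empty */
   struct buf *a = query_begin();    /* query A pins buffer 0 */
   append_buf(1);
   append_buf(2);
   struct buf *b = query_begin();    /* query B pins buffer 2 */
   query_end(b);                     /* nothing reaped: A still pins buffer 0 */
   query_end(a);                     /* buffers 0 and 1 reaped */
   return 0;
}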
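
Taken together, the entry points added or kept public here (gen_perf_new_context(), gen_perf_init_metrics(), gen_perf_new_query(), gen_perf_get_query_data(), gen_perf_delete_query(), ...) give a driver a query lifecycle roughly like the sketch below. This is illustrative only: the header path, the tail of the gen_perf_init_context() argument list and the begin/end call shapes are assumptions inferred from the names in this diff, not copied from the i965/iris code.

/* Illustrative sketch only; assumes perf/gen_perf.h and a perf_cfg whose
 * vtbl the driver has already filled with its buffer/batch callbacks. */
#include "perf/gen_perf.h"

static void
run_one_query(struct gen_perf_config *perf_cfg,
              void *driver_ctx, void *bufmgr,
              const struct gen_device_info *devinfo,
              uint32_t hw_ctx, int drm_fd, unsigned query_index)
{
   struct gen_perf_context *perf_ctx = gen_perf_new_context(driver_ctx);

   /* Registers the pipeline-statistics, MDAPI and OA metric sets. */
   gen_perf_init_metrics(perf_cfg, devinfo, drm_fd);

   /* Argument order after perf_cfg is assumed from the context fields. */
   gen_perf_init_context(perf_ctx, perf_cfg, driver_ctx, bufmgr, devinfo,
                         hw_ctx, drm_fd);

   struct gen_perf_query_object *q = gen_perf_new_query(perf_ctx, query_index);

   if (gen_perf_begin_query(perf_ctx, q)) {      /* assumed (ctx, query) */
      /* ... emit the workload being measured ... */
      gen_perf_end_query(perf_ctx, q);           /* assumed (ctx, query) */

      /* Once gen_perf_is_query_ready() reports the snapshots have landed: */
      uint64_t results[64] = { 0 };
      unsigned written = 0;
      gen_perf_get_query_data(perf_ctx, q, sizeof(results),
                              (unsigned *)results, &written);
   }

   gen_perf_delete_query(perf_ctx, q);
}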
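
For pipeline statistics queries, get_pipeline_stats_data() reads the two MI_STORE_REGISTER_MEM snapshots, takes end minus start and applies the numerator/denominator recorded by add_stat_reg() (for example the 1/4 ratio used for PS_INVOCATION_COUNT on Haswell and Gen8). A small self-contained sketch of that arithmetic, with made-up snapshot values:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the scaling fields that add_stat_reg() records per counter. */
struct stat_counter {
   const char *name;
   uint32_t numerator;
   uint32_t denominator;
};

/* start/end are the register snapshots taken around the query; this mirrors
 * the delta + scale step in get_pipeline_stats_data(). */
static uint64_t
stat_counter_value(const struct stat_counter *c, uint64_t start, uint64_t end)
{
   uint64_t value = end - start;

   if (c->numerator != c->denominator) {
      value *= c->numerator;
      value /= c->denominator;
   }
   return value;
}

int
main(void)
{
   /* PS_INVOCATION_COUNT is registered with a 1/4 ratio on Haswell/Gen8. */
   struct stat_counter ps = { "N fragment shader invocations", 1, 4 };
   /* Most counters go through add_basic_stat_reg() with a 1/1 ratio. */
   struct stat_counter vs = { "N vertex shader invocations", 1, 1 };

   /* Hypothetical snapshot values. */
   printf("%s: %llu\n", ps.name,
          (unsigned long long)stat_counter_value(&ps, 1000, 9000)); /* 2000 */
   printf("%s: %llu\n", vs.name,
          (unsigned long long)stat_counter_value(&vs, 500, 1500));  /* 1000 */
   return 0;
}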
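
register_mdapi_oa_query() builds its counter tables by pointing each counter at a field of the gen7/8/9_mdapi_metrics structs through offsetof()/sizeof() (the MDAPI_QUERY_ADD_* macros wrapping fill_mdapi_perf_query_counter()). The self-contained sketch below shows the same pattern on a made-up metrics struct, so each counter's offset and size, and the query's data_size, always stay in sync with the struct layout:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Made-up stand-in for the gen*_mdapi_metrics structs. */
struct example_metrics {
   uint64_t total_time;
   uint64_t gpu_ticks;
   uint64_t a_counters[4];
};

struct counter_desc {
   const char *name;
   size_t offset;    /* byte offset of the value in the result buffer */
   size_t size;      /* size of the value */
};

/* Same idea as the MDAPI_QUERY_ADD_* macros: the counter table is derived
 * from the struct layout itself rather than maintained by hand. */
#define DESCRIBE(field) \
   { #field, offsetof(struct example_metrics, field), \
     sizeof(((struct example_metrics *)0)->field) }

int
main(void)
{
   const struct counter_desc descs[] = {
      DESCRIBE(total_time),
      DESCRIBE(gpu_ticks),
      DESCRIBE(a_counters),
   };

   for (size_t i = 0; i < sizeof(descs) / sizeof(descs[0]); i++)
      printf("%-12s offset=%zu size=%zu\n",
             descs[i].name, descs[i].offset, descs[i].size);

   printf("data_size=%zu\n", sizeof(struct example_metrics));
   return 0;
}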
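
accumulate_oa_reports() decides whether a periodic report falls between the begin and end MI_RPC reports by subtracting raw 32-bit OA timestamps and scaling the delta to nanoseconds, relying on unsigned wraparound so that a report taken before the start marker shows up as a huge positive delta. A self-contained sketch of that check follows; the 12.5 MHz timebase and the timebase_scale() helper are stand-ins for gen_device_info_timebase_scale(), while the 5 second bound is the one used in the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical OA timestamp frequency; the real value comes from
 * gen_device_info and varies by generation. */
#define OA_TIMESTAMP_HZ 12500000ull

/* Rough stand-in for gen_device_info_timebase_scale(): raw ticks -> ns. */
static uint64_t
timebase_scale(uint32_t ticks)
{
   return (uint64_t)ticks * 1000000000ull / OA_TIMESTAMP_HZ;
}

/* Mirrors the two checks in accumulate_oa_reports(): a report belongs to the
 * query if it is no more than ~5 seconds after the start report, and appears
 * more than ~5 seconds "after" the end report in wrapped 32-bit arithmetic
 * (i.e. it is actually before the end report). */
static bool
report_in_window(uint32_t ts, uint32_t start_ts, uint32_t end_ts)
{
   if (timebase_scale(ts - start_ts) > 5000000000ull)
      return false;   /* before the start marker (wraps to a huge delta) */
   if (timebase_scale(ts - end_ts) <= 5000000000ull)
      return false;   /* at or after the end marker */
   return true;
}

int
main(void)
{
   uint32_t start = 0xfffff000u;        /* close to the 32-bit rollover */
   uint32_t end = start + 2000000u;     /* wraps past zero */

   printf("%d\n", report_in_window(start + 1000000u, start, end)); /* 1 */
   printf("%d\n", report_in_window(start - 1000u, start, end));    /* 0 */
   printf("%d\n", report_in_window(end + 1000u, start, end));      /* 0 */
   return 0;
}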
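
read_gt_frequency() decodes the RPSTAT snapshot stored in the OA buffer alongside the reports, using the INTEL_MASK()/GET_FIELD() helpers and the shift/mask constants defined at the top of this patch: Gen7/8 encode the current GT frequency in 50 MHz units, Gen9+ in 50/3 MHz units. The sketch below reuses those definitions on hypothetical register values:

#include <stdint.h>
#include <stdio.h>

#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low))

#define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT  7
#define GEN7_RPSTAT1_CURR_GT_FREQ_MASK   INTEL_MASK(13, 7)
#define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT  23
#define GEN9_RPSTAT0_CURR_GT_FREQ_MASK   INTEL_MASK(31, 23)

#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT)

/* Same unit conversion as read_gt_frequency(): gen7/8 report in 50 MHz
 * units, gen9+ in 50/3 MHz units; the result is then put into Hz. */
static uint64_t
gt_frequency_hz(int gen, uint32_t rpstat)
{
   uint64_t mhz;

   if (gen < 9)
      mhz = GET_FIELD(rpstat, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ull;
   else
      mhz = GET_FIELD(rpstat, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ull / 3ull;

   return mhz * 1000000ull;
}

int
main(void)
{
   /* Hypothetical register snapshots. */
   uint32_t gen8_rpstat = 24u << GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT;  /* 1.2 GHz */
   uint32_t gen9_rpstat = 66u << GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT;  /* 1.1 GHz */

   printf("gen8: %llu Hz\n", (unsigned long long)gt_frequency_hz(8, gen8_rpstat));
   printf("gen9: %llu Hz\n", (unsigned long long)gt_frequency_hz(9, gen9_rpstat));
   return 0;
}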