src/intel/perf/gen_perf.h

   1 /*
   2  * Copyright © 2018 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #ifndef GEN_PERF_H
  25 #define GEN_PERF_H
  26
  27 #include <stdio.h>
  28 #include <stdint.h>
  29 #include <string.h>
  30
  31 #include <sys/sysmacros.h>
  32
  33 #include "util/hash_table.h"
  34 #include "compiler/glsl/list.h"
  35 #include "util/ralloc.h"
  36
  37 struct gen_device_info;
  38
  39 struct gen_perf_config;
  40 struct gen_perf_query_info;
  41
  42 #define GEN7_RPSTAT1                       0xA01C
  43 #define  GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT   7
  44 #define  GEN7_RPSTAT1_CURR_GT_FREQ_MASK    INTEL_MASK(13, 7)
  45 #define  GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT   0
  46 #define  GEN7_RPSTAT1_PREV_GT_FREQ_MASK    INTEL_MASK(6, 0)
  47
  48 #define GEN9_RPSTAT0                       0xA01C
  49 #define  GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT   23
  50 #define  GEN9_RPSTAT0_CURR_GT_FREQ_MASK    INTEL_MASK(31, 23)
  51 #define  GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT   0
  52 #define  GEN9_RPSTAT0_PREV_GT_FREQ_MASK    INTEL_MASK(8, 0)
  53
  54 enum gen_perf_counter_type {
  55    GEN_PERF_COUNTER_TYPE_EVENT,
  56    GEN_PERF_COUNTER_TYPE_DURATION_NORM,
  57    GEN_PERF_COUNTER_TYPE_DURATION_RAW,
  58    GEN_PERF_COUNTER_TYPE_THROUGHPUT,
  59    GEN_PERF_COUNTER_TYPE_RAW,
  60    GEN_PERF_COUNTER_TYPE_TIMESTAMP,
  61 };
  62
  63 enum gen_perf_counter_data_type {
  64    GEN_PERF_COUNTER_DATA_TYPE_BOOL32,
  65    GEN_PERF_COUNTER_DATA_TYPE_UINT32,
  66    GEN_PERF_COUNTER_DATA_TYPE_UINT64,
  67    GEN_PERF_COUNTER_DATA_TYPE_FLOAT,
  68    GEN_PERF_COUNTER_DATA_TYPE_DOUBLE,
  69 };
  70
  71 struct gen_pipeline_stat {
  72    uint32_t reg;
  73    uint32_t numerator;
  74    uint32_t denominator;
  75 };
  76
  77 /*
  78  * The largest OA formats we can use include:
  79  * For Haswell:
  80  *   1 timestamp, 45 A counters, 8 B counters and 8 C counters.
  81  * For Gen8+
  82  *   1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
  83  */
  84 #define MAX_OA_REPORT_COUNTERS 62
  85
  86 #define IA_VERTICES_COUNT          0x2310
  87 #define IA_PRIMITIVES_COUNT        0x2318
  88 #define VS_INVOCATION_COUNT        0x2320
  89 #define HS_INVOCATION_COUNT        0x2300
  90 #define DS_INVOCATION_COUNT        0x2308
  91 #define GS_INVOCATION_COUNT        0x2328
  92 #define GS_PRIMITIVES_COUNT        0x2330
  93 #define CL_INVOCATION_COUNT        0x2338
  94 #define CL_PRIMITIVES_COUNT        0x2340
  95 #define PS_INVOCATION_COUNT        0x2348
  96 #define CS_INVOCATION_COUNT        0x2290
  97 #define PS_DEPTH_COUNT             0x2350
  98
  99 /*
 100  * When currently allocate only one page for pipeline statistics queries. Here
 101  * we derived the maximum number of counters for that amount.
 102  */
 103 #define STATS_BO_SIZE               4096
 104 #define STATS_BO_END_OFFSET_BYTES   (STATS_BO_SIZE / 2)
 105 #define MAX_STAT_COUNTERS           (STATS_BO_END_OFFSET_BYTES / 8)
 106
 107 #define I915_PERF_OA_SAMPLE_SIZE (8 +   /* drm_i915_perf_record_header */ \
 108                                   256)  /* OA counter report */
 109
 110 struct gen_perf_query_result {
 111    /**
 112     * Storage for the final accumulated OA counters.
 113     */
 114    uint64_t accumulator[MAX_OA_REPORT_COUNTERS];
 115
 116    /**
 117     * Hw ID used by the context on which the query was running.
 118     */
 119    uint32_t hw_id;
 120
 121    /**
 122     * Number of reports accumulated to produce the results.
 123     */
 124    uint32_t reports_accumulated;
 125
 126    /**
 127     * Frequency in the slices of the GT at the begin and end of the
 128     * query.
 129     */
 130    uint64_t slice_frequency[2];
 131
 132    /**
 133     * Frequency in the unslice of the GT at the begin and end of the
 134     * query.
 135     */
 136    uint64_t unslice_frequency[2];
 137 };
 138
 139 struct gen_perf_query_counter {
 140    const char *name;
 141    const char *desc;
 142    enum gen_perf_counter_type type;
 143    enum gen_perf_counter_data_type data_type;
 144    uint64_t raw_max;
 145    size_t offset;
 146
 147    union {
 148       uint64_t (*oa_counter_read_uint64)(struct gen_perf_config *perf,
 149                                          const struct gen_perf_query_info *query,
 150                                          const uint64_t *accumulator);
 151       float (*oa_counter_read_float)(struct gen_perf_config *perf,
 152                                      const struct gen_perf_query_info *query,
 153                                      const uint64_t *accumulator);
 154       struct gen_pipeline_stat pipeline_stat;
 155    };
 156 };
 157
 158 struct gen_perf_query_register_prog {
 159    uint32_t reg;
 160    uint32_t val;
 161 };
 162
 163 struct gen_perf_query_info {
 164    enum gen_perf_query_type {
 165       GEN_PERF_QUERY_TYPE_OA,
 166       GEN_PERF_QUERY_TYPE_RAW,
 167       GEN_PERF_QUERY_TYPE_PIPELINE,
 168    } kind;
 169    const char *name;
 170    const char *guid;
 171    struct gen_perf_query_counter *counters;
 172    int n_counters;
 173    int max_counters;
 174    size_t data_size;
 175
 176    /* OA specific */
 177    uint64_t oa_metrics_set_id;
 178    int oa_format;
 179
 180    /* For indexing into the accumulator[] ... */
 181    int gpu_time_offset;
 182    int gpu_clock_offset;
 183    int a_offset;
 184    int b_offset;
 185    int c_offset;
 186
 187    /* Register programming for a given query */
 188    struct gen_perf_query_register_prog *flex_regs;
 189    uint32_t n_flex_regs;
 190
 191    struct gen_perf_query_register_prog *mux_regs;
 192    uint32_t n_mux_regs;
 193
 194    struct gen_perf_query_register_prog *b_counter_regs;
 195    uint32_t n_b_counter_regs;
 196 };
 197
 198 struct gen_perf_config {
 199    struct gen_perf_query_info *queries;
 200    int n_queries;
 201
 202    /* Variables referenced in the XML meta data for OA performance
 203     * counters, e.g in the normalization equations.
 204     *
 205     * All uint64_t for consistent operand types in generated code
 206     */
 207    struct {
 208       uint64_t timestamp_frequency; /** $GpuTimestampFrequency */
 209       uint64_t n_eus;               /** $EuCoresTotalCount */
 210       uint64_t n_eu_slices;         /** $EuSlicesTotalCount */
 211       uint64_t n_eu_sub_slices;     /** $EuSubslicesTotalCount */
 212       uint64_t eu_threads_count;    /** $EuThreadsCount */
 213       uint64_t slice_mask;          /** $SliceMask */
 214       uint64_t subslice_mask;       /** $SubsliceMask */
 215       uint64_t gt_min_freq;         /** $GpuMinFrequency */
 216       uint64_t gt_max_freq;         /** $GpuMaxFrequency */
 217       uint64_t revision;            /** $SkuRevisionId */
 218    } sys_vars;
 219
 220    /* OA metric sets, indexed by GUID, as know by Mesa at build time, to
 221     * cross-reference with the GUIDs of configs advertised by the kernel at
 222     * runtime
 223     */
 224    struct hash_table *oa_metrics_table;
 225
 226    /* Location of the device's sysfs entry. */
 227    char sysfs_dev_dir[256];
 228
 229    struct {
 230       void *(*bo_alloc)(void *bufmgr, const char *name, uint64_t size);
 231       void (*bo_unreference)(void *bo);
 232       void *(*bo_map)(void *ctx, void *bo, unsigned flags);
 233       void (*bo_unmap)(void *bo);
 234       bool (*batch_references)(void *batch, void *bo);
 235       void (*bo_wait_rendering)(void *bo);
 236       int (*bo_busy)(void *bo);
 237       void (*emit_mi_flush)(void *ctx);
 238       void (*emit_mi_report_perf_count)(void *ctx,
 239                                         void *bo,
 240                                         uint32_t offset_in_bytes,
 241                                         uint32_t report_id);
 242       void (*batchbuffer_flush)(void *ctx,
 243                                 const char *file, int line);
 244       void (*capture_frequency_stat_register)(void *ctx, void *bo,
 245                                               uint32_t bo_offset);
 246       void (*store_register_mem64)(void *ctx, void *bo, uint32_t reg, uint32_t offset);
 247
 248    } vtbl;
 249 };
 250
 251
 252 /**
 253  * gen representation of a performance query object.
 254  *
 255  * NB: We want to keep this structure relatively lean considering that
 256  * applications may expect to allocate enough objects to be able to
 257  * query around all draw calls in a frame.
 258  */
 259 struct gen_perf_query_object
 260 {
 261    const struct gen_perf_query_info *queryinfo;
 262
 263    /* See query->kind to know which state below is in use... */
 264    union {
 265       struct {
 266
 267          /**
 268           * BO containing OA counter snapshots at query Begin/End time.
 269           */
 270          void *bo;
 271
 272          /**
 273           * Address of mapped of @bo
 274           */
 275          void *map;
 276
 277          /**
 278           * The MI_REPORT_PERF_COUNT command lets us specify a unique
 279           * ID that will be reflected in the resulting OA report
 280           * that's written by the GPU. This is the ID we're expecting
 281           * in the begin report and the the end report should be
 282           * @begin_report_id + 1.
 283           */
 284          int begin_report_id;
 285
 286          /**
 287           * Reference the head of the brw->perfquery.sample_buffers
 288           * list at the time that the query started (so we only need
 289           * to look at nodes after this point when looking for samples
 290           * related to this query)
 291           *
 292           * (See struct brw_oa_sample_buf description for more details)
 293           */
 294          struct exec_node *samples_head;
 295
 296          /**
 297           * false while in the unaccumulated_elements list, and set to
 298           * true when the final, end MI_RPC snapshot has been
 299           * accumulated.
 300           */
 301          bool results_accumulated;
 302
 303          /**
 304           * Frequency of the GT at begin and end of the query.
 305           */
 306          uint64_t gt_frequency[2];
 307
 308          /**
 309           * Accumulated OA results between begin and end of the query.
 310           */
 311          struct gen_perf_query_result result;
 312       } oa;
 313
 314       struct {
 315          /**
 316           * BO containing starting and ending snapshots for the
 317           * statistics counters.
 318           */
 319          void *bo;
 320       } pipeline_stats;
 321    };
 322 };
 323
 324 struct gen_perf_context {
 325    struct gen_perf_config *perf;
 326
 327    void * ctx;  /* driver context (eg, brw_context) */
 328    void * bufmgr;
 329    const struct gen_device_info *devinfo;
 330
 331    uint32_t hw_ctx;
 332    int drm_fd;
 333
 334    /* The i915 perf stream we open to setup + enable the OA counters */
 335    int oa_stream_fd;
 336
 337    /* An i915 perf stream fd gives exclusive access to the OA unit that will
 338     * report counter snapshots for a specific counter set/profile in a
 339     * specific layout/format so we can only start OA queries that are
 340     * compatible with the currently open fd...
 341     */
 342    int current_oa_metrics_set_id;
 343    int current_oa_format;
 344
 345    /* List of buffers containing OA reports */
 346    struct exec_list sample_buffers;
 347
 348    /* Cached list of empty sample buffers */
 349    struct exec_list free_sample_buffers;
 350
 351    int n_active_oa_queries;
 352    int n_active_pipeline_stats_queries;
 353
 354    /* The number of queries depending on running OA counters which
 355     * extends beyond brw_end_perf_query() since we need to wait until
 356     * the last MI_RPC command has parsed by the GPU.
 357     *
 358     * Accurate accounting is important here as emitting an
 359     * MI_REPORT_PERF_COUNT command while the OA unit is disabled will
 360     * effectively hang the gpu.
 361     */
 362    int n_oa_users;
 363
 364    /* To help catch an spurious problem with the hardware or perf
 365     * forwarding samples, we emit each MI_REPORT_PERF_COUNT command
 366     * with a unique ID that we can explicitly check for...
 367     */
 368    int next_query_start_report_id;
 369
 370    /**
 371     * An array of queries whose results haven't yet been assembled
 372     * based on the data in buffer objects.
 373     *
 374     * These may be active, or have already ended.  However, the
 375     * results have not been requested.
 376     */
 377    struct gen_perf_query_object **unaccumulated;
 378    int unaccumulated_elements;
 379    int unaccumulated_array_size;
 380
 381    /* The total number of query objects so we can relinquish
 382     * our exclusive access to perf if the application deletes
 383     * all of its objects. (NB: We only disable perf while
 384     * there are no active queries)
 385     */
 386    int n_query_instances;
 387 };
 388
 389 void gen_perf_init_metrics(struct gen_perf_config *perf_cfg,
 390                            const struct gen_device_info *devinfo,
 391                            int drm_fd);
 392 void gen_perf_init_context(struct gen_perf_context *perf_ctx,
 393                            struct gen_perf_config *perf_cfg,
 394                            void * ctx,  /* driver context (eg, brw_context) */
 395                            void * bufmgr,  /* eg brw_bufmgr */
 396                            const struct gen_device_info *devinfo,
 397                            uint32_t hw_ctx,
 398                            int drm_fd);
 399
 400 static inline size_t
 401 gen_perf_query_counter_get_size(const struct gen_perf_query_counter *counter)
 402 {
 403    switch (counter->data_type) {
 404    case GEN_PERF_COUNTER_DATA_TYPE_BOOL32:
 405       return sizeof(uint32_t);
 406    case GEN_PERF_COUNTER_DATA_TYPE_UINT32:
 407       return sizeof(uint32_t);
 408    case GEN_PERF_COUNTER_DATA_TYPE_UINT64:
 409       return sizeof(uint64_t);
 410    case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
 411       return sizeof(float);
 412    case GEN_PERF_COUNTER_DATA_TYPE_DOUBLE:
 413       return sizeof(double);
 414    default:
 415       unreachable("invalid counter data type");
 416    }
 417 }
 418
 419 static inline struct gen_perf_config *
 420 gen_perf_new(void *ctx)
 421 {
 422    struct gen_perf_config *perf = rzalloc(ctx, struct gen_perf_config);
 423    return perf;
 424 }
 425
 426 struct gen_perf_query_object *
 427 gen_perf_new_query(struct gen_perf_context *, unsigned query_index);
 428
 429
 430 bool gen_perf_begin_query(struct gen_perf_context *perf_ctx,
 431                           struct gen_perf_query_object *query);
 432 void gen_perf_end_query(struct gen_perf_context *perf_ctx,
 433                         struct gen_perf_query_object *query);
 434 void gen_perf_wait_query(struct gen_perf_context *perf_ctx,
 435                          struct gen_perf_query_object *query,
 436                          void *current_batch);
 437 bool gen_perf_is_query_ready(struct gen_perf_context *perf_ctx,
 438                              struct gen_perf_query_object *query,
 439                              void *current_batch);
 440 void gen_perf_delete_query(struct gen_perf_context *perf_ctx,
 441                            struct gen_perf_query_object *query);
 442 void gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
 443                              struct gen_perf_query_object *query,
 444                              int data_size,
 445                              unsigned *data,
 446                              unsigned *bytes_written);
 447
 448 #endif /* GEN_PERF_H */