src/intel/perf/gen_perf.c

   1 /*
   2  * Copyright © 2018 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <dirent.h>
  25
  26 #include <sys/types.h>
  27 #include <sys/stat.h>
  28 #include <fcntl.h>
  29 #include <unistd.h>
  30 #include <errno.h>
  31
  32 #include <drm-uapi/i915_drm.h>
  33
  34 #include "common/gen_gem.h"
  35 #include "gen_perf.h"
  36 #include "perf/gen_perf_mdapi.h"
  37 #include "perf/gen_perf_metrics.h"
  38
  39 #include "dev/gen_debug.h"
  40 #include "dev/gen_device_info.h"
  41 #include "util/bitscan.h"
  42 #include "util/u_math.h"
  43
  44 #define FILE_DEBUG_FLAG DEBUG_PERFMON
  45 #define MI_RPC_BO_SIZE              4096
  46 #define MI_FREQ_START_OFFSET_BYTES  (3072)
  47
  48 #define MAP_READ  (1 << 0)
  49 #define MAP_WRITE (1 << 1)
  50
  51 static bool
  52 get_sysfs_dev_dir(struct gen_perf_config *perf, int fd)
  53 {
  54    struct stat sb;
  55    int min, maj;
  56    DIR *drmdir;
  57    struct dirent *drm_entry;
  58    int len;
  59
  60    perf->sysfs_dev_dir[0] = '\0';
  61
  62    if (fstat(fd, &sb)) {
  63       DBG("Failed to stat DRM fd\n");
  64       return false;
  65    }
  66
  67    maj = major(sb.st_rdev);
  68    min = minor(sb.st_rdev);
  69
  70    if (!S_ISCHR(sb.st_mode)) {
  71       DBG("DRM fd is not a character device as expected\n");
  72       return false;
  73    }
  74
  75    len = snprintf(perf->sysfs_dev_dir,
  76                   sizeof(perf->sysfs_dev_dir),
  77                   "/sys/dev/char/%d:%d/device/drm", maj, min);
  78    if (len < 0 || len >= sizeof(perf->sysfs_dev_dir)) {
  79       DBG("Failed to concatenate sysfs path to drm device\n");
  80       return false;
  81    }
  82
  83    drmdir = opendir(perf->sysfs_dev_dir);
  84    if (!drmdir) {
  85       DBG("Failed to open %s: %m\n", perf->sysfs_dev_dir);
  86       return false;
  87    }
  88
  89    while ((drm_entry = readdir(drmdir))) {
  90       if ((drm_entry->d_type == DT_DIR ||
  91            drm_entry->d_type == DT_LNK) &&
  92           strncmp(drm_entry->d_name, "card", 4) == 0)
  93       {
  94          len = snprintf(perf->sysfs_dev_dir,
  95                         sizeof(perf->sysfs_dev_dir),
  96                         "/sys/dev/char/%d:%d/device/drm/%s",
  97                         maj, min, drm_entry->d_name);
  98          closedir(drmdir);
  99          if (len < 0 || len >= sizeof(perf->sysfs_dev_dir))
 100             return false;
 101          else
 102             return true;
 103       }
 104    }
 105
 106    closedir(drmdir);
 107
 108    DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
 109        maj, min);
 110
 111    return false;
 112 }
 113
 114 static bool
 115 read_file_uint64(const char *file, uint64_t *val)
 116 {
 117     char buf[32];
 118     int fd, n;
 119
 120     fd = open(file, 0);
 121     if (fd < 0)
 122        return false;
 123     while ((n = read(fd, buf, sizeof (buf) - 1)) < 0 &&
 124            errno == EINTR);
 125     close(fd);
 126     if (n < 0)
 127        return false;
 128
 129     buf[n] = '\0';
 130     *val = strtoull(buf, NULL, 0);
 131
 132     return true;
 133 }
 134
 135 static bool
 136 read_sysfs_drm_device_file_uint64(struct gen_perf_config *perf,
 137                                   const char *file,
 138                                   uint64_t *value)
 139 {
 140    char buf[512];
 141    int len;
 142
 143    len = snprintf(buf, sizeof(buf), "%s/%s", perf->sysfs_dev_dir, file);
 144    if (len < 0 || len >= sizeof(buf)) {
 145       DBG("Failed to concatenate sys filename to read u64 from\n");
 146       return false;
 147    }
 148
 149    return read_file_uint64(buf, value);
 150 }
 151
 152 static void
 153 register_oa_config(struct gen_perf_config *perf,
 154                    const struct gen_perf_query_info *query,
 155                    uint64_t config_id)
 156 {
 157    struct gen_perf_query_info *registred_query =
 158       gen_perf_query_append_query_info(perf, 0);
 159
 160    *registred_query = *query;
 161    registred_query->oa_metrics_set_id = config_id;
 162    DBG("metric set registred: id = %" PRIu64", guid = %s\n",
 163        registred_query->oa_metrics_set_id, query->guid);
 164 }
 165
 166 static void
 167 enumerate_sysfs_metrics(struct gen_perf_config *perf)
 168 {
 169    DIR *metricsdir = NULL;
 170    struct dirent *metric_entry;
 171    char buf[256];
 172    int len;
 173
 174    len = snprintf(buf, sizeof(buf), "%s/metrics", perf->sysfs_dev_dir);
 175    if (len < 0 || len >= sizeof(buf)) {
 176       DBG("Failed to concatenate path to sysfs metrics/ directory\n");
 177       return;
 178    }
 179
 180    metricsdir = opendir(buf);
 181    if (!metricsdir) {
 182       DBG("Failed to open %s: %m\n", buf);
 183       return;
 184    }
 185
 186    while ((metric_entry = readdir(metricsdir))) {
 187       struct hash_entry *entry;
 188
 189       if ((metric_entry->d_type != DT_DIR &&
 190            metric_entry->d_type != DT_LNK) ||
 191           metric_entry->d_name[0] == '.')
 192          continue;
 193
 194       DBG("metric set: %s\n", metric_entry->d_name);
 195       entry = _mesa_hash_table_search(perf->oa_metrics_table,
 196                                       metric_entry->d_name);
 197       if (entry) {
 198          uint64_t id;
 199
 200          len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id",
 201                         perf->sysfs_dev_dir, metric_entry->d_name);
 202          if (len < 0 || len >= sizeof(buf)) {
 203             DBG("Failed to concatenate path to sysfs metric id file\n");
 204             continue;
 205          }
 206
 207          if (!read_file_uint64(buf, &id)) {
 208             DBG("Failed to read metric set id from %s: %m", buf);
 209             continue;
 210          }
 211
 212          register_oa_config(perf, (const struct gen_perf_query_info *)entry->data, id);
 213       } else
 214          DBG("metric set not known by mesa (skipping)\n");
 215    }
 216
 217    closedir(metricsdir);
 218 }
 219
 220 static bool
 221 kernel_has_dynamic_config_support(struct gen_perf_config *perf, int fd)
 222 {
 223    uint64_t invalid_config_id = UINT64_MAX;
 224
 225    return gen_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG,
 226                     &invalid_config_id) < 0 && errno == ENOENT;
 227 }
 228
 229 bool
 230 gen_perf_load_metric_id(struct gen_perf_config *perf, const char *guid,
 231                         uint64_t *metric_id)
 232 {
 233    char config_path[280];
 234
 235    snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id",
 236             perf->sysfs_dev_dir, guid);
 237
 238    /* Don't recreate already loaded configs. */
 239    return read_file_uint64(config_path, metric_id);
 240 }
 241
 242 static void
 243 init_oa_configs(struct gen_perf_config *perf, int fd)
 244 {
 245    hash_table_foreach(perf->oa_metrics_table, entry) {
 246       const struct gen_perf_query_info *query = entry->data;
 247       struct drm_i915_perf_oa_config config;
 248       uint64_t config_id;
 249       int ret;
 250
 251       if (gen_perf_load_metric_id(perf, query->guid, &config_id)) {
 252          DBG("metric set: %s (already loaded)\n", query->guid);
 253          register_oa_config(perf, query, config_id);
 254          continue;
 255       }
 256
 257       memset(&config, 0, sizeof(config));
 258
 259       memcpy(config.uuid, query->guid, sizeof(config.uuid));
 260
 261       config.n_mux_regs = query->n_mux_regs;
 262       config.mux_regs_ptr = (uintptr_t) query->mux_regs;
 263
 264       config.n_boolean_regs = query->n_b_counter_regs;
 265       config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs;
 266
 267       config.n_flex_regs = query->n_flex_regs;
 268       config.flex_regs_ptr = (uintptr_t) query->flex_regs;
 269
 270       ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
 271       if (ret < 0) {
 272          DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
 273              query->name, query->guid, strerror(errno));
 274          continue;
 275       }
 276
 277       register_oa_config(perf, query, ret);
 278       DBG("metric set: %s (added)\n", query->guid);
 279    }
 280 }
 281
 282 static void
 283 compute_topology_builtins(struct gen_perf_config *perf,
 284                           const struct gen_device_info *devinfo)
 285 {
 286    perf->sys_vars.slice_mask = devinfo->slice_masks;
 287    perf->sys_vars.n_eu_slices = devinfo->num_slices;
 288
 289    for (int i = 0; i < sizeof(devinfo->subslice_masks[i]); i++) {
 290       perf->sys_vars.n_eu_sub_slices +=
 291          __builtin_popcount(devinfo->subslice_masks[i]);
 292    }
 293
 294    for (int i = 0; i < sizeof(devinfo->eu_masks); i++)
 295       perf->sys_vars.n_eus += __builtin_popcount(devinfo->eu_masks[i]);
 296
 297    perf->sys_vars.eu_threads_count = devinfo->num_thread_per_eu;
 298
 299    /* The subslice mask builtin contains bits for all slices. Prior to Gen11
 300     * it had groups of 3bits for each slice, on Gen11 it's 8bits for each
 301     * slice.
 302     *
 303     * Ideally equations would be updated to have a slice/subslice query
 304     * function/operator.
 305     */
 306    perf->sys_vars.subslice_mask = 0;
 307
 308    int bits_per_subslice = devinfo->gen == 11 ? 8 : 3;
 309
 310    for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) {
 311       for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) {
 312          if (gen_device_info_subslice_available(devinfo, s, ss))
 313             perf->sys_vars.subslice_mask |= 1ULL << (s * bits_per_subslice + ss);
 314       }
 315    }
 316 }
 317
 318 static bool
 319 init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *devinfo)
 320 {
 321    uint64_t min_freq_mhz = 0, max_freq_mhz = 0;
 322
 323    if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz))
 324       return false;
 325
 326    if (!read_sysfs_drm_device_file_uint64(perf,  "gt_max_freq_mhz", &max_freq_mhz))
 327       return false;
 328
 329    memset(&perf->sys_vars, 0, sizeof(perf->sys_vars));
 330    perf->sys_vars.gt_min_freq = min_freq_mhz * 1000000;
 331    perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000;
 332    perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
 333    perf->sys_vars.revision = devinfo->revision;
 334    compute_topology_builtins(perf, devinfo);
 335
 336    return true;
 337 }
 338
 339 typedef void (*perf_register_oa_queries_t)(struct gen_perf_config *);
 340
 341 static perf_register_oa_queries_t
 342 get_register_queries_function(const struct gen_device_info *devinfo)
 343 {
 344    if (devinfo->is_haswell)
 345       return gen_oa_register_queries_hsw;
 346    if (devinfo->is_cherryview)
 347       return gen_oa_register_queries_chv;
 348    if (devinfo->is_broadwell)
 349       return gen_oa_register_queries_bdw;
 350    if (devinfo->is_broxton)
 351       return gen_oa_register_queries_bxt;
 352    if (devinfo->is_skylake) {
 353       if (devinfo->gt == 2)
 354          return gen_oa_register_queries_sklgt2;
 355       if (devinfo->gt == 3)
 356          return gen_oa_register_queries_sklgt3;
 357       if (devinfo->gt == 4)
 358          return gen_oa_register_queries_sklgt4;
 359    }
 360    if (devinfo->is_kabylake) {
 361       if (devinfo->gt == 2)
 362          return gen_oa_register_queries_kblgt2;
 363       if (devinfo->gt == 3)
 364          return gen_oa_register_queries_kblgt3;
 365    }
 366    if (devinfo->is_geminilake)
 367       return gen_oa_register_queries_glk;
 368    if (devinfo->is_coffeelake) {
 369       if (devinfo->gt == 2)
 370          return gen_oa_register_queries_cflgt2;
 371       if (devinfo->gt == 3)
 372          return gen_oa_register_queries_cflgt3;
 373    }
 374    if (devinfo->is_cannonlake)
 375       return gen_oa_register_queries_cnl;
 376    if (devinfo->gen == 11)
 377       return gen_oa_register_queries_icl;
 378
 379    return NULL;
 380 }
 381
 382 bool
 383 gen_perf_load_oa_metrics(struct gen_perf_config *perf, int fd,
 384                          const struct gen_device_info *devinfo)
 385 {
 386    perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo);
 387    bool i915_perf_oa_available = false;
 388    struct stat sb;
 389
 390    /* The existence of this sysctl parameter implies the kernel supports
 391     * the i915 perf interface.
 392     */
 393    if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {
 394
 395       /* If _paranoid == 1 then on Gen8+ we won't be able to access OA
 396        * metrics unless running as root.
 397        */
 398       if (devinfo->is_haswell)
 399          i915_perf_oa_available = true;
 400       else {
 401          uint64_t paranoid = 1;
 402
 403          read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", &paranoid);
 404
 405          if (paranoid == 0 || geteuid() == 0)
 406             i915_perf_oa_available = true;
 407       }
 408    }
 409
 410    if (!i915_perf_oa_available ||
 411        !oa_register ||
 412        !get_sysfs_dev_dir(perf, fd) ||
 413        !init_oa_sys_vars(perf, devinfo))
 414       return false;
 415
 416    perf->oa_metrics_table =
 417       _mesa_hash_table_create(perf, _mesa_key_hash_string,
 418                               _mesa_key_string_equal);
 419
 420    /* Index all the metric sets mesa knows about before looking to see what
 421     * the kernel is advertising.
 422     */
 423    oa_register(perf);
 424
 425    if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) &&
 426        kernel_has_dynamic_config_support(perf, fd))
 427       init_oa_configs(perf, fd);
 428    else
 429       enumerate_sysfs_metrics(perf);
 430
 431    return true;
 432 }
 433
 434 /* Accumulate 32bits OA counters */
 435 static inline void
 436 accumulate_uint32(const uint32_t *report0,
 437                   const uint32_t *report1,
 438                   uint64_t *accumulator)
 439 {
 440    *accumulator += (uint32_t)(*report1 - *report0);
 441 }
 442
 443 /* Accumulate 40bits OA counters */
 444 static inline void
 445 accumulate_uint40(int a_index,
 446                   const uint32_t *report0,
 447                   const uint32_t *report1,
 448                   uint64_t *accumulator)
 449 {
 450    const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
 451    const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
 452    uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
 453    uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
 454    uint64_t value0 = report0[a_index + 4] | high0;
 455    uint64_t value1 = report1[a_index + 4] | high1;
 456    uint64_t delta;
 457
 458    if (value0 > value1)
 459       delta = (1ULL << 40) + value1 - value0;
 460    else
 461       delta = value1 - value0;
 462
 463    *accumulator += delta;
 464 }
 465
 466 static void
 467 gen8_read_report_clock_ratios(const uint32_t *report,
 468                               uint64_t *slice_freq_hz,
 469                               uint64_t *unslice_freq_hz)
 470 {
 471    /* The lower 16bits of the RPT_ID field of the OA reports contains a
 472     * snapshot of the bits coming from the RP_FREQ_NORMAL register and is
 473     * divided this way :
 474     *
 475     * RPT_ID[31:25]: RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency)
 476     * RPT_ID[10:9]:  RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency)
 477     * RPT_ID[8:0]:   RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency)
 478     *
 479     * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request
 480     *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
 481     *
 482     * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request
 483     *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
 484     */
 485
 486    uint32_t unslice_freq = report[0] & 0x1ff;
 487    uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
 488    uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
 489    uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);
 490
 491    *slice_freq_hz = slice_freq * 16666667ULL;
 492    *unslice_freq_hz = unslice_freq * 16666667ULL;
 493 }
 494
 495 void
 496 gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
 497                                        const struct gen_device_info *devinfo,
 498                                        const uint32_t *start,
 499                                        const uint32_t *end)
 500 {
 501    /* Slice/Unslice frequency is only available in the OA reports when the
 502     * "Disable OA reports due to clock ratio change" field in
 503     * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this
 504     * global register (see drivers/gpu/drm/i915/i915_perf.c)
 505     *
 506     * Documentation says this should be available on Gen9+ but experimentation
 507     * shows that Gen8 reports similar values, so we enable it there too.
 508     */
 509    if (devinfo->gen < 8)
 510       return;
 511
 512    gen8_read_report_clock_ratios(start,
 513                                  &result->slice_frequency[0],
 514                                  &result->unslice_frequency[0]);
 515    gen8_read_report_clock_ratios(end,
 516                                  &result->slice_frequency[1],
 517                                  &result->unslice_frequency[1]);
 518 }
 519
 520 void
 521 gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
 522                                  const struct gen_perf_query_info *query,
 523                                  const uint32_t *start,
 524                                  const uint32_t *end)
 525 {
 526    int i, idx = 0;
 527
 528    result->hw_id = start[2];
 529    result->reports_accumulated++;
 530
 531    switch (query->oa_format) {
 532    case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
 533       accumulate_uint32(start + 1, end + 1, result->accumulator + idx++); /* timestamp */
 534       accumulate_uint32(start + 3, end + 3, result->accumulator + idx++); /* clock */
 535
 536       /* 32x 40bit A counters... */
 537       for (i = 0; i < 32; i++)
 538          accumulate_uint40(i, start, end, result->accumulator + idx++);
 539
 540       /* 4x 32bit A counters... */
 541       for (i = 0; i < 4; i++)
 542          accumulate_uint32(start + 36 + i, end + 36 + i, result->accumulator + idx++);
 543
 544       /* 8x 32bit B counters + 8x 32bit C counters... */
 545       for (i = 0; i < 16; i++)
 546          accumulate_uint32(start + 48 + i, end + 48 + i, result->accumulator + idx++);
 547       break;
 548
 549    case I915_OA_FORMAT_A45_B8_C8:
 550       accumulate_uint32(start + 1, end + 1, result->accumulator); /* timestamp */
 551
 552       for (i = 0; i < 61; i++)
 553          accumulate_uint32(start + 3 + i, end + 3 + i, result->accumulator + 1 + i);
 554       break;
 555
 556    default:
 557       unreachable("Can't accumulate OA counters in unknown format");
 558    }
 559
 560 }
 561
 562 void
 563 gen_perf_query_result_clear(struct gen_perf_query_result *result)
 564 {
 565    memset(result, 0, sizeof(*result));
 566    result->hw_id = 0xffffffff; /* invalid */
 567 }
 568
 569 static void
 570 fill_mdapi_perf_query_counter(struct gen_perf_query_info *query,
 571                               const char *name,
 572                               uint32_t data_offset,
 573                               uint32_t data_size,
 574                               enum gen_perf_counter_data_type data_type)
 575 {
 576    struct gen_perf_query_counter *counter = &query->counters[query->n_counters];
 577
 578    assert(query->n_counters <= query->max_counters);
 579
 580    counter->name = name;
 581    counter->desc = "Raw counter value";
 582    counter->type = GEN_PERF_COUNTER_TYPE_RAW;
 583    counter->data_type = data_type;
 584    counter->offset = data_offset;
 585
 586    query->n_counters++;
 587
 588    assert(counter->offset + gen_perf_query_counter_get_size(counter) <= query->data_size);
 589 }
 590
/* Register the field `field_name` of the struct variable `struct_name` as a
 * raw counter of `query`. The counter is named after the field, its offset
 * is the byte distance between the field and the start of the struct, and
 * its data type is GEN_PERF_COUNTER_DATA_TYPE_<type_name>.
 *
 * Only addresses and sizes of `struct_name` are used. The arguments are
 * pasted without parentheses and `struct_name` is expanded several times,
 * so pass plain identifiers.
 */
#define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \
   fill_mdapi_perf_query_counter(query, #field_name,                    \
                                 (uint8_t *) &struct_name.field_name -  \
                                 (uint8_t *) &struct_name,              \
                                 sizeof(struct_name.field_name),        \
                                 GEN_PERF_COUNTER_DATA_TYPE_##type_name)
/* Same as MDAPI_QUERY_ADD_COUNTER for element `idx` of the array field
 * `field_name`. The counter name is the field name followed by the index,
 * built with ralloc_asprintf() on `ctx`.
 */
#define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \
   fill_mdapi_perf_query_counter(query,                                 \
                                 ralloc_asprintf(ctx, "%s%i", #field_name, idx), \
                                 (uint8_t *) &struct_name.field_name[idx] - \
                                 (uint8_t *) &struct_name,              \
                                 sizeof(struct_name.field_name[0]),     \
                                 GEN_PERF_COUNTER_DATA_TYPE_##type_name)
 604
 605 void
 606 gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo,
 607                                        struct gen_perf_config *perf)
 608 {
 609    struct gen_perf_query_info *query = NULL;
 610
 611    /* MDAPI requires different structures for pretty much every generation
 612     * (right now we have definitions for gen 7 to 11).
 613     */
 614    if (!(devinfo->gen >= 7 && devinfo->gen <= 11))
 615       return;
 616
 617    switch (devinfo->gen) {
 618    case 7: {
 619       query = gen_perf_query_append_query_info(perf, 1 + 45 + 16 + 7);
 620       query->oa_format = I915_OA_FORMAT_A45_B8_C8;
 621
 622       struct gen7_mdapi_metrics metric_data;
 623       query->data_size = sizeof(metric_data);
 624
 625       MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
 626       for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) {
 627          MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
 628                                        metric_data, ACounters, i, UINT64);
 629       }
 630       for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) {
 631          MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
 632                                        metric_data, NOACounters, i, UINT64);
 633       }
 634       MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
 635       MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
 636       MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
 637       MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
 638       MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
 639       MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
 640       MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
 641       break;
 642    }
 643    case 8: {
 644       query = gen_perf_query_append_query_info(perf, 2 + 36 + 16 + 16);
 645       query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;
 646
 647       struct gen8_mdapi_metrics metric_data;
 648       query->data_size = sizeof(metric_data);
 649
 650       MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
 651       MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
 652       for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
 653          MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
 654                                        metric_data, OaCntr, i, UINT64);
 655       }
 656       for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
 657          MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
 658                                        metric_data, NoaCntr, i, UINT64);
 659       }
 660       MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
 661       MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
 662       MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
 663       MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
 664       MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
 665       MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
 666       MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
 667       MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
 668       MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
 669       MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
 670       MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
 671       MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
 672       MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
 673       MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
 674       MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
 675       MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
 676       break;
 677    }
 678    case 9:
 679    case 10:
 680    case 11: {
 681       query = gen_perf_query_append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2);
 682       query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;
 683
 684       struct gen9_mdapi_metrics metric_data;
 685       query->data_size = sizeof(metric_data);
 686
 687       MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
 688       MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
 689       for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
 690          MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
 691                                        metric_data, OaCntr, i, UINT64);
 692       }
 693       for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
 694          MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
 695                                        metric_data, NoaCntr, i, UINT64);
 696       }
 697       MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
 698       MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
 699       MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
 700       MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
 701       MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
 702       MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
 703       MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
 704       MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
 705       MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
 706       MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
 707       MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
 708       MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
 709       MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
 710       MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
 711       MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
 712       MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
 713       for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) {
 714          MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
 715                                        metric_data, UserCntr, i, UINT64);
 716       }
 717       MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32);
 718       MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32);
 719       break;
 720    }
 721    default:
 722       unreachable("Unsupported gen");
 723       break;
 724    }
 725
 726    query->kind = GEN_PERF_QUERY_TYPE_RAW;
 727    query->name = "Intel_Raw_Hardware_Counters_Set_0_Query";
 728    query->guid = GEN_PERF_QUERY_GUID_MDAPI;
 729
 730    {
 731       /* Accumulation buffer offsets copied from an actual query... */
 732       const struct gen_perf_query_info *copy_query =
 733          &perf->queries[0];
 734
 735       query->gpu_time_offset = copy_query->gpu_time_offset;
 736       query->gpu_clock_offset = copy_query->gpu_clock_offset;
 737       query->a_offset = copy_query->a_offset;
 738       query->b_offset = copy_query->b_offset;
 739       query->c_offset = copy_query->c_offset;
 740    }
 741 }
 742
 743 void
 744 gen_perf_query_register_mdapi_statistic_query(const struct gen_device_info *devinfo,
 745                                               struct gen_perf_config *perf)
 746 {
 747    if (!(devinfo->gen >= 7 && devinfo->gen <= 11))
 748       return;
 749
 750    struct gen_perf_query_info *query =
 751       gen_perf_query_append_query_info(perf, MAX_STAT_COUNTERS);
 752
 753    query->kind = GEN_PERF_QUERY_TYPE_PIPELINE;
 754    query->name = "Intel_Raw_Pipeline_Statistics_Query";
 755
 756    /* The order has to match mdapi_pipeline_metrics. */
 757    gen_perf_query_info_add_basic_stat_reg(query, IA_VERTICES_COUNT,
 758                                           "N vertices submitted");
 759    gen_perf_query_info_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
 760                                           "N primitives submitted");
 761    gen_perf_query_info_add_basic_stat_reg(query, VS_INVOCATION_COUNT,
 762                                           "N vertex shader invocations");
 763    gen_perf_query_info_add_basic_stat_reg(query, GS_INVOCATION_COUNT,
 764                                           "N geometry shader invocations");
 765    gen_perf_query_info_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
 766                                           "N geometry shader primitives emitted");
 767    gen_perf_query_info_add_basic_stat_reg(query, CL_INVOCATION_COUNT,
 768                                           "N primitives entering clipping");
 769    gen_perf_query_info_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
 770                                           "N primitives leaving clipping");
 771    if (devinfo->is_haswell || devinfo->gen == 8) {
 772       gen_perf_query_info_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
 773                                        "N fragment shader invocations",
 774                                        "N fragment shader invocations");
 775    } else {
 776       gen_perf_query_info_add_basic_stat_reg(query, PS_INVOCATION_COUNT,
 777                                              "N fragment shader invocations");
 778    }
 779    gen_perf_query_info_add_basic_stat_reg(query, HS_INVOCATION_COUNT,
 780                                           "N TCS shader invocations");
 781    gen_perf_query_info_add_basic_stat_reg(query, DS_INVOCATION_COUNT,
 782                                           "N TES shader invocations");
 783    if (devinfo->gen >= 7) {
 784       gen_perf_query_info_add_basic_stat_reg(query, CS_INVOCATION_COUNT,
 785                                              "N compute shader invocations");
 786    }
 787
 788    if (devinfo->gen >= 10) {
 789       /* Reuse existing CS invocation register until we can expose this new
 790        * one.
 791        */
 792       gen_perf_query_info_add_basic_stat_reg(query, CS_INVOCATION_COUNT,
 793                                              "Reserved1");
 794    }
 795
 796    query->data_size = sizeof(uint64_t) * query->n_counters;
 797 }
 798
 799 uint64_t
 800 gen_perf_query_get_metric_id(struct gen_perf_config *perf,
 801                              const struct gen_perf_query_info *query)
 802 {
 803    /* These queries are know not to ever change, their config ID has been
 804     * loaded upon the first query creation. No need to look them up again.
 805     */
 806    if (query->kind == GEN_PERF_QUERY_TYPE_OA)
 807       return query->oa_metrics_set_id;
 808
 809    assert(query->kind == GEN_PERF_QUERY_TYPE_RAW);
 810
 811    /* Raw queries can be reprogrammed up by an external application/library.
 812     * When a raw query is used for the first time it's id is set to a value !=
 813     * 0. When it stops being used the id returns to 0. No need to reload the
 814     * ID when it's already loaded.
 815     */
 816    if (query->oa_metrics_set_id != 0) {
 817       DBG("Raw query '%s' guid=%s using cached ID: %"PRIu64"\n",
 818           query->name, query->guid, query->oa_metrics_set_id);
 819       return query->oa_metrics_set_id;
 820    }
 821
 822    struct gen_perf_query_info *raw_query = (struct gen_perf_query_info *)query;
 823    if (!gen_perf_load_metric_id(perf, query->guid,
 824                                 &raw_query->oa_metrics_set_id)) {
 825       DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid);
 826       raw_query->oa_metrics_set_id = 1ULL;
 827    } else {
 828       DBG("Raw query '%s'guid=%s loaded ID: %"PRIu64"\n",
 829           query->name, query->guid, query->oa_metrics_set_id);
 830    }
 831    return query->oa_metrics_set_id;
 832 }
 833
 834 struct oa_sample_buf *
 835 gen_perf_get_free_sample_buf(struct gen_perf_context *perf_ctx)
 836 {
 837    struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers);
 838    struct oa_sample_buf *buf;
 839
 840    if (node)
 841       buf = exec_node_data(struct oa_sample_buf, node, link);
 842    else {
 843       buf = ralloc_size(perf_ctx->perf, sizeof(*buf));
 844
 845       exec_node_init(&buf->link);
 846       buf->refcount = 0;
 847       buf->len = 0;
 848    }
 849
 850    return buf;
 851 }
 852
 853 void
 854 gen_perf_reap_old_sample_buffers(struct gen_perf_context *perf_ctx)
 855 {
 856    struct exec_node *tail_node =
 857       exec_list_get_tail(&perf_ctx->sample_buffers);
 858    struct oa_sample_buf *tail_buf =
 859       exec_node_data(struct oa_sample_buf, tail_node, link);
 860
 861    /* Remove all old, unreferenced sample buffers walking forward from
 862     * the head of the list, except always leave at least one node in
 863     * the list so we always have a node to reference when we Begin
 864     * a new query.
 865     */
 866    foreach_list_typed_safe(struct oa_sample_buf, buf, link,
 867                            &perf_ctx->sample_buffers)
 868    {
 869       if (buf->refcount == 0 && buf != tail_buf) {
 870          exec_node_remove(&buf->link);
 871          exec_list_push_head(&perf_ctx->free_sample_buffers, &buf->link);
 872       } else
 873          return;
 874    }
 875 }
 876
 877 void
 878 gen_perf_free_sample_bufs(struct gen_perf_context *perf_ctx)
 879 {
 880    foreach_list_typed_safe(struct oa_sample_buf, buf, link,
 881                            &perf_ctx->free_sample_buffers)
 882       ralloc_free(buf);
 883
 884    exec_list_make_empty(&perf_ctx->free_sample_buffers);
 885 }
 886
 887 /******************************************************************************/
 888
 889 /**
 890  * Emit MI_STORE_REGISTER_MEM commands to capture all of the
 891  * pipeline statistics for the performance query object.
 892  */
 893 void
 894 gen_perf_snapshot_statistics_registers(void *context,
 895                                        struct gen_perf_config *perf,
 896                                        struct gen_perf_query_object *obj,
 897                                        uint32_t offset_in_bytes)
 898 {
 899    const struct gen_perf_query_info *query = obj->queryinfo;
 900    const int n_counters = query->n_counters;
 901
 902    for (int i = 0; i < n_counters; i++) {
 903       const struct gen_perf_query_counter *counter = &query->counters[i];
 904
 905       assert(counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64);
 906
 907       perf->vtbl.store_register_mem64(context, obj->pipeline_stats.bo,
 908                                       counter->pipeline_stat.reg,
 909                                       offset_in_bytes + i * sizeof(uint64_t));
 910    }
 911 }
 912
 913 void
 914 gen_perf_close(struct gen_perf_context *perfquery,
 915                const struct gen_perf_query_info *query)
 916 {
 917    if (perfquery->oa_stream_fd != -1) {
 918       close(perfquery->oa_stream_fd);
 919       perfquery->oa_stream_fd = -1;
 920    }
 921    if (query->kind == GEN_PERF_QUERY_TYPE_RAW) {
 922       struct gen_perf_query_info *raw_query =
 923          (struct gen_perf_query_info *) query;
 924       raw_query->oa_metrics_set_id = 0;
 925    }
 926 }
 927
 928 bool
 929 gen_perf_open(struct gen_perf_context *perf_ctx,
 930               int metrics_set_id,
 931               int report_format,
 932               int period_exponent,
 933               int drm_fd,
 934               uint32_t ctx_id)
 935 {
 936    uint64_t properties[] = {
 937       /* Single context sampling */
 938       DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id,
 939
 940       /* Include OA reports in samples */
 941       DRM_I915_PERF_PROP_SAMPLE_OA, true,
 942
 943       /* OA unit configuration */
 944       DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
 945       DRM_I915_PERF_PROP_OA_FORMAT, report_format,
 946       DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent,
 947    };
 948    struct drm_i915_perf_open_param param = {
 949       .flags = I915_PERF_FLAG_FD_CLOEXEC |
 950                I915_PERF_FLAG_FD_NONBLOCK |
 951                I915_PERF_FLAG_DISABLED,
 952       .num_properties = ARRAY_SIZE(properties) / 2,
 953       .properties_ptr = (uintptr_t) properties,
 954    };
 955    int fd = gen_ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
 956    if (fd == -1) {
 957       DBG("Error opening gen perf OA stream: %m\n");
 958       return false;
 959    }
 960
 961    perf_ctx->oa_stream_fd = fd;
 962
 963    perf_ctx->current_oa_metrics_set_id = metrics_set_id;
 964    perf_ctx->current_oa_format = report_format;
 965
 966    return true;
 967 }
 968
 969 bool
 970 gen_perf_inc_n_users(struct gen_perf_context *perf_ctx)
 971 {
 972    if (perf_ctx->n_oa_users == 0 &&
 973        gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0)
 974    {
 975       return false;
 976    }
 977    ++perf_ctx->n_oa_users;
 978
 979    return true;
 980 }
 981
 982 void
 983 gen_perf_dec_n_users(struct gen_perf_context *perf_ctx)
 984 {
 985    /* Disabling the i915 perf stream will effectively disable the OA
 986     * counters.  Note it's important to be sure there are no outstanding
 987     * MI_RPC commands at this point since they could stall the CS
 988     * indefinitely once OACONTROL is disabled.
 989     */
 990    --perf_ctx->n_oa_users;
 991    if (perf_ctx->n_oa_users == 0 &&
 992        gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0)
 993    {
 994       DBG("WARNING: Error disabling gen perf stream: %m\n");
 995    }
 996 }
 997
 998 void
 999 gen_perf_init_context(struct gen_perf_context *perf_ctx,
1000                       struct gen_perf_config *perf_cfg,
1001                       void * ctx,  /* driver context (eg, brw_context) */
1002                       void * bufmgr,  /* eg brw_bufmgr */
1003                       const struct gen_device_info *devinfo,
1004                       uint32_t hw_ctx,
1005                       int drm_fd)
1006 {
1007    perf_ctx->perf = perf_cfg;
1008    perf_ctx->ctx = ctx;
1009    perf_ctx->bufmgr = bufmgr;
1010    perf_ctx->drm_fd = drm_fd;
1011    perf_ctx->hw_ctx = hw_ctx;
1012    perf_ctx->devinfo = devinfo;
1013
1014    perf_ctx->unaccumulated =
1015       ralloc_array(ctx, struct gen_perf_query_object *, 2);
1016    perf_ctx->unaccumulated_elements = 0;
1017    perf_ctx->unaccumulated_array_size = 2;
1018
1019    exec_list_make_empty(&perf_ctx->sample_buffers);
1020    exec_list_make_empty(&perf_ctx->free_sample_buffers);
1021
1022    /* It's convenient to guarantee that this linked list of sample
1023     * buffers is never empty so we add an empty head so when we
1024     * Begin an OA query we can always take a reference on a buffer
1025     * in this list.
1026     */
1027    struct oa_sample_buf *buf = gen_perf_get_free_sample_buf(perf_ctx);
1028    exec_list_push_head(&perf_ctx->sample_buffers, &buf->link);
1029
1030    perf_ctx->oa_stream_fd = -1;
1031    perf_ctx->next_query_start_report_id = 1000;
1032 }
1033
1034 /**
1035  * Add a query to the global list of "unaccumulated queries."
1036  *
1037  * Queries are tracked here until all the associated OA reports have
1038  * been accumulated via accumulate_oa_reports() after the end
1039  * MI_REPORT_PERF_COUNT has landed in query->oa.bo.
1040  */
1041 static void
1042 add_to_unaccumulated_query_list(struct gen_perf_context *perf_ctx,
1043                                 struct gen_perf_query_object *obj)
1044 {
1045    if (perf_ctx->unaccumulated_elements >=
1046        perf_ctx->unaccumulated_array_size)
1047    {
1048       perf_ctx->unaccumulated_array_size *= 1.5;
1049       perf_ctx->unaccumulated =
1050          reralloc(perf_ctx->ctx, perf_ctx->unaccumulated,
1051                   struct gen_perf_query_object *,
1052                   perf_ctx->unaccumulated_array_size);
1053    }
1054
1055    perf_ctx->unaccumulated[perf_ctx->unaccumulated_elements++] = obj;
1056 }
1057
1058 bool
1059 gen_perf_begin_query(struct gen_perf_context *perf_ctx,
1060                      struct gen_perf_query_object *query)
1061 {
1062    struct gen_perf_config *perf_cfg = perf_ctx->perf;
1063    const struct gen_perf_query_info *queryinfo = query->queryinfo;
1064
1065    /* XXX: We have to consider that the command parser unit that parses batch
1066     * buffer commands and is used to capture begin/end counter snapshots isn't
1067     * implicitly synchronized with what's currently running across other GPU
1068     * units (such as the EUs running shaders) that the performance counters are
1069     * associated with.
1070     *
1071     * The intention of performance queries is to measure the work associated
1072     * with commands between the begin/end delimiters and so for that to be the
1073     * case we need to explicitly synchronize the parsing of commands to capture
1074     * Begin/End counter snapshots with what's running across other parts of the
1075     * GPU.
1076     *
1077     * When the command parser reaches a Begin marker it effectively needs to
1078     * drain everything currently running on the GPU until the hardware is idle
1079     * before capturing the first snapshot of counters - otherwise the results
1080     * would also be measuring the effects of earlier commands.
1081     *
1082     * When the command parser reaches an End marker it needs to stall until
1083     * everything currently running on the GPU has finished before capturing the
1084     * end snapshot - otherwise the results won't be a complete representation
1085     * of the work.
1086     *
1087     * Theoretically there could be opportunities to minimize how much of the
1088     * GPU pipeline is drained, or that we stall for, when we know what specific
1089     * units the performance counters being queried relate to but we don't
1090     * currently attempt to be clever here.
1091     *
1092     * Note: with our current simple approach here then for back-to-back queries
1093     * we will redundantly emit duplicate commands to synchronize the command
1094     * streamer with the rest of the GPU pipeline, but we assume that in HW the
1095     * second synchronization is effectively a NOOP.
1096     *
1097     * N.B. The final results are based on deltas of counters between (inside)
1098     * Begin/End markers so even though the total wall clock time of the
1099     * workload is stretched by larger pipeline bubbles the bubbles themselves
1100     * are generally invisible to the query results. Whether that's a good or a
1101     * bad thing depends on the use case. For a lower real-time impact while
1102     * capturing metrics then periodic sampling may be a better choice than
1103     * INTEL_performance_query.
1104     *
1105     *
1106     * This is our Begin synchronization point to drain current work on the
1107     * GPU before we capture our first counter snapshot...
1108     */
1109    perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
1110
1111    switch (queryinfo->kind) {
1112    case GEN_PERF_QUERY_TYPE_OA:
1113    case GEN_PERF_QUERY_TYPE_RAW: {
1114
1115       /* Opening an i915 perf stream implies exclusive access to the OA unit
1116        * which will generate counter reports for a specific counter set with a
1117        * specific layout/format so we can't begin any OA based queries that
1118        * require a different counter set or format unless we get an opportunity
1119        * to close the stream and open a new one...
1120        */
1121       uint64_t metric_id = gen_perf_query_get_metric_id(perf_ctx->perf, queryinfo);
1122
1123       if (perf_ctx->oa_stream_fd != -1 &&
1124           perf_ctx->current_oa_metrics_set_id != metric_id) {
1125
1126          if (perf_ctx->n_oa_users != 0) {
1127             DBG("WARNING: Begin failed already using perf config=%i/%"PRIu64"\n",
1128                 perf_ctx->current_oa_metrics_set_id, metric_id);
1129             return false;
1130          } else
1131             gen_perf_close(perf_ctx, queryinfo);
1132       }
1133
1134       /* If the OA counters aren't already on, enable them. */
1135       if (perf_ctx->oa_stream_fd == -1) {
1136          const struct gen_device_info *devinfo = perf_ctx->devinfo;
1137
1138          /* The period_exponent gives a sampling period as follows:
1139           *   sample_period = timestamp_period * 2^(period_exponent + 1)
1140           *
1141           * The timestamps increments every 80ns (HSW), ~52ns (GEN9LP) or
1142           * ~83ns (GEN8/9).
1143           *
1144           * The counter overflow period is derived from the EuActive counter
1145           * which reads a counter that increments by the number of clock
1146           * cycles multiplied by the number of EUs. It can be calculated as:
1147           *
1148           * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2)
1149           *
1150           * (E.g. 40 EUs @ 1GHz = ~53ms)
1151           *
1152           * We select a sampling period inferior to that overflow period to
1153           * ensure we cannot see more than 1 counter overflow, otherwise we
1154           * could loose information.
1155           */
1156
1157          int a_counter_in_bits = 32;
1158          if (devinfo->gen >= 8)
1159             a_counter_in_bits = 40;
1160
1161          uint64_t overflow_period = pow(2, a_counter_in_bits) / (perf_cfg->sys_vars.n_eus *
1162              /* drop 1GHz freq to have units in nanoseconds */
1163              2);
1164
1165          DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
1166              overflow_period, overflow_period / 1000000ul, perf_cfg->sys_vars.n_eus);
1167
1168          int period_exponent = 0;
1169          uint64_t prev_sample_period, next_sample_period;
1170          for (int e = 0; e < 30; e++) {
1171             prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
1172             next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;
1173
1174             /* Take the previous sampling period, lower than the overflow
1175              * period.
1176              */
1177             if (prev_sample_period < overflow_period &&
1178                 next_sample_period > overflow_period)
1179                period_exponent = e + 1;
1180          }
1181
1182          if (period_exponent == 0) {
1183             DBG("WARNING: enable to find a sampling exponent\n");
1184             return false;
1185          }
1186
1187          DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
1188              prev_sample_period / 1000000ul);
1189
1190          if (!gen_perf_open(perf_ctx, metric_id, queryinfo->oa_format,
1191                             period_exponent, perf_ctx->drm_fd,
1192                             perf_ctx->hw_ctx))
1193             return false;
1194       } else {
1195          assert(perf_ctx->current_oa_metrics_set_id == metric_id &&
1196                 perf_ctx->current_oa_format == queryinfo->oa_format);
1197       }
1198
1199       if (!gen_perf_inc_n_users(perf_ctx)) {
1200          DBG("WARNING: Error enabling i915 perf stream: %m\n");
1201          return false;
1202       }
1203
1204       if (query->oa.bo) {
1205          perf_cfg->vtbl.bo_unreference(query->oa.bo);
1206          query->oa.bo = NULL;
1207       }
1208
1209       query->oa.bo = perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
1210                                              "perf. query OA MI_RPC bo",
1211                                              MI_RPC_BO_SIZE);
1212 #ifdef DEBUG
1213       /* Pre-filling the BO helps debug whether writes landed. */
1214       void *map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_WRITE);
1215       memset(map, 0x80, MI_RPC_BO_SIZE);
1216       perf_cfg->vtbl.bo_unmap(query->oa.bo);
1217 #endif
1218
1219       query->oa.begin_report_id = perf_ctx->next_query_start_report_id;
1220       perf_ctx->next_query_start_report_id += 2;
1221
1222       /* We flush the batchbuffer here to minimize the chances that MI_RPC
1223        * delimiting commands end up in different batchbuffers. If that's the
1224        * case, the measurement will include the time it takes for the kernel
1225        * scheduler to load a new request into the hardware. This is manifested in
1226        * tools like frameretrace by spikes in the "GPU Core Clocks" counter.
1227        */
1228       perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);
1229
1230       /* Take a starting OA counter snapshot. */
1231       perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, 0,
1232                                                query->oa.begin_report_id);
1233       perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo,
1234                                                      MI_FREQ_START_OFFSET_BYTES);
1235
1236       ++perf_ctx->n_active_oa_queries;
1237
1238       /* No already-buffered samples can possibly be associated with this query
1239        * so create a marker within the list of sample buffers enabling us to
1240        * easily ignore earlier samples when processing this query after
1241        * completion.
1242        */
1243       assert(!exec_list_is_empty(&perf_ctx->sample_buffers));
1244       query->oa.samples_head = exec_list_get_tail(&perf_ctx->sample_buffers);
1245
1246       struct oa_sample_buf *buf =
1247          exec_node_data(struct oa_sample_buf, query->oa.samples_head, link);
1248
1249       /* This reference will ensure that future/following sample
1250        * buffers (that may relate to this query) can't be freed until
1251        * this drops to zero.
1252        */
1253       buf->refcount++;
1254
1255       gen_perf_query_result_clear(&query->oa.result);
1256       query->oa.results_accumulated = false;
1257
1258       add_to_unaccumulated_query_list(perf_ctx, query);
1259       break;
1260    }
1261
1262    case GEN_PERF_QUERY_TYPE_PIPELINE:
1263       if (query->pipeline_stats.bo) {
1264          perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
1265          query->pipeline_stats.bo = NULL;
1266       }
1267
1268       query->pipeline_stats.bo =
1269          perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
1270                                  "perf. query pipeline stats bo",
1271                                  STATS_BO_SIZE);
1272
1273       /* Take starting snapshots. */
1274       gen_perf_snapshot_statistics_registers(perf_ctx->ctx , perf_cfg, query, 0);
1275
1276       ++perf_ctx->n_active_pipeline_stats_queries;
1277       break;
1278
1279    default:
1280       unreachable("Unknown query type");
1281       break;
1282    }
1283
1284    return true;
1285 }