/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <dirent.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>

#include <drm-uapi/i915_drm.h>

#include "common/gen_gem.h"
#include "gen_perf.h"
#include "perf/gen_perf_mdapi.h"
#include "perf/gen_perf_metrics.h"

#include "dev/gen_debug.h"
#include "dev/gen_device_info.h"
#include "util/bitscan.h"
#include "util/hash_table.h"
#include "util/ralloc.h"
#include "util/u_math.h"
#define FILE_DEBUG_FLAG DEBUG_PERFMON

#define MI_RPC_BO_SIZE              4096
#define MI_FREQ_START_OFFSET_BYTES  (3072)
#define MI_RPC_BO_END_OFFSET_BYTES  (MI_RPC_BO_SIZE / 2)
#define MI_FREQ_END_OFFSET_BYTES    (3076)

#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low))
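/* For instance, INTEL_MASK(13, 7) == 0x3f80: a 7-bit wide mask covering
 * register bits [13:7].
 */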
#define GEN7_RPSTAT1                       0xA01C
#define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT    7
#define GEN7_RPSTAT1_CURR_GT_FREQ_MASK     INTEL_MASK(13, 7)
#define GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT    0
#define GEN7_RPSTAT1_PREV_GT_FREQ_MASK     INTEL_MASK(6, 0)

#define GEN9_RPSTAT0                       0xA01C
#define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT    23
#define GEN9_RPSTAT0_CURR_GT_FREQ_MASK     INTEL_MASK(31, 23)
#define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT    0
#define GEN9_RPSTAT0_PREV_GT_FREQ_MASK     INTEL_MASK(8, 0)

#define MAP_READ  (1 << 0)
#define MAP_WRITE (1 << 1)
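/* MAP_READ/MAP_WRITE are the access flags handed to the driver's bo_map()
 * vtable hook when mapping the MI_RPC and pipeline statistics BOs below.
 */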
static bool
get_sysfs_dev_dir(struct gen_perf_config *perf, int fd)
{
   struct stat sb;
   int min, maj;
   DIR *drmdir = NULL;
   struct dirent *drm_entry;
   int len;

   perf->sysfs_dev_dir[0] = '\0';

   if (fstat(fd, &sb)) {
      DBG("Failed to stat DRM fd\n");
      return false;
   }

   maj = major(sb.st_rdev);
   min = minor(sb.st_rdev);

   if (!S_ISCHR(sb.st_mode)) {
      DBG("DRM fd is not a character device as expected\n");
      return false;
   }

   len = snprintf(perf->sysfs_dev_dir,
                  sizeof(perf->sysfs_dev_dir),
                  "/sys/dev/char/%d:%d/device/drm", maj, min);
   if (len < 0 || len >= sizeof(perf->sysfs_dev_dir)) {
      DBG("Failed to concatenate sysfs path to drm device\n");
      return false;
   }

   drmdir = opendir(perf->sysfs_dev_dir);
   if (!drmdir) {
      DBG("Failed to open %s: %m\n", perf->sysfs_dev_dir);
      return false;
   }

   while ((drm_entry = readdir(drmdir))) {
      if ((drm_entry->d_type == DT_DIR ||
           drm_entry->d_type == DT_LNK) &&
          strncmp(drm_entry->d_name, "card", 4) == 0)
      {
         len = snprintf(perf->sysfs_dev_dir,
                        sizeof(perf->sysfs_dev_dir),
                        "/sys/dev/char/%d:%d/device/drm/%s",
                        maj, min, drm_entry->d_name);
         closedir(drmdir);
         if (len < 0 || len >= sizeof(perf->sysfs_dev_dir))
            return false;
         else
            return true;
      }
   }

   closedir(drmdir);

   DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
       maj, min);

   return false;
}
static bool
read_file_uint64(const char *file, uint64_t *val)
{
   char buf[32];
   int fd, n;

   fd = open(file, 0);
   if (fd < 0)
      return false;
   while ((n = read(fd, buf, sizeof (buf) - 1)) < 0 &&
          errno == EINTR);
   close(fd);
   if (n < 0)
      return false;

   buf[n] = '\0';
   *val = strtoull(buf, NULL, 0);

   return true;
}
static bool
read_sysfs_drm_device_file_uint64(struct gen_perf_config *perf,
                                  const char *file,
                                  uint64_t *value)
{
   char buf[512];
   int len;

   len = snprintf(buf, sizeof(buf), "%s/%s", perf->sysfs_dev_dir, file);
   if (len < 0 || len >= sizeof(buf)) {
      DBG("Failed to concatenate sys filename to read u64 from\n");
      return false;
   }

   return read_file_uint64(buf, value);
}
static void
register_oa_config(struct gen_perf_config *perf,
                   const struct gen_perf_query_info *query,
                   uint64_t config_id)
{
   struct gen_perf_query_info *registered_query =
      gen_perf_query_append_query_info(perf, 0);

   *registered_query = *query;
   registered_query->oa_metrics_set_id = config_id;
   DBG("metric set registered: id = %" PRIu64", guid = %s\n",
       registered_query->oa_metrics_set_id, query->guid);
}
static void
enumerate_sysfs_metrics(struct gen_perf_config *perf)
{
   DIR *metricsdir = NULL;
   struct dirent *metric_entry;
   char buf[256];
   int len;

   len = snprintf(buf, sizeof(buf), "%s/metrics", perf->sysfs_dev_dir);
   if (len < 0 || len >= sizeof(buf)) {
      DBG("Failed to concatenate path to sysfs metrics/ directory\n");
      return;
   }

   metricsdir = opendir(buf);
   if (!metricsdir) {
      DBG("Failed to open %s: %m\n", buf);
      return;
   }

   while ((metric_entry = readdir(metricsdir))) {
      struct hash_entry *entry;

      if ((metric_entry->d_type != DT_DIR &&
           metric_entry->d_type != DT_LNK) ||
          metric_entry->d_name[0] == '.')
         continue;

      DBG("metric set: %s\n", metric_entry->d_name);
      entry = _mesa_hash_table_search(perf->oa_metrics_table,
                                      metric_entry->d_name);
      if (entry) {
         uint64_t id;

         len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id",
                        perf->sysfs_dev_dir, metric_entry->d_name);
         if (len < 0 || len >= sizeof(buf)) {
            DBG("Failed to concatenate path to sysfs metric id file\n");
            continue;
         }

         if (!read_file_uint64(buf, &id)) {
            DBG("Failed to read metric set id from %s: %m", buf);
            continue;
         }

         register_oa_config(perf, (const struct gen_perf_query_info *)entry->data, id);
      } else
         DBG("metric set not known by mesa (skipping)\n");
   }

   closedir(metricsdir);
}
static bool
kernel_has_dynamic_config_support(struct gen_perf_config *perf, int fd)
{
   uint64_t invalid_config_id = UINT64_MAX;

   return gen_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG,
                    &invalid_config_id) < 0 && errno == ENOENT;
}
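/* The probe above works by asking the kernel to remove a config ID that
 * cannot exist: if the ioctl is implemented it fails with ENOENT, whereas a
 * kernel without it rejects the unknown ioctl number outright (e.g. with
 * EINVAL).
 */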
bool
gen_perf_load_metric_id(struct gen_perf_config *perf, const char *guid,
                        uint64_t *metric_id)
{
   char config_path[280];

   snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id",
            perf->sysfs_dev_dir, guid);

   /* Don't recreate already loaded configs. */
   return read_file_uint64(config_path, metric_id);
}
static void
init_oa_configs(struct gen_perf_config *perf, int fd)
{
   hash_table_foreach(perf->oa_metrics_table, entry) {
      const struct gen_perf_query_info *query = entry->data;
      struct drm_i915_perf_oa_config config;
      uint64_t config_id;
      int ret;

      if (gen_perf_load_metric_id(perf, query->guid, &config_id)) {
         DBG("metric set: %s (already loaded)\n", query->guid);
         register_oa_config(perf, query, config_id);
         continue;
      }

      memset(&config, 0, sizeof(config));

      memcpy(config.uuid, query->guid, sizeof(config.uuid));

      config.n_mux_regs = query->n_mux_regs;
      config.mux_regs_ptr = (uintptr_t) query->mux_regs;

      config.n_boolean_regs = query->n_b_counter_regs;
      config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs;

      config.n_flex_regs = query->n_flex_regs;
      config.flex_regs_ptr = (uintptr_t) query->flex_regs;

      ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
      if (ret < 0) {
         DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
             query->name, query->guid, strerror(errno));
         continue;
      }

      register_oa_config(perf, query, ret);
      DBG("metric set: %s (added)\n", query->guid);
   }
}
static void
compute_topology_builtins(struct gen_perf_config *perf,
                          const struct gen_device_info *devinfo)
{
   perf->sys_vars.slice_mask = devinfo->slice_masks;
   perf->sys_vars.n_eu_slices = devinfo->num_slices;

   /* Iterate over the whole subslice mask array (the bound here was
    * previously sizeof of a single element, which only counted one byte).
    */
   for (int i = 0; i < sizeof(devinfo->subslice_masks); i++) {
      perf->sys_vars.n_eu_sub_slices +=
         __builtin_popcount(devinfo->subslice_masks[i]);
   }

   for (int i = 0; i < sizeof(devinfo->eu_masks); i++)
      perf->sys_vars.n_eus += __builtin_popcount(devinfo->eu_masks[i]);

   perf->sys_vars.eu_threads_count = devinfo->num_thread_per_eu;

   /* The subslice mask builtin contains bits for all slices. Prior to Gen11
    * it had groups of 3bits for each slice, on Gen11 it's 8bits for each
    * slice.
    *
    * Ideally equations would be updated to have a slice/subslice query
    * function/operator.
    */
   perf->sys_vars.subslice_mask = 0;

   int bits_per_subslice = devinfo->gen == 11 ? 8 : 3;

   for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) {
      for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) {
         if (gen_device_info_subslice_available(devinfo, s, ss))
            perf->sys_vars.subslice_mask |= 1ULL << (s * bits_per_subslice + ss);
      }
   }
}
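/* Worked example of the packing above: on a hypothetical 2-slice pre-Gen11
 * part with subslices 0 and 1 enabled in each slice, bits_per_subslice is 3,
 * so slice 0 contributes bits 0-1 and slice 1 contributes bits 3-4, giving
 * subslice_mask == 0b011011.
 */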
static bool
init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *devinfo)
{
   uint64_t min_freq_mhz = 0, max_freq_mhz = 0;

   if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz))
      return false;

   if (!read_sysfs_drm_device_file_uint64(perf, "gt_max_freq_mhz", &max_freq_mhz))
      return false;

   memset(&perf->sys_vars, 0, sizeof(perf->sys_vars));
   perf->sys_vars.gt_min_freq = min_freq_mhz * 1000000;
   perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000;
   perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
   perf->sys_vars.revision = devinfo->revision;
   compute_topology_builtins(perf, devinfo);

   return true;
}
typedef void (*perf_register_oa_queries_t)(struct gen_perf_config *);

static perf_register_oa_queries_t
get_register_queries_function(const struct gen_device_info *devinfo)
{
   if (devinfo->is_haswell)
      return gen_oa_register_queries_hsw;
   if (devinfo->is_cherryview)
      return gen_oa_register_queries_chv;
   if (devinfo->is_broadwell)
      return gen_oa_register_queries_bdw;
   if (devinfo->is_broxton)
      return gen_oa_register_queries_bxt;
   if (devinfo->is_skylake) {
      if (devinfo->gt == 2)
         return gen_oa_register_queries_sklgt2;
      if (devinfo->gt == 3)
         return gen_oa_register_queries_sklgt3;
      if (devinfo->gt == 4)
         return gen_oa_register_queries_sklgt4;
   }
   if (devinfo->is_kabylake) {
      if (devinfo->gt == 2)
         return gen_oa_register_queries_kblgt2;
      if (devinfo->gt == 3)
         return gen_oa_register_queries_kblgt3;
   }
   if (devinfo->is_geminilake)
      return gen_oa_register_queries_glk;
   if (devinfo->is_coffeelake) {
      if (devinfo->gt == 2)
         return gen_oa_register_queries_cflgt2;
      if (devinfo->gt == 3)
         return gen_oa_register_queries_cflgt3;
   }
   if (devinfo->is_cannonlake)
      return gen_oa_register_queries_cnl;
   if (devinfo->gen == 11)
      return gen_oa_register_queries_icl;

   return NULL;
}
bool
gen_perf_load_oa_metrics(struct gen_perf_config *perf, int fd,
                         const struct gen_device_info *devinfo)
{
   perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo);
   bool i915_perf_oa_available = false;
   struct stat sb;

   /* The existence of this sysctl parameter implies the kernel supports
    * the i915 perf interface.
    */
   if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {

      /* If _paranoid == 1 then on Gen8+ we won't be able to access OA
       * metrics unless running as root.
       */
      if (devinfo->is_haswell)
         i915_perf_oa_available = true;
      else {
         uint64_t paranoid = 1;

         read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", &paranoid);

         if (paranoid == 0 || geteuid() == 0)
            i915_perf_oa_available = true;
      }
   }

   if (!i915_perf_oa_available ||
       !oa_register ||
       !get_sysfs_dev_dir(perf, fd) ||
       !init_oa_sys_vars(perf, devinfo))
      return false;

   perf->oa_metrics_table =
      _mesa_hash_table_create(perf, _mesa_key_hash_string,
                              _mesa_key_string_equal);

   /* Index all the metric sets mesa knows about before looking to see what
    * the kernel is advertising.
    */
   oa_register(perf);

   if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) &&
       kernel_has_dynamic_config_support(perf, fd))
      init_oa_configs(perf, fd);
   else
      enumerate_sysfs_metrics(perf);

   return true;
}
/* Accumulate 32bits OA counters */
static inline void
accumulate_uint32(const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   *accumulator += (uint32_t)(*report1 - *report0);
}

/* Accumulate 40bits OA counters */
static inline void
accumulate_uint40(int a_index,
                  const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
   const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
   uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
   uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
   uint64_t value0 = report0[a_index + 4] | high0;
   uint64_t value1 = report1[a_index + 4] | high1;
   uint64_t delta;

   if (value0 > value1)
      delta = (1ULL << 40) + value1 - value0;
   else
      delta = value1 - value0;

   *accumulator += delta;
}
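/* Example of the wrap handling above: if value0 == 0xfffffffff0 and
 * value1 == 0x10, the 40-bit counter wrapped between the two reports, so
 * delta == (1ULL << 40) + 0x10 - 0xfffffffff0 == 0x20.
 */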
static void
gen8_read_report_clock_ratios(const uint32_t *report,
                              uint64_t *slice_freq_hz,
                              uint64_t *unslice_freq_hz)
{
   /* The lower 16bits of the RPT_ID field of the OA reports contains a
    * snapshot of the bits coming from the RP_FREQ_NORMAL register and is
    * divided this way :
    *
    * RPT_ID[31:25]: RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency)
    * RPT_ID[10:9]:  RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency)
    * RPT_ID[8:0]:   RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency)
    *
    * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    *
    * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    */

   uint32_t unslice_freq = report[0] & 0x1ff;
   uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
   uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
   uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);

   *slice_freq_hz = slice_freq * 16666667ULL;
   *unslice_freq_hz = unslice_freq * 16666667ULL;
}
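/* E.g. a squashed unslice ratio of 30 decodes to 30 * 16666667 ~= 500MHz. */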
void
gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
                                       const struct gen_device_info *devinfo,
                                       const uint32_t *start,
                                       const uint32_t *end)
{
   /* Slice/Unslice frequency is only available in the OA reports when the
    * "Disable OA reports due to clock ratio change" field in
    * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this
    * global register (see drivers/gpu/drm/i915/i915_perf.c)
    *
    * Documentation says this should be available on Gen9+ but experimentation
    * shows that Gen8 reports similar values, so we enable it there too.
    */
   if (devinfo->gen < 8)
      return;

   gen8_read_report_clock_ratios(start,
                                 &result->slice_frequency[0],
                                 &result->unslice_frequency[0]);
   gen8_read_report_clock_ratios(end,
                                 &result->slice_frequency[1],
                                 &result->unslice_frequency[1]);
}
void
gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
                                 const struct gen_perf_query_info *query,
                                 const uint32_t *start,
                                 const uint32_t *end)
{
   int i, idx = 0;

   result->hw_id = start[2];
   result->reports_accumulated++;

   switch (query->oa_format) {
   case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
      accumulate_uint32(start + 1, end + 1, result->accumulator + idx++); /* timestamp */
      accumulate_uint32(start + 3, end + 3, result->accumulator + idx++); /* clock */

      /* 32x 40bit A counters... */
      for (i = 0; i < 32; i++)
         accumulate_uint40(i, start, end, result->accumulator + idx++);

      /* 4x 32bit A counters... */
      for (i = 0; i < 4; i++)
         accumulate_uint32(start + 36 + i, end + 36 + i, result->accumulator + idx++);

      /* 8x 32bit B counters + 8x 32bit C counters... */
      for (i = 0; i < 16; i++)
         accumulate_uint32(start + 48 + i, end + 48 + i, result->accumulator + idx++);
      break;

   case I915_OA_FORMAT_A45_B8_C8:
      accumulate_uint32(start + 1, end + 1, result->accumulator); /* timestamp */

      for (i = 0; i < 61; i++)
         accumulate_uint32(start + 3 + i, end + 3 + i, result->accumulator + 1 + i);
      break;

   default:
      unreachable("Can't accumulate OA counters in unknown format");
   }
}
void
gen_perf_query_result_clear(struct gen_perf_query_result *result)
{
   memset(result, 0, sizeof(*result));
   result->hw_id = 0xffffffff; /* invalid */
}
static void
fill_mdapi_perf_query_counter(struct gen_perf_query_info *query,
                              const char *name,
                              uint32_t data_offset,
                              uint32_t data_size,
                              enum gen_perf_counter_data_type data_type)
{
   struct gen_perf_query_counter *counter = &query->counters[query->n_counters];

   assert(query->n_counters <= query->max_counters);

   counter->name = name;
   counter->desc = "Raw counter value";
   counter->type = GEN_PERF_COUNTER_TYPE_RAW;
   counter->data_type = data_type;
   counter->offset = data_offset;

   query->n_counters++;

   assert(counter->offset + gen_perf_query_counter_get_size(counter) <= query->data_size);
}
#define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \
   fill_mdapi_perf_query_counter(query, #field_name,                   \
                                 (uint8_t *) &struct_name.field_name - \
                                 (uint8_t *) &struct_name,             \
                                 sizeof(struct_name.field_name),       \
                                 GEN_PERF_COUNTER_DATA_TYPE_##type_name)
#define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \
   fill_mdapi_perf_query_counter(query,                                      \
                                 ralloc_asprintf(ctx, "%s%i", #field_name, idx), \
                                 (uint8_t *) &struct_name.field_name[idx] -  \
                                 (uint8_t *) &struct_name,                   \
                                 sizeof(struct_name.field_name[0]),          \
                                 GEN_PERF_COUNTER_DATA_TYPE_##type_name)
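/* For example, MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64)
 * registers a raw UINT64 counter named "TotalTime" whose offset and size are
 * derived from the TotalTime field of the MDAPI metrics struct.
 */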
void
gen_perf_query_register_mdapi_oa_query(const struct gen_device_info *devinfo,
                                       struct gen_perf_config *perf)
{
   struct gen_perf_query_info *query = NULL;

   /* MDAPI requires different structures for pretty much every generation
    * (right now we have definitions for gen 7 to 11).
    */
   if (!(devinfo->gen >= 7 && devinfo->gen <= 11))
      return;

   switch (devinfo->gen) {
   case 7: {
      query = gen_perf_query_append_query_info(perf, 1 + 45 + 16 + 7);
      query->oa_format = I915_OA_FORMAT_A45_B8_C8;

      struct gen7_mdapi_metrics metric_data;
      query->data_size = sizeof(metric_data);

      MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
      for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, ACounters, i, UINT64);
      }
      for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, NOACounters, i, UINT64);
      }
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
      break;
   }
   case 8: {
      query = gen_perf_query_append_query_info(perf, 2 + 36 + 16 + 16);
      query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;

      struct gen8_mdapi_metrics metric_data;
      query->data_size = sizeof(metric_data);

      MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
      for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, OaCntr, i, UINT64);
      }
      for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, NoaCntr, i, UINT64);
      }
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
      break;
   }
   case 9:
   case 10:
   case 11: {
      query = gen_perf_query_append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2);
      query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;

      struct gen9_mdapi_metrics metric_data;
      query->data_size = sizeof(metric_data);

      MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
      for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, OaCntr, i, UINT64);
      }
      for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, NoaCntr, i, UINT64);
      }
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
      for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) {
         MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query,
                                       metric_data, UserCntr, i, UINT64);
      }
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32);
      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32);
      break;
   }
   default:
      unreachable("Unsupported gen");
   }

   query->kind = GEN_PERF_QUERY_TYPE_RAW;
   query->name = "Intel_Raw_Hardware_Counters_Set_0_Query";
   query->guid = GEN_PERF_QUERY_GUID_MDAPI;

   {
      /* Accumulation buffer offsets copied from an actual query... */
      const struct gen_perf_query_info *copy_query =
         &perf->queries[0];

      query->gpu_time_offset = copy_query->gpu_time_offset;
      query->gpu_clock_offset = copy_query->gpu_clock_offset;
      query->a_offset = copy_query->a_offset;
      query->b_offset = copy_query->b_offset;
      query->c_offset = copy_query->c_offset;
   }
}
void
gen_perf_query_register_mdapi_statistic_query(const struct gen_device_info *devinfo,
                                              struct gen_perf_config *perf)
{
   if (!(devinfo->gen >= 7 && devinfo->gen <= 11))
      return;

   struct gen_perf_query_info *query =
      gen_perf_query_append_query_info(perf, MAX_STAT_COUNTERS);

   query->kind = GEN_PERF_QUERY_TYPE_PIPELINE;
   query->name = "Intel_Raw_Pipeline_Statistics_Query";

   /* The order has to match mdapi_pipeline_metrics. */
   gen_perf_query_info_add_basic_stat_reg(query, IA_VERTICES_COUNT,
                                          "N vertices submitted");
   gen_perf_query_info_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
                                          "N primitives submitted");
   gen_perf_query_info_add_basic_stat_reg(query, VS_INVOCATION_COUNT,
                                          "N vertex shader invocations");
   gen_perf_query_info_add_basic_stat_reg(query, GS_INVOCATION_COUNT,
                                          "N geometry shader invocations");
   gen_perf_query_info_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
                                          "N geometry shader primitives emitted");
   gen_perf_query_info_add_basic_stat_reg(query, CL_INVOCATION_COUNT,
                                          "N primitives entering clipping");
   gen_perf_query_info_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
                                          "N primitives leaving clipping");
   if (devinfo->is_haswell || devinfo->gen == 8) {
      gen_perf_query_info_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
                                       "N fragment shader invocations",
                                       "N fragment shader invocations");
   } else {
      gen_perf_query_info_add_basic_stat_reg(query, PS_INVOCATION_COUNT,
                                             "N fragment shader invocations");
   }
   gen_perf_query_info_add_basic_stat_reg(query, HS_INVOCATION_COUNT,
                                          "N TCS shader invocations");
   gen_perf_query_info_add_basic_stat_reg(query, DS_INVOCATION_COUNT,
                                          "N TES shader invocations");
   if (devinfo->gen >= 7) {
      gen_perf_query_info_add_basic_stat_reg(query, CS_INVOCATION_COUNT,
                                             "N compute shader invocations");
   }

   if (devinfo->gen >= 10) {
      /* Reuse existing CS invocation register until we can expose this new
       * one.
       */
      gen_perf_query_info_add_basic_stat_reg(query, CS_INVOCATION_COUNT,
                                             "Reserved1");
   }

   query->data_size = sizeof(uint64_t) * query->n_counters;
}
uint64_t
gen_perf_query_get_metric_id(struct gen_perf_config *perf,
                             const struct gen_perf_query_info *query)
{
   /* These queries are known not to ever change, their config ID has been
    * loaded upon the first query creation. No need to look them up again.
    */
   if (query->kind == GEN_PERF_QUERY_TYPE_OA)
      return query->oa_metrics_set_id;

   assert(query->kind == GEN_PERF_QUERY_TYPE_RAW);

   /* Raw queries can be reprogrammed by an external application/library.
    * When a raw query is used for the first time its ID is set to a value !=
    * 0. When it stops being used the ID returns to 0. No need to reload the
    * ID when it's already loaded.
    */
   if (query->oa_metrics_set_id != 0) {
      DBG("Raw query '%s' guid=%s using cached ID: %"PRIu64"\n",
          query->name, query->guid, query->oa_metrics_set_id);
      return query->oa_metrics_set_id;
   }

   struct gen_perf_query_info *raw_query = (struct gen_perf_query_info *)query;
   if (!gen_perf_load_metric_id(perf, query->guid,
                                &raw_query->oa_metrics_set_id)) {
      DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid);
      raw_query->oa_metrics_set_id = 1ULL;
   } else {
      DBG("Raw query '%s' guid=%s loaded ID: %"PRIu64"\n",
          query->name, query->guid, query->oa_metrics_set_id);
   }
   return query->oa_metrics_set_id;
}
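/* Note: metric set ID 1 is the kernel's built-in "test" OA configuration,
 * which is why the fallback above programs 1ULL when the guid lookup fails.
 */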
struct oa_sample_buf *
gen_perf_get_free_sample_buf(struct gen_perf_context *perf_ctx)
{
   struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers);
   struct oa_sample_buf *buf;

   if (node)
      buf = exec_node_data(struct oa_sample_buf, node, link);
   else {
      buf = ralloc_size(perf_ctx->perf, sizeof(*buf));

      exec_node_init(&buf->link);
      buf->refcount = 0;
      buf->len = 0;
   }

   return buf;
}
void
gen_perf_reap_old_sample_buffers(struct gen_perf_context *perf_ctx)
{
   struct exec_node *tail_node =
      exec_list_get_tail(&perf_ctx->sample_buffers);
   struct oa_sample_buf *tail_buf =
      exec_node_data(struct oa_sample_buf, tail_node, link);

   /* Remove all old, unreferenced sample buffers walking forward from
    * the head of the list, except always leave at least one node in
    * the list so we always have a node to reference when we Begin
    * a new query.
    */
   foreach_list_typed_safe(struct oa_sample_buf, buf, link,
                           &perf_ctx->sample_buffers)
   {
      if (buf->refcount == 0 && buf != tail_buf) {
         exec_node_remove(&buf->link);
         exec_list_push_head(&perf_ctx->free_sample_buffers, &buf->link);
      } else
         return;
   }
}

void
gen_perf_free_sample_bufs(struct gen_perf_context *perf_ctx)
{
   foreach_list_typed_safe(struct oa_sample_buf, buf, link,
                           &perf_ctx->free_sample_buffers)
      ralloc_free(buf);

   exec_list_make_empty(&perf_ctx->free_sample_buffers);
}
/******************************************************************************/

/**
 * Emit MI_STORE_REGISTER_MEM commands to capture all of the
 * pipeline statistics for the performance query object.
 */
void
gen_perf_snapshot_statistics_registers(void *context,
                                       struct gen_perf_config *perf,
                                       struct gen_perf_query_object *obj,
                                       uint32_t offset_in_bytes)
{
   const struct gen_perf_query_info *query = obj->queryinfo;
   const int n_counters = query->n_counters;

   for (int i = 0; i < n_counters; i++) {
      const struct gen_perf_query_counter *counter = &query->counters[i];

      assert(counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64);

      perf->vtbl.store_register_mem64(context, obj->pipeline_stats.bo,
                                      counter->pipeline_stat.reg,
                                      offset_in_bytes + i * sizeof(uint64_t));
   }
}
void
gen_perf_close(struct gen_perf_context *perfquery,
               const struct gen_perf_query_info *query)
{
   if (perfquery->oa_stream_fd != -1) {
      close(perfquery->oa_stream_fd);
      perfquery->oa_stream_fd = -1;
   }
   if (query->kind == GEN_PERF_QUERY_TYPE_RAW) {
      struct gen_perf_query_info *raw_query =
         (struct gen_perf_query_info *) query;
      raw_query->oa_metrics_set_id = 0;
   }
}
bool
gen_perf_open(struct gen_perf_context *perf_ctx,
              uint64_t metrics_set_id,
              int report_format,
              int period_exponent,
              int drm_fd,
              uint32_t ctx_id)
{
   uint64_t properties[] = {
      /* Single context sampling */
      DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id,

      /* Include OA reports in samples */
      DRM_I915_PERF_PROP_SAMPLE_OA, true,

      /* OA unit configuration */
      DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
      DRM_I915_PERF_PROP_OA_FORMAT, report_format,
      DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent,
   };
   struct drm_i915_perf_open_param param = {
      .flags = I915_PERF_FLAG_FD_CLOEXEC |
               I915_PERF_FLAG_FD_NONBLOCK |
               I915_PERF_FLAG_DISABLED,
      .num_properties = ARRAY_SIZE(properties) / 2,
      .properties_ptr = (uintptr_t) properties,
   };
   int fd = gen_ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
   if (fd == -1) {
      DBG("Error opening gen perf OA stream: %m\n");
      return false;
   }

   perf_ctx->oa_stream_fd = fd;

   perf_ctx->current_oa_metrics_set_id = metrics_set_id;
   perf_ctx->current_oa_format = report_format;

   return true;
}
bool
gen_perf_inc_n_users(struct gen_perf_context *perf_ctx)
{
   if (perf_ctx->n_oa_users == 0 &&
       gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0)
   {
      return false;
   }
   ++perf_ctx->n_oa_users;

   return true;
}

void
gen_perf_dec_n_users(struct gen_perf_context *perf_ctx)
{
   /* Disabling the i915 perf stream will effectively disable the OA
    * counters.  Note it's important to be sure there are no outstanding
    * MI_RPC commands at this point since they could stall the CS
    * indefinitely once OACONTROL is disabled.
    */
   --perf_ctx->n_oa_users;
   if (perf_ctx->n_oa_users == 0 &&
       gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0)
   {
      DBG("WARNING: Error disabling gen perf stream: %m\n");
   }
}
void
gen_perf_init_context(struct gen_perf_context *perf_ctx,
                      struct gen_perf_config *perf_cfg,
                      void * ctx,  /* driver context (eg, brw_context) */
                      void * bufmgr,  /* eg brw_bufmgr */
                      const struct gen_device_info *devinfo,
                      uint32_t hw_ctx,
                      int drm_fd)
{
   perf_ctx->perf = perf_cfg;
   perf_ctx->ctx = ctx;
   perf_ctx->bufmgr = bufmgr;
   perf_ctx->drm_fd = drm_fd;
   perf_ctx->hw_ctx = hw_ctx;
   perf_ctx->devinfo = devinfo;

   perf_ctx->unaccumulated =
      ralloc_array(ctx, struct gen_perf_query_object *, 2);
   perf_ctx->unaccumulated_elements = 0;
   perf_ctx->unaccumulated_array_size = 2;

   exec_list_make_empty(&perf_ctx->sample_buffers);
   exec_list_make_empty(&perf_ctx->free_sample_buffers);

   /* It's convenient to guarantee that this linked list of sample
    * buffers is never empty so we add an empty head so when we
    * Begin an OA query we can always take a reference on a buffer
    * in this list.
    */
   struct oa_sample_buf *buf = gen_perf_get_free_sample_buf(perf_ctx);
   exec_list_push_head(&perf_ctx->sample_buffers, &buf->link);

   perf_ctx->oa_stream_fd = -1;
   perf_ctx->next_query_start_report_id = 1000;
}
/**
 * Add a query to the global list of "unaccumulated queries."
 *
 * Queries are tracked here until all the associated OA reports have
 * been accumulated via accumulate_oa_reports() after the end
 * MI_REPORT_PERF_COUNT has landed in query->oa.bo.
 */
static void
add_to_unaccumulated_query_list(struct gen_perf_context *perf_ctx,
                                struct gen_perf_query_object *obj)
{
   if (perf_ctx->unaccumulated_elements >=
       perf_ctx->unaccumulated_array_size)
   {
      perf_ctx->unaccumulated_array_size *= 1.5;
      perf_ctx->unaccumulated =
         reralloc(perf_ctx->ctx, perf_ctx->unaccumulated,
                  struct gen_perf_query_object *,
                  perf_ctx->unaccumulated_array_size);
   }

   perf_ctx->unaccumulated[perf_ctx->unaccumulated_elements++] = obj;
}
bool
gen_perf_begin_query(struct gen_perf_context *perf_ctx,
                     struct gen_perf_query_object *query)
{
   struct gen_perf_config *perf_cfg = perf_ctx->perf;
   const struct gen_perf_query_info *queryinfo = query->queryinfo;

   /* XXX: We have to consider that the command parser unit that parses batch
    * buffer commands and is used to capture begin/end counter snapshots isn't
    * implicitly synchronized with what's currently running across other GPU
    * units (such as the EUs running shaders) that the performance counters are
    * associated with.
    *
    * The intention of performance queries is to measure the work associated
    * with commands between the begin/end delimiters and so for that to be the
    * case we need to explicitly synchronize the parsing of commands to capture
    * Begin/End counter snapshots with what's running across other parts of the
    * GPU.
    *
    * When the command parser reaches a Begin marker it effectively needs to
    * drain everything currently running on the GPU until the hardware is idle
    * before capturing the first snapshot of counters - otherwise the results
    * would also be measuring the effects of earlier commands.
    *
    * When the command parser reaches an End marker it needs to stall until
    * everything currently running on the GPU has finished before capturing the
    * end snapshot - otherwise the results won't be a complete representation
    * of the work.
    *
    * Theoretically there could be opportunities to minimize how much of the
    * GPU pipeline is drained, or that we stall for, when we know what specific
    * units the performance counters being queried relate to but we don't
    * currently attempt to be clever here.
    *
    * Note: with our current simple approach here then for back-to-back queries
    * we will redundantly emit duplicate commands to synchronize the command
    * streamer with the rest of the GPU pipeline, but we assume that in HW the
    * second synchronization is effectively a NOOP.
    *
    * N.B. The final results are based on deltas of counters between (inside)
    * Begin/End markers so even though the total wall clock time of the
    * workload is stretched by larger pipeline bubbles the bubbles themselves
    * are generally invisible to the query results. Whether that's a good or a
    * bad thing depends on the use case. For a lower real-time impact while
    * capturing metrics then periodic sampling may be a better choice than
    * INTEL_performance_query.
    *
    * This is our Begin synchronization point to drain current work on the
    * GPU before we capture our first counter snapshot...
    */
   perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
   switch (queryinfo->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW: {

      /* Opening an i915 perf stream implies exclusive access to the OA unit
       * which will generate counter reports for a specific counter set with a
       * specific layout/format so we can't begin any OA based queries that
       * require a different counter set or format unless we get an opportunity
       * to close the stream and open a new one...
       */
      uint64_t metric_id = gen_perf_query_get_metric_id(perf_ctx->perf, queryinfo);

      if (perf_ctx->oa_stream_fd != -1 &&
          perf_ctx->current_oa_metrics_set_id != metric_id) {

         if (perf_ctx->n_oa_users != 0) {
            DBG("WARNING: Begin failed already using perf config=%i/%"PRIu64"\n",
                perf_ctx->current_oa_metrics_set_id, metric_id);
            return false;
         } else
            gen_perf_close(perf_ctx, queryinfo);
      }

      /* If the OA counters aren't already on, enable them. */
      if (perf_ctx->oa_stream_fd == -1) {
         const struct gen_device_info *devinfo = perf_ctx->devinfo;

         /* The period_exponent gives a sampling period as follows:
          *   sample_period = timestamp_period * 2^(period_exponent + 1)
          *
          * The timestamps increments every 80ns (HSW), ~52ns (GEN9LP) or
          * ~83ns (GEN8/9).
          *
          * The counter overflow period is derived from the EuActive counter
          * which reads a counter that increments by the number of clock
          * cycles multiplied by the number of EUs. It can be calculated as:
          *
          * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2)
          *
          * (E.g. 40 EUs @ 1GHz = ~53ms)
          *
          * We select a sampling period inferior to that overflow period to
          * ensure we cannot see more than 1 counter overflow, otherwise we
          * could lose information.
          */

         int a_counter_in_bits = 32;
         if (devinfo->gen >= 8)
            a_counter_in_bits = 40;

         uint64_t overflow_period = pow(2, a_counter_in_bits) / (perf_cfg->sys_vars.n_eus *
             /* drop 1GHz freq to have units in nanoseconds */
             2);

         DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
             overflow_period, overflow_period / 1000000ul, perf_cfg->sys_vars.n_eus);
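         /* E.g. with a 40-bit A counter and 48 EUs this gives
          * 2^40 / (48 * 2) ~= 11.4e9 ns, i.e. roughly 11 seconds between
          * overflows.
          */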
         int period_exponent = 0;
         uint64_t prev_sample_period, next_sample_period;
         for (int e = 0; e < 30; e++) {
            prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
            next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;

            /* Take the previous sampling period, lower than the overflow
             * period.
             */
            if (prev_sample_period < overflow_period &&
                next_sample_period > overflow_period)
               period_exponent = e + 1;
         }

         if (period_exponent == 0) {
            DBG("WARNING: unable to find a sampling exponent\n");
            return false;
         }

         DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
             prev_sample_period / 1000000ul);

         if (!gen_perf_open(perf_ctx, metric_id, queryinfo->oa_format,
                            period_exponent, perf_ctx->drm_fd,
                            perf_ctx->hw_ctx))
            return false;
      } else {
         assert(perf_ctx->current_oa_metrics_set_id == metric_id &&
                perf_ctx->current_oa_format == queryinfo->oa_format);
      }
      if (!gen_perf_inc_n_users(perf_ctx)) {
         DBG("WARNING: Error enabling i915 perf stream: %m\n");
         return false;
      }

      if (query->oa.bo) {
         perf_cfg->vtbl.bo_unreference(query->oa.bo);
         query->oa.bo = NULL;
      }

      query->oa.bo = perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
                                             "perf. query OA MI_RPC bo",
                                             MI_RPC_BO_SIZE);
#ifdef DEBUG
      /* Pre-filling the BO helps debug whether writes landed. */
      void *map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_WRITE);
      memset(map, 0x80, MI_RPC_BO_SIZE);
      perf_cfg->vtbl.bo_unmap(query->oa.bo);
#endif

      query->oa.begin_report_id = perf_ctx->next_query_start_report_id;
      perf_ctx->next_query_start_report_id += 2;

      /* We flush the batchbuffer here to minimize the chances that MI_RPC
       * delimiting commands end up in different batchbuffers. If that's the
       * case, the measurement will include the time it takes for the kernel
       * scheduler to load a new request into the hardware. This is manifested in
       * tools like frameretrace by spikes in the "GPU Core Clocks" counter.
       */
      perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);

      /* Take a starting OA counter snapshot. */
      perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, 0,
                                               query->oa.begin_report_id);
      perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo,
                                                     MI_FREQ_START_OFFSET_BYTES);

      ++perf_ctx->n_active_oa_queries;

      /* No already-buffered samples can possibly be associated with this query
       * so create a marker within the list of sample buffers enabling us to
       * easily ignore earlier samples when processing this query after
       * completion.
       */
      assert(!exec_list_is_empty(&perf_ctx->sample_buffers));
      query->oa.samples_head = exec_list_get_tail(&perf_ctx->sample_buffers);

      struct oa_sample_buf *buf =
         exec_node_data(struct oa_sample_buf, query->oa.samples_head, link);

      /* This reference will ensure that future/following sample
       * buffers (that may relate to this query) can't be freed until
       * this drops to zero.
       */
      buf->refcount++;

      gen_perf_query_result_clear(&query->oa.result);
      query->oa.results_accumulated = false;

      add_to_unaccumulated_query_list(perf_ctx, query);
      break;
   }

   case GEN_PERF_QUERY_TYPE_PIPELINE:
      if (query->pipeline_stats.bo) {
         perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
         query->pipeline_stats.bo = NULL;
      }

      query->pipeline_stats.bo =
         perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
                                 "perf. query pipeline stats bo",
                                 STATS_BO_SIZE);

      /* Take starting snapshots. */
      gen_perf_snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query, 0);

      ++perf_ctx->n_active_pipeline_stats_queries;
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   return true;
}
void
gen_perf_end_query(struct gen_perf_context *perf_ctx,
                   struct gen_perf_query_object *query)
{
   struct gen_perf_config *perf_cfg = perf_ctx->perf;

   /* Ensure that the work associated with the queried commands will have
    * finished before taking our query end counter readings.
    *
    * For more details see comment in brw_begin_perf_query for
    * corresponding flush.
    */
   perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);

   switch (query->queryinfo->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:

      /* NB: It's possible that the query will have already been marked
       * as 'accumulated' if an error was seen while reading samples
       * from perf. In this case we mustn't try and emit a closing
       * MI_RPC command in case the OA unit has already been disabled
       */
      if (!query->oa.results_accumulated) {
         /* Take an ending OA counter snapshot. */
         perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo,
                                                        MI_FREQ_END_OFFSET_BYTES);
         perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo,
                                                  MI_RPC_BO_END_OFFSET_BYTES,
                                                  query->oa.begin_report_id + 1);
      }

      --perf_ctx->n_active_oa_queries;

      /* NB: even though the query has now ended, it can't be accumulated
       * until the end MI_REPORT_PERF_COUNT snapshot has been written
       * to query->oa.bo
       */
      break;

   case GEN_PERF_QUERY_TYPE_PIPELINE:
      gen_perf_snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query,
                                             STATS_BO_END_OFFSET_BYTES);
      --perf_ctx->n_active_pipeline_stats_queries;
      break;

   default:
      unreachable("Unknown query type");
      break;
   }
}
enum OaReadStatus {
   OA_READ_STATUS_ERROR,
   OA_READ_STATUS_UNFINISHED,
   OA_READ_STATUS_FINISHED,
};
static enum OaReadStatus
read_oa_samples_until(struct gen_perf_context *perf_ctx,
                      uint32_t start_timestamp,
                      uint32_t end_timestamp)
{
   struct exec_node *tail_node =
      exec_list_get_tail(&perf_ctx->sample_buffers);
   struct oa_sample_buf *tail_buf =
      exec_node_data(struct oa_sample_buf, tail_node, link);
   uint32_t last_timestamp = tail_buf->last_timestamp;

   while (1) {
      struct oa_sample_buf *buf = gen_perf_get_free_sample_buf(perf_ctx);
      uint32_t offset;
      int len;

      while ((len = read(perf_ctx->oa_stream_fd, buf->buf,
                         sizeof(buf->buf))) < 0 && errno == EINTR)
         ;

      if (len <= 0) {
         exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link);

         if (len < 0) {
            if (errno == EAGAIN)
               return ((last_timestamp - start_timestamp) >=
                       (end_timestamp - start_timestamp)) ?
                      OA_READ_STATUS_FINISHED :
                      OA_READ_STATUS_UNFINISHED;
            else {
               DBG("Error reading i915 perf samples: %m\n");
            }
         } else
            DBG("Spurious EOF reading i915 perf samples\n");

         return OA_READ_STATUS_ERROR;
      }

      buf->len = len;
      exec_list_push_tail(&perf_ctx->sample_buffers, &buf->link);

      /* Go through the reports and update the last timestamp. */
      offset = 0;
      while (offset < buf->len) {
         const struct drm_i915_perf_record_header *header =
            (const struct drm_i915_perf_record_header *) &buf->buf[offset];
         uint32_t *report = (uint32_t *) (header + 1);

         if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
            last_timestamp = report[1];

         offset += header->size;
      }

      buf->last_timestamp = last_timestamp;
   }

   unreachable("not reached");
   return OA_READ_STATUS_ERROR;
}
/**
 * Try to read all the reports until either the delimiting timestamp
 * or an error arises.
 */
static bool
read_oa_samples_for_query(struct gen_perf_context *perf_ctx,
                          struct gen_perf_query_object *query,
                          void *current_batch)
{
   uint32_t *start;
   uint32_t *last;
   uint32_t *end;
   struct gen_perf_config *perf_cfg = perf_ctx->perf;

   /* We need the MI_REPORT_PERF_COUNT to land before we can start
    * accumulate. */
   assert(!perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
          !perf_cfg->vtbl.bo_busy(query->oa.bo));

   /* Map the BO once here and let accumulate_oa_reports() unmap
    * it. */
   if (query->oa.map == NULL)
      query->oa.map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_READ);

   start = last = query->oa.map;
   end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;

   if (start[0] != query->oa.begin_report_id) {
      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
      return true;
   }
   if (end[0] != (query->oa.begin_report_id + 1)) {
      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
      return true;
   }

   /* Read the reports until the end timestamp. */
   switch (read_oa_samples_until(perf_ctx, start[1], end[1])) {
   case OA_READ_STATUS_ERROR:
      /* Fallthrough and let accumulate_oa_reports() deal with the
       * error. */
   case OA_READ_STATUS_FINISHED:
      return true;
   case OA_READ_STATUS_UNFINISHED:
      return false;
   }

   unreachable("invalid read status");
   return false;
}
void
gen_perf_wait_query(struct gen_perf_context *perf_ctx,
                    struct gen_perf_query_object *query,
                    void *current_batch)
{
   struct gen_perf_config *perf_cfg = perf_ctx->perf;
   struct brw_bo *bo = NULL;

   switch (query->queryinfo->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:
      bo = query->oa.bo;
      break;

   case GEN_PERF_QUERY_TYPE_PIPELINE:
      bo = query->pipeline_stats.bo;
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   if (bo == NULL)
      return;

   /* If the current batch references our results bo then we need to
    * flush first...
    */
   if (perf_cfg->vtbl.batch_references(current_batch, bo))
      perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);

   perf_cfg->vtbl.bo_wait_rendering(bo);

   /* Due to a race condition between the OA unit signaling report
    * availability and the report actually being written into memory,
    * we need to wait for all the reports to come in before we can
    * read them.
    */
   if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA ||
       query->queryinfo->kind == GEN_PERF_QUERY_TYPE_RAW) {
      while (!read_oa_samples_for_query(perf_ctx, query, current_batch))
         ;
   }
}
bool
gen_perf_is_query_ready(struct gen_perf_context *perf_ctx,
                        struct gen_perf_query_object *query,
                        void *current_batch)
{
   struct gen_perf_config *perf_cfg = perf_ctx->perf;

   switch (query->queryinfo->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:
      return (query->oa.results_accumulated ||
              (query->oa.bo &&
               !perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
               !perf_cfg->vtbl.bo_busy(query->oa.bo) &&
               read_oa_samples_for_query(perf_ctx, query, current_batch)));
   case GEN_PERF_QUERY_TYPE_PIPELINE:
      return (query->pipeline_stats.bo &&
              !perf_cfg->vtbl.batch_references(current_batch, query->pipeline_stats.bo) &&
              !perf_cfg->vtbl.bo_busy(query->pipeline_stats.bo));

   default:
      unreachable("Unknown query type");
      break;
   }

   return false;
}
/**
 * Remove a query from the global list of unaccumulated queries once
 * after successfully accumulating the OA reports associated with the
 * query in accumulate_oa_reports() or when discarding unwanted query
 * results.
 */
static void
drop_from_unaccumulated_query_list(struct gen_perf_context *perf_ctx,
                                   struct gen_perf_query_object *query)
{
   for (int i = 0; i < perf_ctx->unaccumulated_elements; i++) {
      if (perf_ctx->unaccumulated[i] == query) {
         int last_elt = --perf_ctx->unaccumulated_elements;

         if (i == last_elt)
            perf_ctx->unaccumulated[i] = NULL;
         else {
            perf_ctx->unaccumulated[i] =
               perf_ctx->unaccumulated[last_elt];
         }

         break;
      }
   }

   /* Drop our samples_head reference so that associated periodic
    * sample data buffers can potentially be reaped if they aren't
    * referenced by any other queries...
    */

   struct oa_sample_buf *buf =
      exec_node_data(struct oa_sample_buf, query->oa.samples_head, link);

   assert(buf->refcount > 0);
   buf->refcount--;

   query->oa.samples_head = NULL;

   gen_perf_reap_old_sample_buffers(perf_ctx);
}
/* In general if we see anything spurious while accumulating results,
 * we don't try and continue accumulating the current query, hoping
 * for the best, we scrap anything outstanding, and then hope for the
 * best with new queries.
 */
static void
discard_all_queries(struct gen_perf_context *perf_ctx)
{
   while (perf_ctx->unaccumulated_elements) {
      struct gen_perf_query_object *query = perf_ctx->unaccumulated[0];

      query->oa.results_accumulated = true;
      drop_from_unaccumulated_query_list(perf_ctx, query);

      gen_perf_dec_n_users(perf_ctx);
   }
}
/**
 * Accumulate raw OA counter values based on deltas between pairs of
 * OA reports.
 *
 * Accumulation starts from the first report captured via
 * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the
 * last MI_RPC report requested by brw_end_perf_query(). Between these
 * two reports there may also be some number of periodically sampled OA
 * reports collected via the i915 perf interface - depending on the
 * duration of the query.
 *
 * These periodic snapshots help to ensure we handle counter overflow
 * correctly by being frequent enough to ensure we don't miss multiple
 * overflows of a counter between snapshots. For Gen8+ the i915 perf
 * snapshots provide the extra context-switch reports that let us
 * subtract out the progress of counters associated with other
 * contexts running on the system.
 */
static void
accumulate_oa_reports(struct gen_perf_context *perf_ctx,
                      struct gen_perf_query_object *query)
{
   const struct gen_device_info *devinfo = perf_ctx->devinfo;
   uint32_t *start;
   uint32_t *last;
   uint32_t *end;
   struct exec_node *first_samples_node;
   bool in_ctx = true;
   int out_duration = 0;

   assert(query->oa.map != NULL);

   start = last = query->oa.map;
   end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;

   if (start[0] != query->oa.begin_report_id) {
      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
      goto error;
   }
   if (end[0] != (query->oa.begin_report_id + 1)) {
      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
      goto error;
   }

   /* See if we have any periodic reports to accumulate too... */

   /* N.B. The oa.samples_head was set when the query began and
    * pointed to the tail of the perf_ctx->sample_buffers list at
    * the time the query started. Since the buffer existed before the
    * first MI_REPORT_PERF_COUNT command was emitted we therefore know
    * that no data in this particular node's buffer can possibly be
    * associated with the query - so skip ahead one...
    */
   first_samples_node = query->oa.samples_head->next;

   foreach_list_typed_from(struct oa_sample_buf, buf, link,
                           &perf_ctx->sample_buffers,
                           first_samples_node)
   {
      int offset = 0;

      while (offset < buf->len) {
         const struct drm_i915_perf_record_header *header =
            (const struct drm_i915_perf_record_header *)(buf->buf + offset);

         assert(header->size != 0);
         assert(header->size <= buf->len);

         offset += header->size;

         switch (header->type) {
         case DRM_I915_PERF_RECORD_SAMPLE: {
            uint32_t *report = (uint32_t *)(header + 1);
            bool add = true;

            /* Ignore reports that come before the start marker.
             * (Note: takes care to allow overflow of 32bit timestamps)
             */
            if (gen_device_info_timebase_scale(devinfo,
                                               report[1] - start[1]) > 5000000000) {
               continue;
            }

            /* Ignore reports that come after the end marker.
             * (Note: takes care to allow overflow of 32bit timestamps)
             */
            if (gen_device_info_timebase_scale(devinfo,
                                               report[1] - end[1]) <= 5000000000) {
               goto end;
            }

            /* For Gen8+ since the counters continue while other
             * contexts are running we need to discount any unrelated
             * deltas. The hardware automatically generates a report
             * on context switch which gives us a new reference point
             * to continue adding deltas from.
             *
             * For Haswell we can rely on the HW to stop the progress
             * of OA counters while any other context is active.
             */
            if (devinfo->gen >= 8) {
               if (in_ctx && report[2] != query->oa.result.hw_id) {
                  DBG("i915 perf: Switch AWAY (observed by ID change)\n");
                  in_ctx = false;
                  out_duration = 0;
               } else if (in_ctx == false && report[2] == query->oa.result.hw_id) {
                  DBG("i915 perf: Switch TO\n");
                  in_ctx = true;

                  /* From experimentation in IGT, we found that the OA unit
                   * might label some report as "idle" (using an invalid
                   * context ID), right after a report for a given context.
                   * Deltas generated by those reports actually belong to the
                   * previous context, even though they're not labelled as
                   * belonging to that context.
                   *
                   * We didn't *really* Switch AWAY in the case that we e.g.
                   * saw a single periodic report while idle...
                   */
                  if (out_duration >= 1)
                     add = false;
               } else if (in_ctx) {
                  assert(report[2] == query->oa.result.hw_id);
                  DBG("i915 perf: Continuation IN\n");
               } else {
                  assert(report[2] != query->oa.result.hw_id);
                  DBG("i915 perf: Continuation OUT\n");
                  add = false;
                  out_duration++;
               }
            }

            if (add) {
               gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo,
                                                last, report);
            }

            last = report;

            break;
         }

         case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
            DBG("i915 perf: OA error: all reports lost\n");
            goto error;
         case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
            DBG("i915 perf: OA report lost\n");
            break;
         }
      }
   }

end:

   gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo,
                                    last, end);

   query->oa.results_accumulated = true;
   drop_from_unaccumulated_query_list(perf_ctx, query);
   gen_perf_dec_n_users(perf_ctx);

   return;

error:

   discard_all_queries(perf_ctx);
}
void
gen_perf_delete_query(struct gen_perf_context *perf_ctx,
                      struct gen_perf_query_object *query)
{
   struct gen_perf_config *perf_cfg = perf_ctx->perf;

   /* We can assume that the frontend waits for a query to complete
    * before ever calling into here, so we don't have to worry about
    * deleting an in-flight query object.
    */
   switch (query->queryinfo->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:
      if (query->oa.bo) {
         if (!query->oa.results_accumulated) {
            drop_from_unaccumulated_query_list(perf_ctx, query);
            gen_perf_dec_n_users(perf_ctx);
         }

         perf_cfg->vtbl.bo_unreference(query->oa.bo);
         query->oa.bo = NULL;
      }

      query->oa.results_accumulated = false;
      break;

   case GEN_PERF_QUERY_TYPE_PIPELINE:
      if (query->pipeline_stats.bo) {
         perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
         query->pipeline_stats.bo = NULL;
      }
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   /* As an indication that the INTEL_performance_query extension is no
    * longer in use, it's a good time to free our cache of sample
    * buffers and close any current i915-perf stream.
    */
   if (--perf_ctx->n_query_instances == 0) {
      gen_perf_free_sample_bufs(perf_ctx);
      gen_perf_close(perf_ctx, query->queryinfo);
   }

   free(query);
}
#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT)
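/* E.g. GET_FIELD(reg, GEN9_RPSTAT0_CURR_GT_FREQ) masks bits [31:23] and
 * shifts them down to yield the current GT frequency ratio.
 */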
static void
read_gt_frequency(struct gen_perf_context *perf_ctx,
                  struct gen_perf_query_object *obj)
{
   const struct gen_device_info *devinfo = perf_ctx->devinfo;
   uint32_t start = *((uint32_t *)(obj->oa.map + MI_FREQ_START_OFFSET_BYTES)),
      end = *((uint32_t *)(obj->oa.map + MI_FREQ_END_OFFSET_BYTES));

   switch (devinfo->gen) {
   case 7:
   case 8:
      obj->oa.gt_frequency[0] = GET_FIELD(start, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
      obj->oa.gt_frequency[1] = GET_FIELD(end, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
      break;
   case 9:
   case 10:
   case 11:
      obj->oa.gt_frequency[0] = GET_FIELD(start, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
      obj->oa.gt_frequency[1] = GET_FIELD(end, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
      break;
   default:
      unreachable("unexpected gen");
   }

   /* Put the numbers into Hz. */
   obj->oa.gt_frequency[0] *= 1000000ULL;
   obj->oa.gt_frequency[1] *= 1000000ULL;
}
static int
get_oa_counter_data(struct gen_perf_context *perf_ctx,
                    struct gen_perf_query_object *query,
                    size_t data_size,
                    uint8_t *data)
{
   struct gen_perf_config *perf_cfg = perf_ctx->perf;
   const struct gen_perf_query_info *queryinfo = query->queryinfo;
   int n_counters = queryinfo->n_counters;
   int written = 0;

   for (int i = 0; i < n_counters; i++) {
      const struct gen_perf_query_counter *counter = &queryinfo->counters[i];
      uint64_t *out_uint64;
      float *out_float;
      size_t counter_size = gen_perf_query_counter_get_size(counter);

      if (counter_size) {
         switch (counter->data_type) {
         case GEN_PERF_COUNTER_DATA_TYPE_UINT64:
            out_uint64 = (uint64_t *)(data + counter->offset);
            *out_uint64 =
               counter->oa_counter_read_uint64(perf_cfg, queryinfo,
                                               query->oa.result.accumulator);
            break;
         case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
            out_float = (float *)(data + counter->offset);
            *out_float =
               counter->oa_counter_read_float(perf_cfg, queryinfo,
                                              query->oa.result.accumulator);
            break;
         default:
            /* So far we aren't using uint32, double or bool32... */
            unreachable("unexpected counter data type");
         }
         written = counter->offset + counter_size;
      }
   }

   return written;
}
static int
get_pipeline_stats_data(struct gen_perf_context *perf_ctx,
                        struct gen_perf_query_object *query,
                        size_t data_size,
                        uint8_t *data)
{
   struct gen_perf_config *perf_cfg = perf_ctx->perf;
   const struct gen_perf_query_info *queryinfo = query->queryinfo;
   int n_counters = queryinfo->n_counters;
   uint8_t *p = data;

   uint64_t *start = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->pipeline_stats.bo, MAP_READ);
   uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));

   for (int i = 0; i < n_counters; i++) {
      const struct gen_perf_query_counter *counter = &queryinfo->counters[i];
      uint64_t value = end[i] - start[i];

      if (counter->pipeline_stat.numerator !=
          counter->pipeline_stat.denominator) {
         value *= counter->pipeline_stat.numerator;
         value /= counter->pipeline_stat.denominator;
      }

      *((uint64_t *)p) = value;
      p += 8;
   }

   perf_cfg->vtbl.bo_unmap(query->pipeline_stats.bo);

   return p - data;
}
void
gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
                        struct gen_perf_query_object *query,
                        int data_size,
                        unsigned *data,
                        unsigned *bytes_written)
{
   struct gen_perf_config *perf_cfg = perf_ctx->perf;
   int written = 0;

   switch (query->queryinfo->kind) {
   case GEN_PERF_QUERY_TYPE_OA:
   case GEN_PERF_QUERY_TYPE_RAW:
      if (!query->oa.results_accumulated) {
         read_gt_frequency(perf_ctx, query);
         uint32_t *begin_report = query->oa.map;
         uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
         gen_perf_query_result_read_frequencies(&query->oa.result,
                                                perf_ctx->devinfo,
                                                begin_report,
                                                end_report);
         accumulate_oa_reports(perf_ctx, query);
         assert(query->oa.results_accumulated);

         perf_cfg->vtbl.bo_unmap(query->oa.bo);
         query->oa.map = NULL;
      }
      if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA) {
         written = get_oa_counter_data(perf_ctx, query, data_size, (uint8_t *)data);
      } else {
         const struct gen_device_info *devinfo = perf_ctx->devinfo;

         written = gen_perf_query_result_write_mdapi((uint8_t *)data, data_size,
                                                     devinfo, &query->oa.result,
                                                     query->oa.gt_frequency[0],
                                                     query->oa.gt_frequency[1]);
      }
      break;

   case GEN_PERF_QUERY_TYPE_PIPELINE:
      written = get_pipeline_stats_data(perf_ctx, query, data_size, (uint8_t *)data);
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   if (bytes_written)
      *bytes_written = written;
}