st/vdpau: Only call is_video_format_supported hook if needed

[mesa.git] / src / intel / perf / gen_perf.c
diff --git a/src/intel/perf/gen_perf.c b/src/intel/perf/gen_perf.c

index 2d1a58bae9e1901ad55ed3a11ab03af957f2fcb3..ceb10f5af120a75384c27bfaec73c7920bc8bf0c 100644 (file)
--- a/src/intel/perf/gen_perf.c
+++ b/src/intel/perf/gen_perf.c
@@ -29,16 +29,22 @@
  #include <unistd.h>
  #include <errno.h>
  
+#ifndef HAVE_DIRENT_D_TYPE
+#include <limits.h> // PATH_MAX
+#endif
+
  #include <drm-uapi/i915_drm.h>
  
  #include "common/gen_gem.h"
  #include "gen_perf.h"
+#include "gen_perf_regs.h"
  #include "perf/gen_perf_mdapi.h"
  #include "perf/gen_perf_metrics.h"
  
  #include "dev/gen_debug.h"
  #include "dev/gen_device_info.h"
  #include "util/bitscan.h"
+#include "util/mesa-sha1.h"
  #include "util/u_math.h"
  
  #define FILE_DEBUG_FLAG DEBUG_PERFMON
@@ -69,6 +75,8 @@
  #define MAP_READ  (1 << 0)
  #define MAP_WRITE (1 << 1)
  
+#define OA_REPORT_INVALID_CTX_ID (0xffffffff)
+
  /**
   * Periodic OA samples are read() into these buffer structures via the
   * i915 perf kernel interface and appended to the
@@ -189,6 +197,164 @@ struct oa_sample_buf {
     uint32_t last_timestamp;
  };
  
+/**
+ * gen representation of a performance query object.
+ *
+ * NB: We want to keep this structure relatively lean considering that
+ * applications may expect to allocate enough objects to be able to
+ * query around all draw calls in a frame.
+ */
+struct gen_perf_query_object
+{
+   const struct gen_perf_query_info *queryinfo;
+
+   /* See queryinfo->kind to know which state below is in use... */
+   union {
+      struct {
+
+         /**
+          * BO containing OA counter snapshots at query Begin/End time.
+          */
+         void *bo;
+
+         /**
+          * Address at which @bo is mapped
+          */
+         void *map;
+
+         /**
+          * The MI_REPORT_PERF_COUNT command lets us specify a unique
+          * ID that will be reflected in the resulting OA report
+          * that's written by the GPU. This is the ID we're expecting
+          * in the begin report; the end report should carry
+          * @begin_report_id + 1.
+          */
+         int begin_report_id;
+
+         /**
+          * Reference the head of the perf context's sample_buffers
+          * list at the time that the query started (so we only need
+          * to look at nodes after this point when looking for samples
+          * related to this query)
+          *
+          * (See struct oa_sample_buf description for more details)
+          */
+         struct exec_node *samples_head;
+
+         /**
+          * false while in the perf context's unaccumulated array, and
+          * set to true when the final, end MI_RPC snapshot has been
+          * accumulated.
+          */
+         bool results_accumulated;
+
+         /**
+          * Frequency of the GT at begin and end of the query.
+          */
+         uint64_t gt_frequency[2];
+
+         /**
+          * Accumulated OA results between begin and end of the query.
+          */
+         struct gen_perf_query_result result;
+      } oa;
+
+      struct {
+         /**
+          * BO containing starting and ending snapshots for the
+          * statistics counters.
+          */
+         void *bo;
+      } pipeline_stats;
+   };
+};
+
+struct gen_perf_context {
+   struct gen_perf_config *perf;
+
+   void * ctx;  /* driver context (eg, brw_context) */
+   void * bufmgr;
+   const struct gen_device_info *devinfo;
+
+   uint32_t hw_ctx;
+   int drm_fd;
+
+   /* The i915 perf stream we open to setup + enable the OA counters */
+   int oa_stream_fd;
+
+   /* An i915 perf stream fd gives exclusive access to the OA unit that will
+    * report counter snapshots for a specific counter set/profile in a
+    * specific layout/format so we can only start OA queries that are
+    * compatible with the currently open fd...
+    */
+   int current_oa_metrics_set_id;
+   int current_oa_format;
+
+   /* List of buffers containing OA reports */
+   struct exec_list sample_buffers;
+
+   /* Cached list of empty sample buffers */
+   struct exec_list free_sample_buffers;
+
+   int n_active_oa_queries;
+   int n_active_pipeline_stats_queries;
+
+   /* The number of queries depending on running OA counters which
+    * extends beyond gen_perf_end_query() since we need to wait until
+    * the last MI_RPC command has been parsed by the GPU.
+    *
+    * Accurate accounting is important here as emitting an
+    * MI_REPORT_PERF_COUNT command while the OA unit is disabled will
+    * effectively hang the gpu.
+    */
+   int n_oa_users;
+
+   /* To help catch a spurious problem with the hardware or perf
+    * forwarding samples, we emit each MI_REPORT_PERF_COUNT command
+    * with a unique ID that we can explicitly check for...
+    */
+   int next_query_start_report_id;
+
+   /**
+    * An array of queries whose results haven't yet been assembled
+    * based on the data in buffer objects.
+    *
+    * These may be active, or have already ended.  However, the
+    * results have not been requested.
+    */
+   struct gen_perf_query_object **unaccumulated;
+   int unaccumulated_elements;
+   int unaccumulated_array_size;
+
+   /* The total number of query objects so we can relinquish
+    * our exclusive access to perf if the application deletes
+    * all of its objects. (NB: We only disable perf while
+    * there are no active queries)
+    */
+   int n_query_instances;
+};
+
+const struct gen_perf_query_info*
+gen_perf_query_info(const struct gen_perf_query_object *query)
+{
+   return query->queryinfo; /* accessor: the query object's layout is private to this file */
+}
+
+struct gen_perf_context *
+gen_perf_new_context(void *parent)
+{
+   struct gen_perf_context *ctx = rzalloc(parent, struct gen_perf_context); /* ralloc'd with parent as its memory context */
+   if (! ctx)
+      fprintf(stderr, "%s: failed to alloc context\n", __func__);
+   return ctx; /* NULL on allocation failure; the failure is only logged, so callers must check */
+}
+
+struct gen_perf_config *
+gen_perf_config(struct gen_perf_context *ctx)
+{
+   return ctx->perf; /* accessor: hands back the config pointer stored in the context */
+}
+
  struct gen_perf_query_object *
  gen_perf_new_query(struct gen_perf_context *perf_ctx, unsigned query_index)
  {
@@ -206,6 +372,47 @@ gen_perf_new_query(struct gen_perf_context *perf_ctx, unsigned query_index)
     return obj;
  }
  
+int
+gen_perf_active_queries(struct gen_perf_context *perf_ctx,
+                        const struct gen_perf_query_info *query)
+{
+   assert(perf_ctx->n_active_oa_queries == 0 || perf_ctx->n_active_pipeline_stats_queries == 0); /* at most one of the two counters may be non-zero */
+
+   switch (query->kind) { /* report the counter that matches this query's kind */
+   case GEN_PERF_QUERY_TYPE_OA:
+   case GEN_PERF_QUERY_TYPE_RAW: /* OA and RAW queries share the OA counter */
+      return perf_ctx->n_active_oa_queries;
+      break;
+
+   case GEN_PERF_QUERY_TYPE_PIPELINE:
+      return perf_ctx->n_active_pipeline_stats_queries;
+      break;
+
+   default:
+      unreachable("Unknown query type");
+      break;
+   }
+}
+
+static inline uint64_t to_user_pointer(void *ptr)
+{
+   return (uintptr_t) ptr; /* pointer -> integer, for the 64-bit *_ptr fields of the i915 ioctl structs */
+}
+
+static bool
+is_dir_or_link(const struct dirent *entry, const char *parent_dir)
+{
+#ifdef HAVE_DIRENT_D_TYPE
+   return entry->d_type == DT_DIR || entry->d_type == DT_LNK;
+#else
+   struct stat st;
+   char path[PATH_MAX + 1];
+   snprintf(path, sizeof(path), "%s/%s", parent_dir, entry->d_name);
+   /* st is only valid when lstat() succeeds; a failed lookup is neither a dir nor a link. */
+   return lstat(path, &st) == 0 && (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode));
+#endif
+}
+
  static bool
  get_sysfs_dev_dir(struct gen_perf_config *perf, int fd)
  {
@@ -245,8 +452,7 @@ get_sysfs_dev_dir(struct gen_perf_config *perf, int fd)
     }
  
     while ((drm_entry = readdir(drmdir))) {
-      if ((drm_entry->d_type == DT_DIR ||
-           drm_entry->d_type == DT_LNK) &&
+      if (is_dir_or_link(drm_entry, perf->sysfs_dev_dir) &&
            strncmp(drm_entry->d_name, "card", 4) == 0)
        {
           len = snprintf(perf->sysfs_dev_dir,
@@ -332,12 +538,12 @@ register_oa_config(struct gen_perf_config *perf,
                     const struct gen_perf_query_info *query,
                     uint64_t config_id)
  {
-   struct gen_perf_query_info *registred_query = append_query_info(perf, 0);
+   struct gen_perf_query_info *registered_query = append_query_info(perf, 0);
  
-   *registred_query = *query;
-   registred_query->oa_metrics_set_id = config_id;
-   DBG("metric set registred: id = %" PRIu64", guid = %s\n",
-       registred_query->oa_metrics_set_id, query->guid);
+   *registered_query = *query;
+   registered_query->oa_metrics_set_id = config_id;
+   DBG("metric set registered: id = %" PRIu64", guid = %s\n",
+       registered_query->oa_metrics_set_id, query->guid);
  }
  
  static void
@@ -362,9 +568,7 @@ enumerate_sysfs_metrics(struct gen_perf_config *perf)
  
     while ((metric_entry = readdir(metricsdir))) {
        struct hash_entry *entry;
-
-      if ((metric_entry->d_type != DT_DIR &&
-           metric_entry->d_type != DT_LNK) ||
+      if (!is_dir_or_link(metric_entry, buf) ||
            metric_entry->d_name[0] == '.')
           continue;
  
@@ -373,15 +577,7 @@ enumerate_sysfs_metrics(struct gen_perf_config *perf)
                                        metric_entry->d_name);
        if (entry) {
           uint64_t id;
-
-         len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id",
-                        perf->sysfs_dev_dir, metric_entry->d_name);
-         if (len < 0 || len >= sizeof(buf)) {
-            DBG("Failed to concatenate path to sysfs metric id file\n");
-            continue;
-         }
-
-         if (!read_file_uint64(buf, &id)) {
+         if (!gen_perf_load_metric_id(perf, metric_entry->d_name, &id)) {
              DBG("Failed to read metric set id from %s: %m", buf);
              continue;
           }
@@ -403,48 +599,106 @@ kernel_has_dynamic_config_support(struct gen_perf_config *perf, int fd)
                      &invalid_config_id) < 0 && errno == ENOENT;
  }
  
+static int
+i915_query_items(struct gen_perf_config *perf, int fd,
+                 struct drm_i915_query_item *items, uint32_t n_items)
+{
+   struct drm_i915_query q = {
+      .num_items = n_items,
+      .items_ptr = to_user_pointer(items),
+   };
+   return gen_ioctl(fd, DRM_IOCTL_I915_QUERY, &q); /* ioctl result is passed through unchanged; perf is not used here */
+}
+
+static bool
+i915_query_perf_config_supported(struct gen_perf_config *perf, int fd)
+{
+   struct drm_i915_query_item item = {
+      .query_id = DRM_I915_QUERY_PERF_CONFIG,
+      .flags = DRM_I915_QUERY_PERF_CONFIG_LIST,
+   };
+
+   return i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0; /* true only if the ioctl succeeds and leaves a positive length in the item */
+}
+
  static bool
-load_metric_id(struct gen_perf_config *perf, const char *guid,
-               uint64_t *metric_id)
+i915_query_perf_config_data(struct gen_perf_config *perf,
+                            int fd, const char *guid,
+                            struct drm_i915_perf_oa_config *config)
+{
+   struct {
+      struct drm_i915_query_perf_config query;
+      struct drm_i915_perf_oa_config config;
+   } item_data; /* query header with the config payload placed directly after it */
+   struct drm_i915_query_item item = {
+      .query_id = DRM_I915_QUERY_PERF_CONFIG,
+      .flags = DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID,
+      .data_ptr = to_user_pointer(&item_data),
+      .length = sizeof(item_data),
+   };
+
+   memset(&item_data, 0, sizeof(item_data));
+   memcpy(item_data.query.uuid, guid, sizeof(item_data.query.uuid)); /* reads sizeof(uuid) bytes from guid */
+   memcpy(&item_data.config, config, sizeof(item_data.config)); /* seed with the caller's config, including any register array pointers it set */
+
+   if (!(i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0))
+      return false; /* ioctl failed or left a non-positive length; *config is left untouched */
+
+   memcpy(config, &item_data.config, sizeof(item_data.config)); /* copy the queried config back to the caller */
+
+   return true;
+}
+
+bool
+gen_perf_load_metric_id(struct gen_perf_config *perf_cfg,
+                        const char *guid,
+                        uint64_t *metric_id)
  {
     char config_path[280];
  
     snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id",
-            perf->sysfs_dev_dir, guid);
+            perf_cfg->sysfs_dev_dir, guid); /* path read: <sysfs_dev_dir>/metrics/<guid>/id */
  
     /* Don't recreate already loaded configs. */
     return read_file_uint64(config_path, metric_id);
  }
  
+static uint64_t
+i915_add_config(struct gen_perf_config *perf, int fd,
+                const struct gen_perf_registers *config,
+                const char *guid)
+{
+   struct drm_i915_perf_oa_config i915_config = { 0, };
+
+   memcpy(i915_config.uuid, guid, sizeof(i915_config.uuid)); /* reads sizeof(uuid) bytes from guid */
+
+   i915_config.n_mux_regs = config->n_mux_regs;
+   i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs);
+
+   i915_config.n_boolean_regs = config->n_b_counter_regs; /* "boolean" regs in the uAPI are the b_counter regs here */
+   i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs);
+
+   i915_config.n_flex_regs = config->n_flex_regs;
+   i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs);
+
+   int ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &i915_config);
+   return ret > 0 ? ret : 0; /* a positive ioctl result is returned as the id; any failure (ret <= 0) becomes 0, never a negative value */
+}
+
  static void
  init_oa_configs(struct gen_perf_config *perf, int fd)
  {
     hash_table_foreach(perf->oa_metrics_table, entry) {
        const struct gen_perf_query_info *query = entry->data;
-      struct drm_i915_perf_oa_config config;
        uint64_t config_id;
-      int ret;
  
-      if (load_metric_id(perf, query->guid, &config_id)) {
+      if (gen_perf_load_metric_id(perf, query->guid, &config_id)) {
           DBG("metric set: %s (already loaded)\n", query->guid);
           register_oa_config(perf, query, config_id);
           continue;
        }
  
-      memset(&config, 0, sizeof(config));
-
-      memcpy(config.uuid, query->guid, sizeof(config.uuid));
-
-      config.n_mux_regs = query->n_mux_regs;
-      config.mux_regs_ptr = (uintptr_t) query->mux_regs;
-
-      config.n_boolean_regs = query->n_b_counter_regs;
-      config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs;
-
-      config.n_flex_regs = query->n_flex_regs;
-      config.flex_regs_ptr = (uintptr_t) query->flex_regs;
-
-      ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
+      int ret = i915_add_config(perf, fd, &query->config, query->guid);
        if (ret < 0) {
           DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
               query->name, query->guid, strerror(errno));
@@ -550,8 +804,13 @@ get_register_queries_function(const struct gen_device_info *devinfo)
     }
     if (devinfo->is_cannonlake)
        return gen_oa_register_queries_cnl;
-   if (devinfo->gen == 11)
+   if (devinfo->gen == 11) {
+      if (devinfo->is_elkhartlake)
+         return gen_oa_register_queries_lkf;
        return gen_oa_register_queries_icl;
+   }
+   if (devinfo->gen == 12)
+      return gen_oa_register_queries_tgl;
  
     return NULL;
  }
@@ -679,6 +938,8 @@ load_oa_metrics(struct gen_perf_config *perf, int fd,
     bool i915_perf_oa_available = false;
     struct stat sb;
  
+   perf->i915_query_supported = i915_query_perf_config_supported(perf, fd);
+
     /* The existence of this sysctl parameter implies the kernel supports
      * the i915 perf interface.
      */
@@ -706,7 +967,7 @@ load_oa_metrics(struct gen_perf_config *perf, int fd,
        return false;
  
     perf->oa_metrics_table =
-      _mesa_hash_table_create(perf, _mesa_key_hash_string,
+      _mesa_hash_table_create(perf, _mesa_hash_string,
                                _mesa_key_string_equal);
  
     /* Index all the metric sets mesa knows about before looking to see what
@@ -723,6 +984,87 @@ load_oa_metrics(struct gen_perf_config *perf, int fd,
     return true;
  }
  
+struct gen_perf_registers *
+gen_perf_load_configuration(struct gen_perf_config *perf_cfg, int fd, const char *guid)
+{
+   if (!perf_cfg->i915_query_supported)
+      return NULL;
+
+   struct drm_i915_perf_oa_config i915_config = { 0, };
+   if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) /* 1st pass: no arrays supplied, only the register counts are used */
+      return NULL;
+
+   struct gen_perf_registers *config = rzalloc(NULL, struct gen_perf_registers);
+   if (!config) /* allocation failed: bail out instead of dereferencing NULL below */
+      return NULL;
+   config->n_flex_regs = i915_config.n_flex_regs;
+   config->flex_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_flex_regs);
+   config->n_mux_regs = i915_config.n_mux_regs;
+   config->mux_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_mux_regs);
+   config->n_b_counter_regs = i915_config.n_boolean_regs;
+   config->b_counter_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_b_counter_regs);
+
+   /* struct gen_perf_query_register_prog maps exactly to the tuple of
+    * (register offset, register value) returned by the i915. */
+   i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs);
+   i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs);
+   i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs);
+   if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) { /* 2nd pass: same query, now with the arrays attached */
+      ralloc_free(config);
+      return NULL;
+   }
+
+   return config; /* allocated with a NULL ralloc parent; the register arrays are allocated under config */
+}
+
+uint64_t
+gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd,
+                             const struct gen_perf_registers *config,
+                             const char *guid)
+{
+   if (guid) /* caller-supplied GUID: register the config under it as-is */
+      return i915_add_config(perf_cfg, fd, config, guid);
+
+   struct mesa_sha1 sha1_ctx; /* no GUID given: derive one from a SHA-1 of the register programs */
+   _mesa_sha1_init(&sha1_ctx);
+
+   if (config->flex_regs) {
+      _mesa_sha1_update(&sha1_ctx, config->flex_regs,
+                        sizeof(config->flex_regs[0]) *
+                        config->n_flex_regs);
+   }
+   if (config->mux_regs) {
+      _mesa_sha1_update(&sha1_ctx, config->mux_regs,
+                        sizeof(config->mux_regs[0]) *
+                        config->n_mux_regs);
+   }
+   if (config->b_counter_regs) {
+      _mesa_sha1_update(&sha1_ctx, config->b_counter_regs,
+                        sizeof(config->b_counter_regs[0]) *
+                        config->n_b_counter_regs);
+   }
+
+   uint8_t hash[20];
+   _mesa_sha1_final(&sha1_ctx, hash);
+
+   char formatted_hash[41];
+   _mesa_sha1_format(formatted_hash, hash);
+
+   char generated_guid[37]; /* 8-4-4-4-12: 32 characters of the formatted hash + 4 dashes + NUL */
+   snprintf(generated_guid, sizeof(generated_guid),
+            "%.8s-%.4s-%.4s-%.4s-%.12s",
+            &formatted_hash[0], &formatted_hash[8],
+            &formatted_hash[8 + 4], &formatted_hash[8 + 4 + 4],
+            &formatted_hash[8 + 4 + 4 + 4]);
+
+   /* Check if already present. */
+   uint64_t id;
+   if (gen_perf_load_metric_id(perf_cfg, generated_guid, &id))
+      return id; /* reuse the id already recorded under this GUID */
+
+   return i915_add_config(perf_cfg, fd, config, generated_guid);
+}
+
  /* Accumulate 32bits OA counters */
  static inline void
  accumulate_uint32(const uint32_t *report0,
@@ -784,11 +1126,11 @@ gen8_read_report_clock_ratios(const uint32_t *report,
     *unslice_freq_hz = unslice_freq * 16666667ULL;
  }
  
-static void
-query_result_read_frequencies(struct gen_perf_query_result *result,
-                              const struct gen_device_info *devinfo,
-                              const uint32_t *start,
-                              const uint32_t *end)
+void
+gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
+                                       const struct gen_device_info *devinfo,
+                                       const uint32_t *start,
+                                       const uint32_t *end)
  {
     /* Slice/Unslice frequency is only available in the OA reports when the
      * "Disable OA reports due to clock ratio change" field in
@@ -809,15 +1151,19 @@ query_result_read_frequencies(struct gen_perf_query_result *result,
                                   &result->unslice_frequency[1]);
  }
  
-static void
-query_result_accumulate(struct gen_perf_query_result *result,
-                        const struct gen_perf_query_info *query,
-                        const uint32_t *start,
-                        const uint32_t *end)
+void
+gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
+                                 const struct gen_perf_query_info *query,
+                                 const uint32_t *start,
+                                 const uint32_t *end)
  {
     int i, idx = 0;
  
-   result->hw_id = start[2];
+   if (result->hw_id == OA_REPORT_INVALID_CTX_ID &&
+       start[2] != OA_REPORT_INVALID_CTX_ID)
+      result->hw_id = start[2];
+   if (result->reports_accumulated == 0)
+      result->begin_timestamp = start[1];
     result->reports_accumulated++;
  
     switch (query->oa_format) {
@@ -851,11 +1197,11 @@ query_result_accumulate(struct gen_perf_query_result *result,
  
  }
  
-static void
-query_result_clear(struct gen_perf_query_result *result)
+void
+gen_perf_query_result_clear(struct gen_perf_query_result *result)
  {
     memset(result, 0, sizeof(*result));
-   result->hw_id = 0xffffffff; /* invalid */
+   result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */
  }
  
  static void
@@ -1112,8 +1458,8 @@ get_metric_id(struct gen_perf_config *perf,
     }
  
     struct gen_perf_query_info *raw_query = (struct gen_perf_query_info *)query;
-   if (!load_metric_id(perf, query->guid,
-                       &raw_query->oa_metrics_set_id)) {
+   if (!gen_perf_load_metric_id(perf, query->guid,
+                                &raw_query->oa_metrics_set_id)) {
        DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid);
        raw_query->oa_metrics_set_id = 1ULL;
     } else {
@@ -1136,8 +1482,8 @@ get_free_sample_buf(struct gen_perf_context *perf_ctx)
  
        exec_node_init(&buf->link);
        buf->refcount = 0;
-      buf->len = 0;
     }
+   buf->len = 0;
  
     return buf;
  }
@@ -1183,11 +1529,11 @@ free_sample_bufs(struct gen_perf_context *perf_ctx)
   * pipeline statistics for the performance query object.
   */
  static void
-snapshot_statistics_registers(void *context,
-                              struct gen_perf_config *perf,
+snapshot_statistics_registers(struct gen_perf_context *ctx,
                                struct gen_perf_query_object *obj,
                                uint32_t offset_in_bytes)
  {
+   struct gen_perf_config *perf = ctx->perf;
     const struct gen_perf_query_info *query = obj->queryinfo;
     const int n_counters = query->n_counters;
  
@@ -1196,12 +1542,26 @@ snapshot_statistics_registers(void *context,
  
        assert(counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64);
  
-      perf->vtbl.store_register_mem64(context, obj->pipeline_stats.bo,
-                                      counter->pipeline_stat.reg,
-                                      offset_in_bytes + i * sizeof(uint64_t));
+      perf->vtbl.store_register_mem(ctx->ctx, obj->pipeline_stats.bo,
+                                    counter->pipeline_stat.reg, 8,
+                                    offset_in_bytes + i * sizeof(uint64_t));
     }
  }
  
+static void
+snapshot_freq_register(struct gen_perf_context *ctx,
+                       struct gen_perf_query_object *query,
+                       uint32_t bo_offset)
+{
+   struct gen_perf_config *perf = ctx->perf;
+   const struct gen_device_info *devinfo = ctx->devinfo;
+
+   if (devinfo->gen == 8 && !devinfo->is_cherryview) /* gen8 other than Cherryview: GEN7_RPSTAT1 */
+      perf->vtbl.store_register_mem(ctx->ctx, query->oa.bo, GEN7_RPSTAT1, 4, bo_offset);
+   else if (devinfo->gen >= 9) /* gen9+: GEN9_RPSTAT0; any other device stores nothing */
+      perf->vtbl.store_register_mem(ctx->ctx, query->oa.bo, GEN9_RPSTAT0, 4, bo_offset);
+}
+
  static void
  gen_perf_close(struct gen_perf_context *perfquery,
                 const struct gen_perf_query_info *query)
@@ -1387,15 +1747,9 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
      * end snapshot - otherwise the results won't be a complete representation
      * of the work.
      *
-    * Theoretically there could be opportunities to minimize how much of the
-    * GPU pipeline is drained, or that we stall for, when we know what specific
-    * units the performance counters being queried relate to but we don't
-    * currently attempt to be clever here.
-    *
-    * Note: with our current simple approach here then for back-to-back queries
-    * we will redundantly emit duplicate commands to synchronize the command
-    * streamer with the rest of the GPU pipeline, but we assume that in HW the
-    * second synchronization is effectively a NOOP.
+    * To achieve this, we stall the pipeline at the pixel scoreboard (preventing
+    * any additional work from being processed by the pipeline until all pixels
+    * of the previous draw have been completed).
      *
      * N.B. The final results are based on deltas of counters between (inside)
      * Begin/End markers so even though the total wall clock time of the
@@ -1409,7 +1763,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
      * This is our Begin synchronization point to drain current work on the
      * GPU before we capture our first counter snapshot...
      */
-   perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
  
     switch (queryinfo->kind) {
     case GEN_PERF_QUERY_TYPE_OA:
@@ -1522,19 +1876,10 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
        query->oa.begin_report_id = perf_ctx->next_query_start_report_id;
        perf_ctx->next_query_start_report_id += 2;
  
-      /* We flush the batchbuffer here to minimize the chances that MI_RPC
-       * delimiting commands end up in different batchbuffers. If that's the
-       * case, the measurement will include the time it takes for the kernel
-       * scheduler to load a new request into the hardware. This is manifested in
-       * tools like frameretrace by spikes in the "GPU Core Clocks" counter.
-       */
-      perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);
-
        /* Take a starting OA counter snapshot. */
        perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, 0,
                                                 query->oa.begin_report_id);
-      perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo,
-                                                     MI_FREQ_START_OFFSET_BYTES);
+      snapshot_freq_register(perf_ctx, query, MI_FREQ_START_OFFSET_BYTES);
  
        ++perf_ctx->n_active_oa_queries;
  
@@ -1555,7 +1900,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
         */
        buf->refcount++;
  
-      query_result_clear(&query->oa.result);
+      gen_perf_query_result_clear(&query->oa.result);
        query->oa.results_accumulated = false;
  
        add_to_unaccumulated_query_list(perf_ctx, query);
@@ -1574,7 +1919,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
                                   STATS_BO_SIZE);
  
        /* Take starting snapshots. */
-      snapshot_statistics_registers(perf_ctx->ctx , perf_cfg, query, 0);
+      snapshot_statistics_registers(perf_ctx, query, 0);
  
        ++perf_ctx->n_active_pipeline_stats_queries;
        break;
@@ -1599,7 +1944,7 @@ gen_perf_end_query(struct gen_perf_context *perf_ctx,
      * For more details see comment in brw_begin_perf_query for
      * corresponding flush.
      */
-  perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
  
     switch (query->queryinfo->kind) {
     case GEN_PERF_QUERY_TYPE_OA:
@@ -1612,8 +1957,7 @@ gen_perf_end_query(struct gen_perf_context *perf_ctx,
         */
        if (!query->oa.results_accumulated) {
           /* Take an ending OA counter snapshot. */
-         perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo,
-                                                     MI_FREQ_END_OFFSET_BYTES);
+         snapshot_freq_register(perf_ctx, query, MI_FREQ_END_OFFSET_BYTES);
           perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo,
                                               MI_RPC_BO_END_OFFSET_BYTES,
                                               query->oa.begin_report_id + 1);
@@ -1628,7 +1972,7 @@ gen_perf_end_query(struct gen_perf_context *perf_ctx,
        break;
  
     case GEN_PERF_QUERY_TYPE_PIPELINE:
-      snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query,
+      snapshot_statistics_registers(perf_ctx, query,
                                      STATS_BO_END_OFFSET_BYTES);
        --perf_ctx->n_active_pipeline_stats_queries;
        break;
@@ -1654,7 +1998,8 @@ read_oa_samples_until(struct gen_perf_context *perf_ctx,
        exec_list_get_tail(&perf_ctx->sample_buffers);
     struct oa_sample_buf *tail_buf =
        exec_node_data(struct oa_sample_buf, tail_node, link);
-   uint32_t last_timestamp = tail_buf->last_timestamp;
+   uint32_t last_timestamp =
+      tail_buf->len == 0 ? start_timestamp : tail_buf->last_timestamp;
  
     while (1) {
        struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx);
@@ -1669,12 +2014,13 @@ read_oa_samples_until(struct gen_perf_context *perf_ctx,
           exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link);
  
           if (len < 0) {
-            if (errno == EAGAIN)
-               return ((last_timestamp - start_timestamp) >=
+            if (errno == EAGAIN) {
+               return ((last_timestamp - start_timestamp) < INT32_MAX &&
+                       (last_timestamp - start_timestamp) >=
                         (end_timestamp - start_timestamp)) ?
                        OA_READ_STATUS_FINISHED :
                        OA_READ_STATUS_UNFINISHED;
-            else {
+            } else {
                 DBG("Error reading i915 perf samples: %m\n");
              }
           } else
@@ -1890,6 +2236,17 @@ discard_all_queries(struct gen_perf_context *perf_ctx)
     }
  }
  
+/* Looks for the validity bit of context ID (dword 2) of an OA report. */
+static bool
+oa_report_ctx_id_valid(const struct gen_device_info *devinfo,
+                       const uint32_t *report)
+{
+   assert(devinfo->gen >= 8);
+   if (devinfo->gen == 8)
+      return (report[0] & (1 << 25)) != 0; /* gen8: bit 25 of dword 0 */
+   return (report[0] & (1 << 16)) != 0; /* later gens: bit 16 of dword 0 */
+}
+
  /**
   * Accumulate raw OA counter values based on deltas between pairs of
   * OA reports.
@@ -1917,7 +2274,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
     uint32_t *last;
     uint32_t *end;
     struct exec_node *first_samples_node;
-   bool in_ctx = true;
+   bool last_report_ctx_match = true;
     int out_duration = 0;
  
     assert(query->oa.map != NULL);
@@ -1934,6 +2291,14 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
        goto error;
     }
  
+   /* On Gen12+ OA reports are sourced from per context counters, so we don't
+    * ever have to look at the global OA buffer. Yey \o/
+    */
+   if (perf_ctx->devinfo->gen >= 12) {
+      last = start;
+      goto end;
+   }
+
     /* See if we have any periodic reports to accumulate too... */
  
     /* N.B. The oa.samples_head was set when the query began and
@@ -1946,7 +2311,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
     first_samples_node = query->oa.samples_head->next;
  
     foreach_list_typed_from(struct oa_sample_buf, buf, link,
-                           &perf_ctx.sample_buffers,
+                           &perf_ctx->sample_buffers,
                             first_samples_node)
     {
        int offset = 0;
@@ -1963,6 +2328,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
           switch (header->type) {
           case DRM_I915_PERF_RECORD_SAMPLE: {
              uint32_t *report = (uint32_t *)(header + 1);
+            bool report_ctx_match = true;
              bool add = true;
  
              /* Ignore reports that come before the start marker.
@@ -1991,43 +2357,46 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
               * of OA counters while any other context is acctive.
               */
              if (devinfo->gen >= 8) {
-               if (in_ctx && report[2] != query->oa.result.hw_id) {
-                  DBG("i915 perf: Switch AWAY (observed by ID change)\n");
-                  in_ctx = false;
+               /* Consider that the current report matches our context only if
+                * the report says its context ID is valid.
+                */
+               report_ctx_match = oa_report_ctx_id_valid(devinfo, report) &&
+                  report[2] == start[2];
+               if (report_ctx_match)
                    out_duration = 0;
-               } else if (in_ctx == false && report[2] == query->oa.result.hw_id) {
-                  DBG("i915 perf: Switch TO\n");
-                  in_ctx = true;
-
-                  /* From experimentation in IGT, we found that the OA unit
-                   * might label some report as "idle" (using an invalid
-                   * context ID), right after a report for a given context.
-                   * Deltas generated by those reports actually belong to the
-                   * previous context, even though they're not labelled as
-                   * such.
-                   *
-                   * We didn't *really* Switch AWAY in the case that we e.g.
-                   * saw a single periodic report while idle...
-                   */
-                  if (out_duration >= 1)
-                     add = false;
-               } else if (in_ctx) {
-                  assert(report[2] == query->oa.result.hw_id);
-                  DBG("i915 perf: Continuation IN\n");
-               } else {
-                  assert(report[2] != query->oa.result.hw_id);
-                  DBG("i915 perf: Continuation OUT\n");
-                  add = false;
+               else
                    out_duration++;
-               }
+
+               /* Only add the delta between <last, report> if the last report
+                * was clearly identified as our context, or if we have at most
+                * 1 report without a matching ID.
+                *
+                * The OA unit will sometimes label reports with an invalid
+                * context ID when i915 rewrites the execlist submit register
+                * with the same context as the one currently running. This
+                * happens when i915 wants to notify the HW of ringbuffer tail
+                * register update. We have to consider this report as part of
+                * our context as the 3d pipeline behind the OACS unit is still
+                * processing the operations started at the previous execlist
+                * submission.
+                */
+               add = last_report_ctx_match && out_duration < 2;
              }
  
              if (add) {
-               query_result_accumulate(&query->oa.result, query->queryinfo,
-                                       last, report);
+               gen_perf_query_result_accumulate(&query->oa.result,
+                                                query->queryinfo,
+                                                last, report);
+            } else {
+               /* We're not adding the delta because we've identified it's not
+                * for the context we filter for. We can consider that the
+                * query was split.
+                */
+               query->oa.result.query_disjoint = true;
              }
  
              last = report;
+            last_report_ctx_match = report_ctx_match;
  
              break;
           }
@@ -2044,8 +2413,8 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
  
  end:
  
-   query_result_accumulate(&query->oa.result, query->queryinfo,
-                           last, end);
+   gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo,
+                                    last, end);
  
     query->oa.results_accumulated = true;
     drop_from_unaccumulated_query_list(perf_ctx, query);
@@ -2232,10 +2601,10 @@ gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
           read_gt_frequency(perf_ctx, query);
           uint32_t *begin_report = query->oa.map;
           uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
-         query_result_read_frequencies(&query->oa.result,
-                                       perf_ctx->devinfo,
-                                       begin_report,
-                                       end_report);
+         gen_perf_query_result_read_frequencies(&query->oa.result,
+                                                perf_ctx->devinfo,
+                                                begin_report,
+                                                end_report);
           accumulate_oa_reports(perf_ctx, query);
           assert(query->oa.results_accumulated);
  
@@ -2266,3 +2635,33 @@ gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
     if (bytes_written)
        *bytes_written = written;
  }
+/* Emits a single DBG message with two counters taken from perf_ctx:
+ * n_active_oa_queries (labelled Open queries in the output) and n_oa_users
+ * (labelled OA users). This function itself only reads perf_ctx.
+ *
+ * NOTE(review): DBG is defined outside this excerpt; presumably it only
+ * prints when DEBUG_PERFMON debugging is enabled (see FILE_DEBUG_FLAG) --
+ * confirm.
+ */
+void
+gen_perf_dump_query_count(struct gen_perf_context *perf_ctx)
+{
+   DBG("Queries: (Open queries = %d, OA users = %d)\n",
+       perf_ctx->n_active_oa_queries, perf_ctx->n_oa_users);
+}
+/* Emits a single DBG message summarising the state of one query object.
+ *
+ * What is printed depends on obj->queryinfo->kind:
+ *  - GEN_PERF_QUERY_TYPE_OA / GEN_PERF_QUERY_TYPE_RAW: whether obj->oa.bo is
+ *    set, whether gen_perf_is_query_ready() reports the query as ready, and
+ *    whether obj->oa.results_accumulated is set.
+ *  - GEN_PERF_QUERY_TYPE_PIPELINE: whether obj->pipeline_stats.bo is set.
+ *  - anything else: unreachable().
+ *
+ * ctx and current_batch are not inspected here; they are only forwarded to
+ * gen_perf_is_query_ready().
+ */
+void
+gen_perf_dump_query(struct gen_perf_context *ctx,
+                    struct gen_perf_query_object *obj,
+                    void *current_batch)
+{
+   switch (obj->queryinfo->kind) {
+   case GEN_PERF_QUERY_TYPE_OA:
+   case GEN_PERF_QUERY_TYPE_RAW:
+      DBG("BO: %-4s OA data: %-10s %-15s\n",
+          obj->oa.bo ? "yes," : "no,",
+          gen_perf_is_query_ready(ctx, obj, current_batch) ? "ready," : "not ready,",
+          obj->oa.results_accumulated ? "accumulated" : "not accumulated");
+      break;
+   case GEN_PERF_QUERY_TYPE_PIPELINE:
+      DBG("BO: %-4s\n",
+          obj->pipeline_stats.bo ? "yes" : "no");
+      break;
+   default:
+      unreachable("Unknown query type");
+      break;
+   }
+}