From 31b11f69f75ff92cb42a13bb2f6740c183f761df Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Date: Thu, 30 Mar 2017 15:46:40 +0100
Subject: [PATCH] i965: perf: keep on reading reports until delimiting
 timestamp

Due to an underlying hardware race condition, we have no guarantee
that all the reports coming from the OA buffer related to the workload
we're trying to measure have landed in memory by the time all the work
submitted has completed. That means we need to keep on reading the OA
stream until we read a report with a timestamp more recent than the
timestamp recorded by the MI_REPORT_PERF_COUNT at the end of the
performance query.

v2: fix uninitialized offset variable to 0 (Lionel)

v3: rework the reading to avoid blocking the user of the API unless
    requested (Rob)

v4: fix a bug that made the i965 driver read the perf stream when
    not necessary, leading to very long counter accumulation times
    (Lionel)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 .../drivers/dri/i965/brw_performance_query.c  | 133 +++++++++++++++---
 1 file changed, 113 insertions(+), 20 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index dd392b1960c..4af06185680 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -219,6 +219,7 @@ struct brw_oa_sample_buf {
    int refcount;
    int len;
    uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
+   uint32_t last_timestamp;
 };
 
 /**
@@ -243,6 +244,11 @@ struct brw_perf_query_object
           */
          struct brw_bo *bo;
 
+         /**
+          * Address of the mapping of @bo
+          */
+         void *map;
+
          /**
           * The MI_REPORT_PERF_COUNT command lets us specify a unique
           * ID that will be reflected in the resulting OA report
@@ -681,11 +687,26 @@ discard_all_queries(struct brw_context *brw)
    }
 }
 
-static bool
-read_oa_samples(struct brw_context *brw)
+enum OaReadStatus {
+   OA_READ_STATUS_ERROR,
+   OA_READ_STATUS_UNFINISHED,
+   OA_READ_STATUS_FINISHED,
+};
+
+static enum OaReadStatus
+read_oa_samples_until(struct brw_context *brw,
+                      uint32_t start_timestamp,
+                      uint32_t end_timestamp)
 {
+   struct exec_node *tail_node =
+      exec_list_get_tail(&brw->perfquery.sample_buffers);
+   struct brw_oa_sample_buf *tail_buf =
+      exec_node_data(struct brw_oa_sample_buf, tail_node, link);
+   uint32_t last_timestamp = tail_buf->last_timestamp;
+
    while (1) {
       struct brw_oa_sample_buf *buf = get_free_sample_buf(brw);
+      uint32_t offset;
       int len;
 
       while ((len = read(brw->perfquery.oa_stream_fd, buf->buf,
@@ -697,28 +718,94 @@ read_oa_samples(struct brw_context *brw)
 
          if (len < 0) {
             if (errno == EAGAIN)
-               return true;
+               return ((last_timestamp - start_timestamp) >=
+                       (end_timestamp - start_timestamp)) ?
+                      OA_READ_STATUS_FINISHED :
+                      OA_READ_STATUS_UNFINISHED;
             else {
                DBG("Error reading i915 perf samples: %m\n");
-               return false;
             }
-         } else {
+         } else
             DBG("Spurious EOF reading i915 perf samples\n");
-            return false;
-         }
+
+         return OA_READ_STATUS_ERROR;
       }
 
       buf->len = len;
       exec_list_push_tail(&brw->perfquery.sample_buffers, &buf->link);
+
+      /* Go through the reports and update the last timestamp. */
+      offset = 0;
+      while (offset < buf->len) {
+         const struct drm_i915_perf_record_header *header =
+            (const struct drm_i915_perf_record_header *) &buf->buf[offset];
+         uint32_t *report = (uint32_t *) (header + 1);
+
+         if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
+            last_timestamp = report[1];
+
+         offset += header->size;
+      }
+
+      buf->last_timestamp = last_timestamp;
    }
 
    unreachable("not reached");
+   return OA_READ_STATUS_ERROR;
+}
+
+/**
+ * Try to read all the reports until either the delimiting timestamp
+ * is reached or an error occurs.
+ */
+static bool
+read_oa_samples_for_query(struct brw_context *brw,
+                          struct brw_perf_query_object *obj)
+{
+   uint32_t *start;
+   uint32_t *last;
+   uint32_t *end;
+
+   /* We need the MI_REPORT_PERF_COUNT to land before we can start
+    * accumulating. */
+   assert(!brw_batch_references(&brw->batch, obj->oa.bo) &&
+          !brw_bo_busy(obj->oa.bo));
+
+   /* Map the BO once here and let accumulate_oa_reports() unmap
+    * it. */
+   if (obj->oa.map == NULL)
+      obj->oa.map = brw_bo_map(brw, obj->oa.bo, MAP_READ);
+
+   start = last = obj->oa.map;
+   end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
+
+   if (start[0] != obj->oa.begin_report_id) {
+      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
+      return true;
+   }
+   if (end[0] != (obj->oa.begin_report_id + 1)) {
+      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
+      return true;
+   }
+
+   /* Read the reports until the end timestamp. */
+   switch (read_oa_samples_until(brw, start[1], end[1])) {
+   case OA_READ_STATUS_ERROR:
+      /* Fallthrough and let accumulate_oa_reports() deal with the
+       * error. */
+   case OA_READ_STATUS_FINISHED:
+      return true;
+   case OA_READ_STATUS_UNFINISHED:
+      return false;
+   }
+
+   unreachable("invalid read status");
    return false;
 }
 
 /**
- * Accumulate raw OA counter values based on deltas between pairs
- * of OA reports.
+ * Accumulate raw OA counter values based on deltas between pairs of
+ * OA reports.
  *
  * Accumulation starts from the first report captured via
  * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the
@@ -739,7 +826,6 @@ accumulate_oa_reports(struct brw_context *brw,
                       struct brw_perf_query_object *obj)
 {
    struct gl_perf_query_object *o = &obj->base;
-   uint32_t *query_buffer;
    uint32_t *start;
    uint32_t *last;
    uint32_t *end;
@@ -748,15 +834,10 @@ accumulate_oa_reports(struct brw_context *brw,
    uint32_t ctx_id;
 
    assert(o->Ready);
+   assert(obj->oa.map != NULL);
 
-   /* Collect the latest periodic OA reports from i915 perf */
-   if (!read_oa_samples(brw))
-      goto error;
-
-   query_buffer = brw_bo_map(brw, obj->oa.bo, MAP_READ);
-
-   start = last = query_buffer;
-   end = query_buffer + (MI_RPC_BO_END_OFFSET_BYTES / sizeof(uint32_t));
+   start = last = obj->oa.map;
+   end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
 
    if (start[0] != obj->oa.begin_report_id) {
       DBG("Spurious start report id=%"PRIu32"\n", start[0]);
@@ -864,6 +945,7 @@ end:
    DBG("Marking %d accumulated - results gathered\n", o->Id);
 
    brw_bo_unmap(obj->oa.bo);
+   obj->oa.map = NULL;
    obj->oa.results_accumulated = true;
    drop_from_unaccumulated_query_list(brw, obj);
    dec_n_oa_users(brw);
@@ -873,6 +955,7 @@ end:
 error:
 
    brw_bo_unmap(obj->oa.bo);
+   obj->oa.map = NULL;
    discard_all_queries(brw);
 }
 
@@ -1249,6 +1332,16 @@ brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o)
       intel_batchbuffer_flush(brw);
 
    brw_bo_wait_rendering(brw, bo);
+
+   /* Due to a race condition between the OA unit signaling report
+    * availability and the report actually being written into memory,
+    * we need to wait for all the reports to come in before we can
+    * read them.
+    */
+   if (obj->query->kind == OA_COUNTERS) {
+      while (!read_oa_samples_for_query(brw, obj))
+         ;
+   }
 }
 
 static bool
@@ -1266,8 +1359,8 @@ brw_is_perf_query_ready(struct gl_context *ctx,
       return (obj->oa.results_accumulated ||
               (obj->oa.bo &&
                !brw_batch_references(&brw->batch, obj->oa.bo) &&
-               !brw_bo_busy(obj->oa.bo)));
-
+               !brw_bo_busy(obj->oa.bo) &&
+               read_oa_samples_for_query(brw, obj)));
    case PIPELINE_STATS:
       return (obj->pipeline_stats.bo &&
               !brw_batch_references(&brw->batch, obj->pipeline_stats.bo) &&
-- 
2.30.2