intel/perf: fix performance counter availability after glFinish
authorMarcin Ślusarz <marcin.slusarz@intel.com>
Fri, 10 Jul 2020 17:25:10 +0000 (19:25 +0200)
committerMarge Bot <eric+marge@anholt.net>
Mon, 10 Aug 2020 13:41:29 +0000 (13:41 +0000)
Currently the Linux kernel gathers performance counters at fixed
intervals (~5-10ms), so if an application uses the
AMD_performance_monitor extension and asks the GL driver for HW
performance counter values immediately after glFinish(), it might
not get any data back (values == 0).
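
For illustration, the pattern that hits this looks roughly like the
hypothetical snippet below (counter selection, extension-function
loading and the measured workload are assumptions, not taken from
this change):

   #include <GL/gl.h>
   #include <GL/glext.h> /* AMD_performance_monitor entry points */

   extern void draw_workload(void); /* the GL work being measured */

   static void read_counters_after_finish(GLuint monitor)
   {
      glBeginPerfMonitorAMD(monitor);
      draw_workload();
      glEndPerfMonitorAMD(monitor);

      glFinish(); /* GPU is idle now, but the kernel may not have
                   * sampled the HW counters yet */

      GLuint data[1024];
      GLint bytes_written = 0;

      /* Before this fix, this could return all values == 0. */
      glGetPerfMonitorCounterDataAMD(monitor, GL_PERFMON_RESULT_AMD,
                                     sizeof(data), data,
                                     &bytes_written);
   }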

Fix this by moving the "read counters from kernel" code from the
"is query ready" callback to the "get counter values" callback, with
a loop around it. Unfortunately this means that the "read counters
from kernel" code can spin for up to 10ms.
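
In essence, the relocated code is the busy-wait below (a minimal
sketch; the full version is in the gen_perf_query.c hunk):

   /* Keep re-reading the i915-perf stream until every OA report
    * belonging to this query has landed, then accumulate results. */
   while (!read_oa_samples_for_query(perf_ctx, query, current_batch))
      ;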

Ideally the kernel should gather performance counters whenever we ask
it for counter values, but for now we have to deal with what we have.

Signed-off-by: Marcin Ślusarz <marcin.slusarz@intel.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5788>

src/gallium/drivers/iris/iris_monitor.c
src/gallium/drivers/iris/iris_performance_query.c
src/intel/perf/gen_perf_query.c
src/intel/perf/gen_perf_query.h
src/mesa/drivers/dri/i965/brw_performance_query.c

index b615476b2cf3dc775055c9cb484e26e40ee612f0..919aebca4789edcec4bf5e4e36893cff1e02a3c1 100644 (file)
@@ -283,7 +283,7 @@ iris_get_monitor_result(struct pipe_context *ctx,
    assert(gen_perf_is_query_ready(perf_ctx, monitor->query, batch));
 
    unsigned bytes_written;
-   gen_perf_get_query_data(perf_ctx, monitor->query,
+   gen_perf_get_query_data(perf_ctx, monitor->query, batch,
                            monitor->result_size,
                            (unsigned*) monitor->result_buffer,
                            &bytes_written);
index 1e8e1d9201aa7d231d5fb0900ed98cd3ad9d4293..0ef5c206fe3965202f22f6bf47f434de3836da30 100644 (file)
@@ -214,7 +214,8 @@ iris_get_perf_query_data(struct pipe_context *pipe,
    struct gen_perf_query_object *obj = perf_query->query;
    struct gen_perf_context *perf_ctx = ice->perf_ctx;
 
-   gen_perf_get_query_data(perf_ctx, obj, data_size, data, bytes_written);
+   gen_perf_get_query_data(perf_ctx, obj, &ice->batches[IRIS_BATCH_RENDER],
+         data_size, data, bytes_written);
 }
 
 void
index 309ebe024d308f8a3acbf261de681a4cbcb924d1..7bd8e0277d24d2d85e29358efd49062615626d87 100644 (file)
@@ -1065,17 +1065,6 @@ gen_perf_wait_query(struct gen_perf_context *perf_ctx,
       perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);
 
    perf_cfg->vtbl.bo_wait_rendering(bo);
-
-   /* Due to a race condition between the OA unit signaling report
-    * availability and the report actually being written into memory,
-    * we need to wait for all the reports to come in before we can
-    * read them.
-    */
-   if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA ||
-       query->queryinfo->kind == GEN_PERF_QUERY_TYPE_RAW) {
-      while (!read_oa_samples_for_query(perf_ctx, query, current_batch))
-         ;
-   }
 }
 
 bool
@@ -1091,8 +1080,8 @@ gen_perf_is_query_ready(struct gen_perf_context *perf_ctx,
       return (query->oa.results_accumulated ||
               (query->oa.bo &&
                !perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
-               !perf_cfg->vtbl.bo_busy(query->oa.bo) &&
-               read_oa_samples_for_query(perf_ctx, query, current_batch)));
+               !perf_cfg->vtbl.bo_busy(query->oa.bo)));
+
    case GEN_PERF_QUERY_TYPE_PIPELINE:
       return (query->pipeline_stats.bo &&
               !perf_cfg->vtbl.batch_references(current_batch, query->pipeline_stats.bo) &&
@@ -1519,6 +1508,7 @@ get_pipeline_stats_data(struct gen_perf_context *perf_ctx,
 void
 gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
                         struct gen_perf_query_object *query,
+                        void *current_batch,
                         int data_size,
                         unsigned *data,
                         unsigned *bytes_written)
@@ -1530,6 +1520,17 @@ gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
    case GEN_PERF_QUERY_TYPE_OA:
    case GEN_PERF_QUERY_TYPE_RAW:
       if (!query->oa.results_accumulated) {
+         /* Due to the sampling frequency of the OA buffer by the i915-perf
+          * driver, there can be a 5ms delay between Mesa seeing the query
+          * complete and i915 making all the OA buffer reports available
+          * to us. We need to wait for all the reports to come in before
+          * we can do the post-processing that removes unrelated deltas.
+          * There is an i915-perf series to address this issue, but it
+          * has not been merged upstream yet.
+          */
+         while (!read_oa_samples_for_query(perf_ctx, query, current_batch))
+            ;
+
          read_gt_frequency(perf_ctx, query);
          uint32_t *begin_report = query->oa.map;
          uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
index a0246501f7665645c22f840d5d4de157353cd47f..d064a5d06691745083db7eb5e5d6d20d7fbfdadf 100644 (file)
@@ -76,6 +76,7 @@ void gen_perf_delete_query(struct gen_perf_context *perf_ctx,
                            struct gen_perf_query_object *query);
 void gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
                              struct gen_perf_query_object *query,
+                             void *current_batch,
                              int data_size,
                              unsigned *data,
                              unsigned *bytes_written);
index 6c74403da6c1ae3912959c9dd7af80f087e52e97..042c236d9768eb05b50f91f095d81c71d48cd3b4 100644 (file)
@@ -323,7 +323,7 @@ brw_get_perf_query_data(struct gl_context *ctx,
     */
    assert(o->Ready);
 
-   gen_perf_get_query_data(brw->perf_ctx, obj,
+   gen_perf_get_query_data(brw->perf_ctx, obj, &brw->batch,
                            data_size, data, bytes_written);
 }