From: Lionel Landwerlin Date: Mon, 20 May 2019 06:56:18 +0000 (+0100) Subject: i965/iris: perf-queries: don't invalidate/flush 3d pipeline X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=bd888bc1d65cefbd4e3fc0a40d416c75d9632951;p=mesa.git i965/iris: perf-queries: don't invalidate/flush 3d pipeline Our current implementation of performance queries is fairly harsh because it completely flushes and invalidates the 3d pipeline caches at the beginning and end of each query. An argument can be made that this is how performance should be measured but it probably doesn't reflect what the application is actually doing and the actual cost of draw calls. A more appropriate approach is to just stall the pipeline at scoreboard, so that we measure the effect of a draw call without having the pipeline in a completely pristine state for every draw call. v2: Use end of pipe PIPE_CONTROL instruction for Iris (Ken) Signed-off-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke --- diff --git a/src/gallium/drivers/iris/iris_perf.c b/src/gallium/drivers/iris/iris_perf.c index 7c0378aacee..1e5ec8140dc 100644 --- a/src/gallium/drivers/iris/iris_perf.c +++ b/src/gallium/drivers/iris/iris_perf.c @@ -31,18 +31,11 @@ iris_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size) } static void -iris_perf_emit_mi_flush(struct iris_context *ice) +iris_perf_emit_stall_at_pixel_scoreboard(struct iris_context *ice) { - const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH | - PIPE_CONTROL_INSTRUCTION_INVALIDATE | - PIPE_CONTROL_CONST_CACHE_INVALIDATE | - PIPE_CONTROL_DATA_CACHE_FLUSH | - PIPE_CONTROL_DEPTH_CACHE_FLUSH | - PIPE_CONTROL_VF_CACHE_INVALIDATE | - PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | - PIPE_CONTROL_CS_STALL; - iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER], - "OA metrics", flags); + iris_emit_end_of_pipe_sync(&ice->batches[IRIS_BATCH_RENDER], + "OA metrics", + PIPE_CONTROL_STALL_AT_SCOREBOARD); } static void @@ -106,7 +99,8 @@ iris_perf_init_vtbl(struct 
gen_perf_config *perf_cfg) perf_cfg->vtbl.bo_unreference = (bo_unreference_t)iris_bo_unreference; perf_cfg->vtbl.bo_map = (bo_map_t)iris_bo_map; perf_cfg->vtbl.bo_unmap = (bo_unmap_t)iris_bo_unmap; - perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)iris_perf_emit_mi_flush; + perf_cfg->vtbl.emit_stall_at_pixel_scoreboard = + (emit_mi_flush_t)iris_perf_emit_stall_at_pixel_scoreboard; perf_cfg->vtbl.emit_mi_report_perf_count = (emit_mi_report_t)iris_perf_emit_mi_report_perf_count; diff --git a/src/intel/perf/gen_perf.c b/src/intel/perf/gen_perf.c index daa092c88c9..9e987d599d7 100644 --- a/src/intel/perf/gen_perf.c +++ b/src/intel/perf/gen_perf.c @@ -1716,15 +1716,9 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx, * end snapshot - otherwise the results won't be a complete representation * of the work. * - * Theoretically there could be opportunities to minimize how much of the - * GPU pipeline is drained, or that we stall for, when we know what specific - * units the performance counters being queried relate to but we don't - * currently attempt to be clever here. - * - * Note: with our current simple approach here then for back-to-back queries - * we will redundantly emit duplicate commands to synchronize the command - * streamer with the rest of the GPU pipeline, but we assume that in HW the - * second synchronization is effectively a NOOP. + * To achieve this, we stall the pipeline at pixel scoreboard (prevent any + * additional work from being processed by the pipeline until all pixels of + * the previous draw have been completed). * * N.B. The final results are based on deltas of counters between (inside) * Begin/End markers so even though the total wall clock time of the @@ -1738,7 +1732,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx, * This is our Begin synchronization point to drain current work on the * GPU before we capture our first counter snapshot... 
*/ - perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx); + perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx); switch (queryinfo->kind) { case GEN_PERF_QUERY_TYPE_OA: @@ -1920,7 +1914,7 @@ gen_perf_end_query(struct gen_perf_context *perf_ctx, * For more details see comment in brw_begin_perf_query for * corresponding flush. */ - perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx); + perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx); switch (query->queryinfo->kind) { case GEN_PERF_QUERY_TYPE_OA: diff --git a/src/intel/perf/gen_perf.h b/src/intel/perf/gen_perf.h index 46d37e07c25..2cd246a1dca 100644 --- a/src/intel/perf/gen_perf.h +++ b/src/intel/perf/gen_perf.h @@ -219,7 +219,7 @@ struct gen_perf_config { bool (*batch_references)(void *batch, void *bo); void (*bo_wait_rendering)(void *bo); int (*bo_busy)(void *bo); - void (*emit_mi_flush)(void *ctx); + void (*emit_stall_at_pixel_scoreboard)(void *ctx); void (*emit_mi_report_perf_count)(void *ctx, void *bo, uint32_t offset_in_bytes, diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c index 0e5459e5e5e..cfd3efe374e 100644 --- a/src/mesa/drivers/dri/i965/brw_performance_query.c +++ b/src/mesa/drivers/dri/i965/brw_performance_query.c @@ -459,6 +459,13 @@ brw_oa_batchbuffer_flush(void *c, const char *file, int line) _intel_batchbuffer_flush_fence(ctx, -1, NULL, file, line); } +static void +brw_oa_emit_stall_at_pixel_scoreboard(void *c) +{ + struct brw_context *brw = c; + brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_STALL_AT_SCOREBOARD); +} + typedef void (*capture_frequency_stat_register_t)(void *, void *, uint32_t ); typedef void (*store_register_mem64_t)(void *ctx, void *bo, uint32_t reg, uint32_t offset); @@ -487,7 +494,8 @@ brw_init_perf_query_info(struct gl_context *ctx) perf_cfg->vtbl.bo_unreference = (bo_unreference_t)brw_bo_unreference; perf_cfg->vtbl.bo_map = (bo_map_t)brw_bo_map; perf_cfg->vtbl.bo_unmap = (bo_unmap_t)brw_bo_unmap; - 
perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)brw_emit_mi_flush; + perf_cfg->vtbl.emit_stall_at_pixel_scoreboard = + (emit_mi_flush_t)brw_oa_emit_stall_at_pixel_scoreboard; perf_cfg->vtbl.emit_mi_report_perf_count = (emit_mi_report_t)brw_oa_emit_mi_report_perf_count; perf_cfg->vtbl.batchbuffer_flush = brw_oa_batchbuffer_flush;