i965/iris: perf-queries: don't invalidate/flush 3d pipeline
author: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Mon, 20 May 2019 06:56:18 +0000 (07:56 +0100)
committer: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fri, 13 Dec 2019 09:27:22 +0000 (11:27 +0200)
Our current implementation of performance queries is fairly harsh
because it completely flushes and invalidates the 3d pipeline caches
at the beginning and end of each query. An argument can be made that
this is how performance should be measured but it probably doesn't
reflect what the application is actually doing and the actual cost of
draw calls.

A more appropriate approach is to just stall the pipeline at
scoreboard, so that we measure the effect of a draw call without
having the pipeline in a completely pristine state for every draw
call.

v2: Use end of pipe PIPE_CONTROL instruction for Iris (Ken)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
src/gallium/drivers/iris/iris_perf.c
src/intel/perf/gen_perf.c
src/intel/perf/gen_perf.h
src/mesa/drivers/dri/i965/brw_performance_query.c

index 7c0378aacee97ceada76895a1d62265e58714318..1e5ec8140dcef3c79465f775502e4ed07d4be5d7 100644 (file)
@@ -31,18 +31,11 @@ iris_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size)
 }
 
 static void
-iris_perf_emit_mi_flush(struct iris_context *ice)
+iris_perf_emit_stall_at_pixel_scoreboard(struct iris_context *ice)
 {
-   const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                     PIPE_CONTROL_INSTRUCTION_INVALIDATE |
-                     PIPE_CONTROL_CONST_CACHE_INVALIDATE |
-                     PIPE_CONTROL_DATA_CACHE_FLUSH |
-                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                     PIPE_CONTROL_VF_CACHE_INVALIDATE |
-                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
-                     PIPE_CONTROL_CS_STALL;
-   iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER],
-                                "OA metrics", flags);
+   iris_emit_end_of_pipe_sync(&ice->batches[IRIS_BATCH_RENDER],
+                              "OA metrics",
+                              PIPE_CONTROL_STALL_AT_SCOREBOARD);
 }
 
 static void
@@ -106,7 +99,8 @@ iris_perf_init_vtbl(struct gen_perf_config *perf_cfg)
    perf_cfg->vtbl.bo_unreference = (bo_unreference_t)iris_bo_unreference;
    perf_cfg->vtbl.bo_map = (bo_map_t)iris_bo_map;
    perf_cfg->vtbl.bo_unmap = (bo_unmap_t)iris_bo_unmap;
-   perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)iris_perf_emit_mi_flush;
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard =
+      (emit_mi_flush_t)iris_perf_emit_stall_at_pixel_scoreboard;
 
    perf_cfg->vtbl.emit_mi_report_perf_count =
       (emit_mi_report_t)iris_perf_emit_mi_report_perf_count;
index daa092c88c9a4982dc252b126b0ec2a3631fe3b7..9e987d599d7824ab571cbc05e4f16e324de58d42 100644 (file)
@@ -1716,15 +1716,9 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
     * end snapshot - otherwise the results won't be a complete representation
     * of the work.
     *
-    * Theoretically there could be opportunities to minimize how much of the
-    * GPU pipeline is drained, or that we stall for, when we know what specific
-    * units the performance counters being queried relate to but we don't
-    * currently attempt to be clever here.
-    *
-    * Note: with our current simple approach here then for back-to-back queries
-    * we will redundantly emit duplicate commands to synchronize the command
-    * streamer with the rest of the GPU pipeline, but we assume that in HW the
-    * second synchronization is effectively a NOOP.
+    * To achieve this, we stall the pipeline at pixel scoreboard (preventing
+    * any additional work from being processed by the pipeline until all
+    * pixels of the previous draw have been completed).
     *
     * N.B. The final results are based on deltas of counters between (inside)
     * Begin/End markers so even though the total wall clock time of the
@@ -1738,7 +1732,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
     * This is our Begin synchronization point to drain current work on the
     * GPU before we capture our first counter snapshot...
     */
-   perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
 
    switch (queryinfo->kind) {
    case GEN_PERF_QUERY_TYPE_OA:
@@ -1920,7 +1914,7 @@ gen_perf_end_query(struct gen_perf_context *perf_ctx,
     * For more details see comment in brw_begin_perf_query for
     * corresponding flush.
     */
-  perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
 
    switch (query->queryinfo->kind) {
    case GEN_PERF_QUERY_TYPE_OA:
index 46d37e07c2575a3ad42761c866208a01d06344ad..2cd246a1dca167fe0a6dfd373efce92ad2b890b4 100644 (file)
@@ -219,7 +219,7 @@ struct gen_perf_config {
       bool (*batch_references)(void *batch, void *bo);
       void (*bo_wait_rendering)(void *bo);
       int (*bo_busy)(void *bo);
-      void (*emit_mi_flush)(void *ctx);
+      void (*emit_stall_at_pixel_scoreboard)(void *ctx);
       void (*emit_mi_report_perf_count)(void *ctx,
                                         void *bo,
                                         uint32_t offset_in_bytes,
index 0e5459e5e5e4a54af42a888521753a10f3df4198..cfd3efe374e0d34ae22b613d3b0418a408668d35 100644 (file)
@@ -459,6 +459,13 @@ brw_oa_batchbuffer_flush(void *c, const char *file, int line)
    _intel_batchbuffer_flush_fence(ctx, -1, NULL, file,  line);
 }
 
+static void
+brw_oa_emit_stall_at_pixel_scoreboard(void *c)
+{
+   struct brw_context *brw = c;
+   brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_STALL_AT_SCOREBOARD);
+}
+
 typedef void (*capture_frequency_stat_register_t)(void *, void *, uint32_t );
 typedef void (*store_register_mem64_t)(void *ctx, void *bo,
                                        uint32_t reg, uint32_t offset);
@@ -487,7 +494,8 @@ brw_init_perf_query_info(struct gl_context *ctx)
    perf_cfg->vtbl.bo_unreference = (bo_unreference_t)brw_bo_unreference;
    perf_cfg->vtbl.bo_map = (bo_map_t)brw_bo_map;
    perf_cfg->vtbl.bo_unmap = (bo_unmap_t)brw_bo_unmap;
-   perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)brw_emit_mi_flush;
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard =
+      (emit_mi_flush_t)brw_oa_emit_stall_at_pixel_scoreboard;
    perf_cfg->vtbl.emit_mi_report_perf_count =
       (emit_mi_report_t)brw_oa_emit_mi_report_perf_count;
    perf_cfg->vtbl.batchbuffer_flush = brw_oa_batchbuffer_flush;