i965/iris: perf-queries: don't invalidate/flush 3d pipeline
author: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Mon, 20 May 2019 06:56:18 +0000 (07:56 +0100)
committer: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fri, 13 Dec 2019 09:27:22 +0000 (11:27 +0200)
Our current implementation of performance queries is fairly harsh
because it completely flushes and invalidates the 3d pipeline caches
at the beginning and end of each query. An argument can be made that
this is how performance should be measured but it probably doesn't
reflect what the application is actually doing and the actual cost of
draw calls.

A more appropriate approach is to just stall the pipeline at
scoreboard, so that we measure the effect of a draw call without
having the pipeline in a completely pristine state for every draw
call.

v2: Use end of pipe PIPE_CONTROL instruction for Iris (Ken)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
src/gallium/drivers/iris/iris_perf.c
src/intel/perf/gen_perf.c
src/intel/perf/gen_perf.h
src/mesa/drivers/dri/i965/brw_performance_query.c

index 7c0378aacee97ceada76895a1d62265e58714318..1e5ec8140dcef3c79465f775502e4ed07d4be5d7 100644 (file)
@@ -31,18 +31,11 @@ iris_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size)
 }
 
 static void
-iris_perf_emit_mi_flush(struct iris_context *ice)
+iris_perf_emit_stall_at_pixel_scoreboard(struct iris_context *ice)
 {
-   const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                     PIPE_CONTROL_INSTRUCTION_INVALIDATE |
-                     PIPE_CONTROL_CONST_CACHE_INVALIDATE |
-                     PIPE_CONTROL_DATA_CACHE_FLUSH |
-                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                     PIPE_CONTROL_VF_CACHE_INVALIDATE |
-                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
-                     PIPE_CONTROL_CS_STALL;
-   iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER],
-                                "OA metrics", flags);
+   iris_emit_end_of_pipe_sync(&ice->batches[IRIS_BATCH_RENDER],
+                              "OA metrics",
+                              PIPE_CONTROL_STALL_AT_SCOREBOARD);
 }
 
 static void
@@ -106,7 +99,8 @@ iris_perf_init_vtbl(struct gen_perf_config *perf_cfg)
    perf_cfg->vtbl.bo_unreference = (bo_unreference_t)iris_bo_unreference;
    perf_cfg->vtbl.bo_map = (bo_map_t)iris_bo_map;
    perf_cfg->vtbl.bo_unmap = (bo_unmap_t)iris_bo_unmap;
-   perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)iris_perf_emit_mi_flush;
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard =
+      (emit_mi_flush_t)iris_perf_emit_stall_at_pixel_scoreboard;
 
    perf_cfg->vtbl.emit_mi_report_perf_count =
       (emit_mi_report_t)iris_perf_emit_mi_report_perf_count;
index daa092c88c9a4982dc252b126b0ec2a3631fe3b7..9e987d599d7824ab571cbc05e4f16e324de58d42 100644 (file)
@@ -1716,15 +1716,9 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
     * end snapshot - otherwise the results won't be a complete representation
     * of the work.
     *
-    * Theoretically there could be opportunities to minimize how much of the
-    * GPU pipeline is drained, or that we stall for, when we know what specific
-    * units the performance counters being queried relate to but we don't
-    * currently attempt to be clever here.
-    *
-    * Note: with our current simple approach here then for back-to-back queries
-    * we will redundantly emit duplicate commands to synchronize the command
-    * streamer with the rest of the GPU pipeline, but we assume that in HW the
-    * second synchronization is effectively a NOOP.
+    * To achieve this, we stall the pipeline at pixel scoreboard (preventing
+    * any additional work from being processed by the pipeline until all
+    * pixels of the previous draw have been completed).
     *
     * N.B. The final results are based on deltas of counters between (inside)
     * Begin/End markers so even though the total wall clock time of the
@@ -1738,7 +1732,7 @@ gen_perf_begin_query(struct gen_perf_context *perf_ctx,
     * This is our Begin synchronization point to drain current work on the
     * GPU before we capture our first counter snapshot...
     */
-   perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
 
    switch (queryinfo->kind) {
    case GEN_PERF_QUERY_TYPE_OA:
@@ -1920,7 +1914,7 @@ gen_perf_end_query(struct gen_perf_context *perf_ctx,
     * For more details see comment in brw_begin_perf_query for
     * corresponding flush.
     */
-  perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
 
    switch (query->queryinfo->kind) {
    case GEN_PERF_QUERY_TYPE_OA:
index 46d37e07c2575a3ad42761c866208a01d06344ad..2cd246a1dca167fe0a6dfd373efce92ad2b890b4 100644 (file)
@@ -219,7 +219,7 @@ struct gen_perf_config {
       bool (*batch_references)(void *batch, void *bo);
       void (*bo_wait_rendering)(void *bo);
       int (*bo_busy)(void *bo);
-      void (*emit_mi_flush)(void *ctx);
+      void (*emit_stall_at_pixel_scoreboard)(void *ctx);
       void (*emit_mi_report_perf_count)(void *ctx,
                                         void *bo,
                                         uint32_t offset_in_bytes,
index 0e5459e5e5e4a54af42a888521753a10f3df4198..cfd3efe374e0d34ae22b613d3b0418a408668d35 100644 (file)
@@ -459,6 +459,13 @@ brw_oa_batchbuffer_flush(void *c, const char *file, int line)
    _intel_batchbuffer_flush_fence(ctx, -1, NULL, file,  line);
 }
 
+static void
+brw_oa_emit_stall_at_pixel_scoreboard(void *c)
+{
+   struct brw_context *brw = c;
+   brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_STALL_AT_SCOREBOARD);
+}
+
 typedef void (*capture_frequency_stat_register_t)(void *, void *, uint32_t );
 typedef void (*store_register_mem64_t)(void *ctx, void *bo,
                                        uint32_t reg, uint32_t offset);
@@ -487,7 +494,8 @@ brw_init_perf_query_info(struct gl_context *ctx)
    perf_cfg->vtbl.bo_unreference = (bo_unreference_t)brw_bo_unreference;
    perf_cfg->vtbl.bo_map = (bo_map_t)brw_bo_map;
    perf_cfg->vtbl.bo_unmap = (bo_unmap_t)brw_bo_unmap;
-   perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)brw_emit_mi_flush;
+   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard =
+      (emit_mi_flush_t)brw_oa_emit_stall_at_pixel_scoreboard;
    perf_cfg->vtbl.emit_mi_report_perf_count =
       (emit_mi_report_t)brw_oa_emit_mi_report_perf_count;
    perf_cfg->vtbl.batchbuffer_flush = brw_oa_batchbuffer_flush;