}
static void
-iris_perf_emit_mi_flush(struct iris_context *ice)
+iris_perf_emit_stall_at_pixel_scoreboard(struct iris_context *ice)
{
- const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH |
- PIPE_CONTROL_INSTRUCTION_INVALIDATE |
- PIPE_CONTROL_CONST_CACHE_INVALIDATE |
- PIPE_CONTROL_DATA_CACHE_FLUSH |
- PIPE_CONTROL_DEPTH_CACHE_FLUSH |
- PIPE_CONTROL_VF_CACHE_INVALIDATE |
- PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
- PIPE_CONTROL_CS_STALL;
- iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER],
- "OA metrics", flags);
+ iris_emit_end_of_pipe_sync(&ice->batches[IRIS_BATCH_RENDER],
+ "OA metrics",
+ PIPE_CONTROL_STALL_AT_SCOREBOARD);
}
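For context, iris_emit_end_of_pipe_sync() is assumed to fold the requested bits into a single PIPE_CONTROL that also carries a CS stall and a post-sync write, so the new call above should roughly amount to the sketch below. The helper internals, the use of the screen's workaround_bo as the write target, and the sketch_iris_end_of_pipe_stall name are illustrative assumptions, not part of this change.

static void
sketch_iris_end_of_pipe_stall(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   /* Rough sketch only: one PIPE_CONTROL that stalls at the pixel
    * scoreboard, stalls the command streamer, and performs a post-sync
    * immediate write so the sync point is observable. */
   iris_emit_pipe_control_write(batch, "OA metrics",
                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_WRITE_IMMEDIATE,
                                batch->screen->workaround_bo,
                                0 /* offset */, 0 /* immediate */);
}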
static void
perf_cfg->vtbl.bo_unreference = (bo_unreference_t)iris_bo_unreference;
perf_cfg->vtbl.bo_map = (bo_map_t)iris_bo_map;
perf_cfg->vtbl.bo_unmap = (bo_unmap_t)iris_bo_unmap;
- perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)iris_perf_emit_mi_flush;
+ perf_cfg->vtbl.emit_stall_at_pixel_scoreboard =
+ (emit_mi_flush_t)iris_perf_emit_stall_at_pixel_scoreboard;
perf_cfg->vtbl.emit_mi_report_perf_count =
(emit_mi_report_t)iris_perf_emit_mi_report_perf_count;
* end snapshot - otherwise the results won't be a complete representation
* of the work.
*
- * Theoretically there could be opportunities to minimize how much of the
- * GPU pipeline is drained, or that we stall for, when we know what specific
- * units the performance counters being queried relate to but we don't
- * currently attempt to be clever here.
- *
- * Note: with our current simple approach here then for back-to-back queries
- * we will redundantly emit duplicate commands to synchronize the command
- * streamer with the rest of the GPU pipeline, but we assume that in HW the
- * second synchronization is effectively a NOOP.
+ * To achieve this, we stall the pipeline at the pixel scoreboard (preventing
+ * any additional work from being processed by the pipeline until all pixels
+ * of the previous draw have completed).
*
* N.B. The final results are based on deltas of counters between (inside)
* Begin/End markers so even though the total wall clock time of the
* This is our Begin synchronization point to drain current work on the
* GPU before we capture our first counter snapshot...
*/
- perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+ perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
switch (queryinfo->kind) {
case GEN_PERF_QUERY_TYPE_OA:
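At the packet level, the stall requested above (and the matching one at the End synchronization point below) is assumed to correspond to a PIPE_CONTROL with the stall-at-scoreboard and CS-stall bits set. A minimal sketch, assuming the genxml field names already used by the iris pipe-control code, where "batch" stands for the render batch being synchronized:

   iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
      /* Hold further work until all outstanding pixels have completed. */
      pc.StallAtScoreboard          = true;
      /* Keep the command streamer from racing past the stall. */
      pc.CommandStreamerStallEnable = true;
   }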
* For more details see comment in brw_begin_perf_query for
* corresponding flush.
*/
- perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx);
+ perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
switch (query->queryinfo->kind) {
case GEN_PERF_QUERY_TYPE_OA:
bool (*batch_references)(void *batch, void *bo);
void (*bo_wait_rendering)(void *bo);
int (*bo_busy)(void *bo);
- void (*emit_mi_flush)(void *ctx);
+ void (*emit_stall_at_pixel_scoreboard)(void *ctx);
void (*emit_mi_report_perf_count)(void *ctx,
void *bo,
uint32_t offset_in_bytes,
_intel_batchbuffer_flush_fence(ctx, -1, NULL, file, line);
}
+static void
+brw_oa_emit_stall_at_pixel_scoreboard(void *c)
+{
+ struct brw_context *brw = c;
+ brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_STALL_AT_SCOREBOARD);
+}
+
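The i965 counterpart mirrors the iris change: brw_emit_end_of_pipe_sync() is assumed to expand the requested stall into a PIPE_CONTROL write on Gen6+, roughly as sketched below. The use of brw->workaround_bo as the post-sync write target is an assumption about the existing helper, not something this patch adds.

   /* Rough sketch only: assumed expansion of the callback above. */
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_STALL_AT_SCOREBOARD |
                               PIPE_CONTROL_CS_STALL |
                               PIPE_CONTROL_WRITE_IMMEDIATE,
                               brw->workaround_bo,
                               0 /* offset */, 0 /* immediate */);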
typedef void (*capture_frequency_stat_register_t)(void *, void *, uint32_t );
typedef void (*store_register_mem64_t)(void *ctx, void *bo,
uint32_t reg, uint32_t offset);
perf_cfg->vtbl.bo_unreference = (bo_unreference_t)brw_bo_unreference;
perf_cfg->vtbl.bo_map = (bo_map_t)brw_bo_map;
perf_cfg->vtbl.bo_unmap = (bo_unmap_t)brw_bo_unmap;
- perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)brw_emit_mi_flush;
+ perf_cfg->vtbl.emit_stall_at_pixel_scoreboard =
+ (emit_mi_flush_t)brw_oa_emit_stall_at_pixel_scoreboard;
perf_cfg->vtbl.emit_mi_report_perf_count =
(emit_mi_report_t)brw_oa_emit_mi_report_perf_count;
perf_cfg->vtbl.batchbuffer_flush = brw_oa_batchbuffer_flush;