From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 10 Feb 2016 19:40:01 +0000 (-0500)
Subject: freedreno/a4xx: implement time-elapsed query
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=ba194630cc89dc508aeac77a280ee5704ca48adf;p=mesa.git

freedreno/a4xx: implement time-elapsed query

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---

diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
index 074c5a752bf..0c1027d5804 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
@@ -49,6 +49,8 @@ struct fd4_context {
 
 	/* This only needs to be 4 * num_of_pipes bytes (ie. 32 bytes).  We
 	 * could combine it with another allocation.
+	 *
+	 * (bytes from offset 128 are scratch for time-elapsed, see fd4_query.c)
 	 */
 	struct fd_bo *vsc_size_mem;
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
index a1fafbc6128..14a809431ac 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
@@ -31,6 +31,7 @@
 #include "freedreno_util.h"
 
 #include "fd4_query.h"
+#include "fd4_context.h"
 #include "fd4_draw.h"
 #include "fd4_format.h"
 
@@ -107,6 +108,127 @@ occlusion_predicate_accumulate_result(struct fd_context *ctx,
 	result->b |= (n > 0);
 }
 
+/*
+ * Time Elapsed Query:
+ *
+ * Note: we could in theory support timestamp queries, but they
+ * won't give sensible results for tilers.
+ */
+
+static void
+time_elapsed_enable(struct fd_context *ctx, struct fd_ringbuffer *ring)
+{
+	/* Program REG_A4XX_CP_PERFCTR_CP_SEL_0 with CP_ALWAYS_COUNT.  The
+	 * assignment of countable to counter register is just hard coded;
+	 * if we expose more countables than counters, be more clever.
+	 */
+	fd_wfi(ctx, ring);
+	OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
+	OUT_RING(ring, CP_ALWAYS_COUNT);
+}
+
+static struct fd_hw_sample *
+time_elapsed_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
+{
+	struct fd_hw_sample *samp = fd_hw_sample_init(ctx, sizeof(uint64_t));
+
+	/* use unused part of vsc_size_mem as scratch space, to avoid
+	 * extra allocation (counter sample at +128, dest address at +136):
+	 */
+	struct fd_bo *scratch_bo = fd4_context(ctx)->vsc_size_mem;
+	const int sample_off = 128;
+	const int addr_off = sample_off + 8;
+
+	debug_assert(ctx->screen->max_freq > 0);  /* divisor in accumulate_result */
+
+	/* Emit cmds that snapshot the 64b counter into this sample's slot.
+	 * Basic issue is that the destination is relative (per-tile base
+	 * plus samp->offset) rather than an absolute addr, and there is no
+	 * pm4 packet that can do that.  This is where it would be *really*
+	 * nice if we could write our own fw, since afaict implementing the
+	 * sort of packet we need would be trivial.
+	 *
+	 * Instead, we:
+	 * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
+	 * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
+	 * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
+	 *     address to the per-sample offset in the scratch buffer
+	 * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
+	 *     to CP_ME_NRT_ADDR
+	 * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
+	 *     buffer to CP_ME_NRT_DATA to trigger the write out to query
+	 *     result buffer
+	 *
+	 * Straightforward, right?
+	 *
+	 * Maybe could swap the order of things in the scratch buffer to
+	 * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
+	 * shot, but that's really just polishing a turd..
+	 */
+
+	fd_wfi(ctx, ring);
+
+	/* (1) copy sample counter _LO and _HI to scratch: */
+	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
+	OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
+			CP_REG_TO_MEM_0_64B |
+			CP_REG_TO_MEM_0_CNT(2-1)); /* write 2 regs to mem */
+	OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);
+
+	/* ok... here we really *would* like to use the CP_SET_CONSTANT
+	 * mode which can add a constant to value in reg2 and write to
+	 * reg1... *but* that only works for banked/context registers,
+	 * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
+	 * CP math to the scratch buffer instead:
+	 *
+	 * (note first 8 bytes at sample_off are the counter value, so
+	 * addr_off = sample_off + 8 is used for the address calculation)
+	 */
+
+	/* (2) per-sample offset to scratch bo: */
+	OUT_PKT3(ring, CP_MEM_WRITE, 2);
+	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
+	OUT_RING(ring, samp->offset);
+
+	/* (3) now add to that the per-tile base: */
+	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
+	OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
+			CP_REG_TO_MEM_0_ACCUMULATE |
+			CP_REG_TO_MEM_0_CNT(1-1));       /* read back 1 reg */
+	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
+
+	/* (4) now copy that back to CP_ME_NRT_ADDR: */
+	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
+	OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
+	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
+
+	/* (5) and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
+	 * to trigger the write to result buffer
+	 */
+	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
+	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
+	OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);
+
+	/* and again (at sample_off + 4) for the _HI reg value from scratch: */
+	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
+	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
+	OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);
+
+	/* Sigh.. */
+
+	return samp;
+}
+
+static void
+time_elapsed_accumulate_result(struct fd_context *ctx, const void *start,
+		const void *end, union pipe_query_result *result)
+{
+	uint64_t n = *(const uint64_t *)end - *(const uint64_t *)start;
+	uint64_t hz = ctx->screen->max_freq;
+	/* cycles -> ns (max_freq is Hz); split so n * 1e9 cannot overflow: */
+	result->u64 += (n / hz) * 1000000000 + (n % hz) * 1000000000 / hz;
+}
+
 static const struct fd_hw_sample_provider occlusion_counter = {
 		.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
 		.active = FD_STAGE_DRAW,
@@ -121,8 +243,17 @@ static const struct fd_hw_sample_provider occlusion_predicate = {
 		.accumulate_result = occlusion_predicate_accumulate_result,
 };
 
+static const struct fd_hw_sample_provider time_elapsed = {
+		.query_type = PIPE_QUERY_TIME_ELAPSED,
+		.active = FD_STAGE_DRAW,  /* same stage as occlusion_counter */
+		.enable = time_elapsed_enable,  /* selects CP_ALWAYS_COUNT */
+		.get_sample = time_elapsed_get_sample,  /* 64b counter snapshot */
+		.accumulate_result = time_elapsed_accumulate_result,  /* to ns */
+};
+
 void fd4_query_context_init(struct pipe_context *pctx)
 {
 	fd_hw_query_register_provider(pctx, &occlusion_counter);
 	fd_hw_query_register_provider(pctx, &occlusion_predicate);
+	fd_hw_query_register_provider(pctx, &time_elapsed);  /* PIPE_QUERY_TIME_ELAPSED */
 }