From ba194630cc89dc508aeac77a280ee5704ca48adf Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Wed, 10 Feb 2016 14:40:01 -0500
Subject: [PATCH] freedreno/a4xx: implement time-elapsed query

Signed-off-by: Rob Clark
---
Review notes: time_elapsed_accumulate_result() now converts cycles to ns
without overflowing 64b and without dividing by a zero max_freq; short
function comments were added.  Hunk offsets and diffstat were updated to
match.

 .../drivers/freedreno/a4xx/fd4_context.h      |   2 +
 .../drivers/freedreno/a4xx/fd4_query.c        | 155 ++++++++++++++++++
 2 files changed, 157 insertions(+)

diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
index 074c5a752bf..0c1027d5804 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
@@ -49,6 +49,8 @@ struct fd4_context {
 
 	/* This only needs to be 4 * num_of_pipes bytes (ie. 32 bytes).  We
 	 * could combine it with another allocation.
+	 *
+	 * (upper area used as scratch bo.. see fd4_query)
 	 */
 	struct fd_bo *vsc_size_mem;
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
index a1fafbc6128..14a809431ac 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
@@ -31,6 +31,7 @@
 #include "freedreno_util.h"
 
 #include "fd4_query.h"
+#include "fd4_context.h"
 #include "fd4_draw.h"
 #include "fd4_format.h"
 
@@ -107,6 +108,151 @@ occlusion_predicate_accumulate_result(struct fd_context *ctx,
 	result->b |= (n > 0);
 }
 
+/*
+ * Time Elapsed Query:
+ *
+ * Note: we could in theory support timestamp queries, but they
+ * won't give sensible results for tilers.
+ */
+
+/* Select the CP_ALWAYS_COUNT countable via CP_PERFCTR_CP_SEL_0; the
+ * CP_0 perf counter is what time_elapsed_get_sample() reads back.
+ */
+static void
+time_elapsed_enable(struct fd_context *ctx, struct fd_ringbuffer *ring)
+{
+	/* Right now, the assignment of countable to counter register is
+	 * just hard coded. If we start exposing more countables than we
+	 * have counters, we will need to be more clever.
+	 */
+	fd_wfi(ctx, ring);
+	OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
+	OUT_RING(ring, CP_ALWAYS_COUNT);
+}
+
+/* Emit the cmdstream that snapshots the 64b CP_0 perf counter into the
+ * query result buffer (see below for why that takes so many packets).
+ */
+static struct fd_hw_sample *
+time_elapsed_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
+{
+	struct fd_hw_sample *samp = fd_hw_sample_init(ctx, sizeof(uint64_t));
+
+	/* use unused part of vsc_size_mem as scratch space, to avoid
+	 * extra allocation:
+	 */
+	struct fd_bo *scratch_bo = fd4_context(ctx)->vsc_size_mem;
+	const int sample_off = 128;
+	const int addr_off = sample_off + 8;
+
+	debug_assert(ctx->screen->max_freq > 0);
+
+	/* Basic issue is that we need to read counter value to a relative
+	 * destination (with per-tile offset) rather than absolute dest
+	 * addr. But there is no pm4 packet that can do that. This is
+	 * where it would be *really* nice if we could write our own fw
+	 * since afaict implementing the sort of packet we need would be
+	 * trivial.
+	 *
+	 * Instead, we:
+	 * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
+	 * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
+	 * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
+	 *     address to the per-sample offset in the scratch buffer
+	 * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
+	 *     to CP_ME_NRT_ADDR
+	 * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
+	 *     buffer to CP_ME_NRT_DATA to trigger the write out to query
+	 *     result buffer
+	 *
+	 * Straightforward, right?
+	 *
+	 * Maybe could swap the order of things in the scratch buffer to
+	 * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
+	 * shot, but that's really just polishing a turd..
+	 */
+
+	fd_wfi(ctx, ring);
+
+	/* copy sample counter _LO and _HI to scratch: */
+	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
+	OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
+			CP_REG_TO_MEM_0_64B |
+			CP_REG_TO_MEM_0_CNT(2-1)); /* write 2 regs to mem */
+	OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);
+
+	/* ok... here we really *would* like to use the CP_SET_CONSTANT
+	 * mode which can add a constant to value in reg2 and write to
+	 * reg1... *but* that only works for banked/context registers,
+	 * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
+	 * CP math to the scratch buffer instead:
+	 *
+	 * (note first 8 bytes are counter value, use offset 0x8 for
+	 * address calculation)
+	 */
+
+	/* per-sample offset to scratch bo: */
+	OUT_PKT3(ring, CP_MEM_WRITE, 2);
+	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
+	OUT_RING(ring, samp->offset);
+
+	/* now add to that the per-tile base: */
+	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
+	OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
+			CP_REG_TO_MEM_0_ACCUMULATE |
+			CP_REG_TO_MEM_0_CNT(1-1)); /* readback 1 regs */
+	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
+
+	/* now copy that back to CP_ME_NRT_ADDR: */
+	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
+	OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
+	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
+
+	/* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
+	 * to trigger the write to result buffer
+	 */
+	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
+	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
+	OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);
+
+	/* and again to get the value of the _HI reg from scratch: */
+	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
+	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
+	OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);
+
+	/* Sigh.. */
+
+	return samp;
+}
+
+/* Add the time between two counter snapshots, in ns, to result->u64.
+ *
+ * 'start' and 'end' each point at a 64b cycle count, as written by the
+ * cmdstream from time_elapsed_get_sample().
+ */
+static void
+time_elapsed_accumulate_result(struct fd_context *ctx,
+		const void *start, const void *end,
+		union pipe_query_result *result)
+{
+	const uint64_t ns_per_sec = 1000000000;
+	const uint64_t freq = ctx->screen->max_freq;
+	uint64_t n = *(const uint64_t *)end - *(const uint64_t *)start;
+
+	/* get_sample() only asserts max_freq in debug builds, so don't
+	 * divide by zero if the frequency is unknown:
+	 */
+	if (!freq)
+		return;
+
+	/* max_freq is in Hz, convert cycle count to ns.  Scale whole
+	 * seconds and the sub-second remainder separately, since
+	 * n * 1000000000 overflows 64b after ~18.4e9 cycles:
+	 */
+	result->u64 += (n / freq) * ns_per_sec +
+			(n % freq) * ns_per_sec / freq;
+}
+
 static const struct fd_hw_sample_provider occlusion_counter = {
 		.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
 		.active = FD_STAGE_DRAW,
@@ -121,8 +267,17 @@ static const struct fd_hw_sample_provider occlusion_predicate = {
 		.accumulate_result = occlusion_predicate_accumulate_result,
 };
 
+static const struct fd_hw_sample_provider time_elapsed = {
+		.query_type = PIPE_QUERY_TIME_ELAPSED,
+		.active = FD_STAGE_DRAW,
+		.enable = time_elapsed_enable,
+		.get_sample = time_elapsed_get_sample,
+		.accumulate_result = time_elapsed_accumulate_result,
+};
+
 void fd4_query_context_init(struct pipe_context *pctx)
 {
 	fd_hw_query_register_provider(pctx, &occlusion_counter);
 	fd_hw_query_register_provider(pctx, &occlusion_predicate);
+	fd_hw_query_register_provider(pctx, &time_elapsed);
 }
-- 
2.30.2