diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
index a1fafbc61283915002f195f8b15bda83edf43be2..f7b385d552d4b0d2ea864f689bfc04d10c18cddf 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
@@ -31,6 +31,7 @@
 #include "freedreno_util.h"
 
 #include "fd4_query.h"
+#include "fd4_context.h"
 #include "fd4_draw.h"
 #include "fd4_format.h"
 
@@ -47,10 +48,10 @@ struct fd_rb_samp_ctrs {
  */
 
 static struct fd_hw_sample *
-occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
+occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
 {
        struct fd_hw_sample *samp =
-                       fd_hw_sample_init(ctx, sizeof(struct fd_rb_samp_ctrs));
+                       fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));
 
        /* low bits of sample addr should be zero (since they are control
         * flags in RB_SAMPLE_COUNT_CONTROL):
@@ -72,7 +73,7 @@ occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
        OUT_RING(ring, 1);             /* NumInstances */
        OUT_RING(ring, 0);             /* NumIndices */
 
-       fd_event_write(ctx, ring, ZPASS_DONE);
+       fd_event_write(batch, ring, ZPASS_DONE);
 
        return samp;
 }
@@ -81,12 +82,7 @@ static uint64_t
 count_samples(const struct fd_rb_samp_ctrs *start,
                const struct fd_rb_samp_ctrs *end)
 {
-       uint64_t n = 0;
-
-       for (unsigned i = 0; i < 16; i += 4)
-               n += end->ctr[i] - start->ctr[i];
-
-       return n / 2;
+       return end->ctr[0] - start->ctr[0];
 }
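
For context, the per-tile (start, end) counter pairs are combined by the provider's accumulate callback. A minimal sketch of the occlusion-counter case, matching the callback signature visible in the hunk below (the body is illustrative, not part of this patch):

static void
occlusion_counter_accumulate_result(struct fd_context *ctx,
		const void *start, const void *end,
		union pipe_query_result *result)
{
	/* one (start, end) pair per tile; sum the deltas across tiles */
	result->u64 += count_samples(start, end);
}
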
 
 static void
@@ -107,6 +103,140 @@ occlusion_predicate_accumulate_result(struct fd_context *ctx,
        result->b |= (n > 0);
 }
 
+/*
+ * Time Elapsed Query:
+ *
+ * Note: we could in theory support timestamp queries, but they
+ * won't give sensible results for tilers.
+ */
+
+static void
+time_elapsed_enable(struct fd_context *ctx, struct fd_ringbuffer *ring)
+{
+       /* Right now, the assignment of countable to counter register is
+        * just hard coded.  If we start exposing more countables than we
+        * have counters, we will need to be more clever.
+        */
+       fd_wfi(ctx->batch, ring);
+       OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
+       OUT_RING(ring, CP_ALWAYS_COUNT);
+}
+
+static struct fd_hw_sample *
+time_elapsed_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
+{
+       struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));
+
+       /* use unused part of vsc_size_mem as scratch space, to avoid
+        * extra allocation:
+        */
+       struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
+       const int sample_off = 128;
+       const int addr_off = sample_off + 8;
+
+       debug_assert(batch->ctx->screen->max_freq > 0);
+
+       /* Basic issue is that we need to read counter value to a relative
+        * destination (with per-tile offset) rather than absolute dest
+        * addr.  But there is no pm4 packet that can do that.  This is
+        * where it would be *really* nice if we could write our own fw
+        * since afaict implementing the sort of packet we need would be
+        * trivial.
+        *
+        * Instead, we:
+        * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
+        * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
+        * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
+        *     address to the per-sample offset in the scratch buffer
+        * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
+        *     to CP_ME_NRT_ADDR
+        * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
+        *     buffer to CP_ME_NRT_DATA to trigger the write out to query
+        *     result buffer
+        *
+        * Straightforward, right?
+        *
+        * We could maybe swap the order of things in the scratch buffer to
+        * put the address first, and copy back to CP_ME_NRT_ADDR+DATA in one
+        * shot, but that's really just polishing a turd..
+        */
+
+       fd_wfi(batch, ring);
+
+       /* copy sample counter _LO and _HI to scratch: */
+       OUT_PKT3(ring, CP_REG_TO_MEM, 2);
+       OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
+                       CP_REG_TO_MEM_0_64B |
+                       CP_REG_TO_MEM_0_CNT(2-1)); /* write 2 regs to mem */
+       OUT_RELOCW(ring, scratch_bo, sample_off, 0, 0);
+
+       /* ok... here we really *would* like to use the CP_SET_CONSTANT
+        * mode which can add a constant to the value in reg2 and write to
+        * reg1... *but* that only works for banked/context registers,
+        * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
+        * CP math to the scratch buffer instead:
+        *
+        * (note first 8 bytes are counter value, use offset 0x8 for
+        * address calculation)
+        */
+
+       /* per-sample offset to scratch bo: */
+       OUT_PKT3(ring, CP_MEM_WRITE, 2);
+       OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0);
+       OUT_RING(ring, samp->offset);
+
+       /* now add to that the per-tile base: */
+       OUT_PKT3(ring, CP_REG_TO_MEM, 2);
+       OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
+                       CP_REG_TO_MEM_0_ACCUMULATE |
+                       CP_REG_TO_MEM_0_CNT(1-1));       /* readback 1 reg */
+       OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0);
+
+       /* now copy that back to CP_ME_NRT_ADDR: */
+       OUT_PKT3(ring, CP_MEM_TO_REG, 2);
+       OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
+       OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
+
+       /* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
+        * to trigger the write to result buffer
+        */
+       OUT_PKT3(ring, CP_MEM_TO_REG, 2);
+       OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
+       OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);
+
+       /* and again to get the value of the _HI reg from scratch: */
+       OUT_PKT3(ring, CP_MEM_TO_REG, 2);
+       OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
+       OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);
+
+       /* Sigh.. */
+
+       return samp;
+}
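
Since the packet sequence above is dense, here is its net effect expressed as a plain C sketch. The names counter, tile_base and offset stand in for the values the CP reads from RBBM_PERFCTR_CP_0_LO/_HI, HW_QUERY_BASE_REG and the CP_MEM_WRITE; none of them are real symbols in this file:

/* Illustrative model only: what ends up in the scratch buffer and the
 * query result buffer after steps (1)..(5) above.
 */
static void
nrt_writeout_model(uint64_t counter, uint64_t tile_base, uint32_t offset)
{
	uint64_t scratch_counter;               /* scratch_bo + sample_off    */
	uint64_t scratch_addr;                  /* scratch_bo + addr_off      */

	scratch_counter = counter;              /* (1) CP_REG_TO_MEM, 64b     */
	scratch_addr    = offset;               /* (2) CP_MEM_WRITE           */
	scratch_addr   += tile_base;            /* (3) CP_REG_TO_MEM + ACCUM  */

	/* (4) CP_MEM_TO_REG -> CP_ME_NRT_ADDR latches the destination, then
	 * (5) two writes to CP_ME_NRT_DATA stream out the saved counter:
	 */
	uint64_t *dst = (uint64_t *)(uintptr_t)scratch_addr;
	*dst = scratch_counter;
}
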
+
+static void
+time_elapsed_accumulate_result(struct fd_context *ctx,
+               const void *start, const void *end,
+               union pipe_query_result *result)
+{
+       uint64_t n = *(uint64_t *)end - *(uint64_t *)start;
+       /* max_freq is in Hz, convert cycle count to ns: */
+       result->u64 += n * 1000000000 / ctx->screen->max_freq;
+}
+
+static void
+timestamp_accumulate_result(struct fd_context *ctx,
+               const void *start, const void *end,
+               union pipe_query_result *result)
+{
+       /* just return the value from the first tile: */
+       if (result->u64 != 0)
+               return;
+       uint64_t n = *(uint64_t *)start;
+       /* max_freq is in Hz, convert cycle count to ns: */
+       result->u64 = n * 1000000000 / ctx->screen->max_freq;
+}
+
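
Both accumulate callbacks use the same cycles-to-nanoseconds conversion. As a worked example with a hypothetical max_freq of 500 MHz, a delta of 5,000,000 counter ticks comes out to 5e6 * 1e9 / 5e8 = 10,000,000 ns, i.e. 10 ms. A small helper-style sketch of that arithmetic (ticks_to_ns is hypothetical, not something this driver defines):

static inline uint64_t
ticks_to_ns(uint64_t ticks, uint32_t max_freq /* Hz */)
{
	/* same arithmetic as the accumulate callbacks above */
	return ticks * 1000000000ull / max_freq;
}
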
 static const struct fd_hw_sample_provider occlusion_counter = {
                .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
                .active = FD_STAGE_DRAW,
@@ -121,8 +251,39 @@ static const struct fd_hw_sample_provider occlusion_predicate = {
                .accumulate_result = occlusion_predicate_accumulate_result,
 };
 
+static const struct fd_hw_sample_provider time_elapsed = {
+               .query_type = PIPE_QUERY_TIME_ELAPSED,
+               .active = FD_STAGE_DRAW | FD_STAGE_CLEAR,
+               .enable = time_elapsed_enable,
+               .get_sample = time_elapsed_get_sample,
+               .accumulate_result = time_elapsed_accumulate_result,
+};
+
+/* NOTE: timestamp queries aren't going to give terribly sensible results
+ * on a tiler.  But they are needed by the qapitrace profile heatmap.  If
+ * you add in a binning pass, the results get even more nonsensical.  So
+ * we just return the timestamp from the first tile and hope that is
+ * kind of good enough.
+ */
+static const struct fd_hw_sample_provider timestamp = {
+               .query_type = PIPE_QUERY_TIMESTAMP,
+               .active = FD_STAGE_ALL,
+               .enable = time_elapsed_enable,
+               .get_sample = time_elapsed_get_sample,
+               .accumulate_result = timestamp_accumulate_result,
+};
+
 void fd4_query_context_init(struct pipe_context *pctx)
 {
+       struct fd_context *ctx = fd_context(pctx);
+
+       ctx->create_query = fd_hw_create_query;
+       ctx->query_prepare = fd_hw_query_prepare;
+       ctx->query_prepare_tile = fd_hw_query_prepare_tile;
+       ctx->query_set_stage = fd_hw_query_set_stage;
+
        fd_hw_query_register_provider(pctx, &occlusion_counter);
        fd_hw_query_register_provider(pctx, &occlusion_predicate);
+       fd_hw_query_register_provider(pctx, &time_elapsed);
+       fd_hw_query_register_provider(pctx, &timestamp);
 }
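
For reference, a sketch of how a state tracker ends up exercising the providers registered here, via the standard gallium query hooks (error handling omitted; pctx is any pipe_context created on this driver):

struct pipe_query *q = pctx->create_query(pctx, PIPE_QUERY_TIME_ELAPSED, 0);

pctx->begin_query(pctx, q);
/* ... draw calls to be timed ... */
pctx->end_query(pctx, q);

union pipe_query_result res = {0};
if (pctx->get_query_result(pctx, q, true /* wait */, &res))
	printf("GPU time: %llu ns\n", (unsigned long long)res.u64);

pctx->destroy_query(pctx, q);

The u64 result is what time_elapsed_accumulate_result built up across all tiles, already converted to nanoseconds.
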