From: Eric Anholt <eric@anholt.net>
Date: Thu, 3 Jan 2019 20:13:18 +0000 (-0800)
Subject: v3d: Do UBO loads a vector at a time.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=f8a8de8b9a69fc6f4a8fc86a71f81c168cdd18b0;p=mesa.git

v3d: Do UBO loads a vector at a time.

In the process of adding support for SSBOs and CS shared vars, I ended up
needing a helper function for doing TMU general ops.  This helper is the
starting point for that work, and it already saves us a bunch of round-trips
to the TMU by loading a vector at a time.
---

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index f2099182dcb..91d4ab0020e 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -32,6 +32,40 @@
 #include "common/v3d_device_info.h"
 #include "v3d_compiler.h"
 
+#define GENERAL_TMU_LOOKUP_PER_QUAD                 (0 << 7)
+#define GENERAL_TMU_LOOKUP_PER_PIXEL                (1 << 7)
+#define GENERAL_TMU_READ_OP_PREFETCH                (0 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_CLEAR             (1 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_FLUSH             (3 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_CLEAN             (3 << 3) /* NOTE(review): == CACHE_FLUSH, confirm */
+#define GENERAL_TMU_READ_OP_CACHE_L1T_CLEAR         (4 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_L1T_FLUSH_AGGREGATION (5 << 3)
+#define GENERAL_TMU_READ_OP_ATOMIC_INC              (8 << 3)
+#define GENERAL_TMU_READ_OP_ATOMIC_DEC              (9 << 3)
+#define GENERAL_TMU_READ_OP_ATOMIC_NOT              (10 << 3)
+#define GENERAL_TMU_READ_OP_READ                    (15 << 3)
+#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I              (0 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I             (1 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_VEC2                (2 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_VEC3                (3 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_VEC4                (4 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI             (5 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI            (6 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI            (7 << 0)
+/* WRITE_OP values occupy the same bits (<< 3) as the READ_OP values above. */
+#define GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP         (0 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_SUB_WRAP         (1 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_XCHG             (2 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG          (3 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_UMIN             (4 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_UMAX             (5 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_SMIN             (6 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_SMAX             (7 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_AND              (8 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_OR               (9 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_XOR              (10 << 3)
+#define GENERAL_TMU_WRITE_OP_WRITE                   (15 << 3)
+
 static void
 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
 
@@ -73,6 +107,60 @@ vir_emit_thrsw(struct v3d_compile *c)
         c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
 }
 
+/**
+ * Emits a load through the TMU general memory access interface: one TMU
+ * address write, then one LDTMU per destination component (see load_ubo).
+ */
+static void
+ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+        uint32_t tmu_op = GENERAL_TMU_READ_OP_READ;
+        bool has_index = instr->intrinsic == nir_intrinsic_load_ubo;
+        int offset_src = 0 + has_index; /* load_ubo: offset is src[1] */
+
+        /* QUNIFORM_UBO_ADDR takes the UBO index plus 1 (index 0 is
+         * gallium's constant buffer 0).
+         */
+        struct qreg offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+                                         nir_src_as_uint(instr->src[0]) + 1);
+
+        uint32_t config = (0xffffff00 |
+                           tmu_op |
+                           GENERAL_TMU_LOOKUP_PER_PIXEL);
+        if (instr->num_components == 1) {
+                config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+        } else {
+                config |= (GENERAL_TMU_LOOKUP_TYPE_VEC2 +
+                           instr->num_components - 2);
+        }
+        /* An all-ones config needs no uniform: write TMUA instead of TMUAU. */
+        struct qreg dest;
+        if (config == ~0)
+                dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+        else
+                dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+        /* A constant-zero offset needs no ADD: just MOV the base address. */
+        struct qinst *tmu;
+        if (nir_src_is_const(instr->src[offset_src]) &&
+            nir_src_as_uint(instr->src[offset_src]) == 0) {
+                tmu = vir_MOV_dest(c, dest, offset);
+        } else {
+                tmu = vir_ADD_dest(c, dest,
+                                   offset,
+                                   ntq_get_src(c, instr->src[offset_src], 0));
+        }
+        /* When writing TMUAU, attach the config as the implicit uniform. */
+        if (config != ~0) {
+                tmu->src[vir_get_implicit_uniform_src(tmu)] =
+                        vir_uniform_ui(c, config);
+        }
+
+        vir_emit_thrsw(c);
+        /* After the thread switch, read one LDTMU result per component. */
+        for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
+                ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
+}
+
 static struct qreg
 indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
 {
@@ -1547,41 +1635,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_load_ubo:
-                for (int i = 0; i < instr->num_components; i++) {
-                        int ubo = nir_src_as_uint(instr->src[0]);
-
-                        /* XXX perf: On V3D 4.x with uniform offsets, we
-                         * should probably try setting UBOs up in the A
-                         * register file and doing a sequence of loads that
-                         * way.
-                         */
-                        /* Adjust for where we stored the TGSI register base. */
-                        vir_ADD_dest(c,
-                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
-                                     vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo),
-                                     vir_ADD(c,
-                                             ntq_get_src(c, instr->src[1], 0),
-                                             vir_uniform_ui(c, i * 4)));
-
-                        vir_emit_thrsw(c);
-
-                        ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
-                }
-                break;
-
-                if (nir_src_is_const(instr->src[0])) {
-                        offset = (nir_intrinsic_base(instr) +
-                                  nir_src_as_uint(instr->src[0]));
-                        assert(offset % 4 == 0);
-                        /* We need dwords */
-                        offset = offset / 4;
-                        ntq_store_dest(c, &instr->dest, 0,
-                                       vir_uniform(c, QUNIFORM_UNIFORM,
-                                                   offset));
-                } else {
-                        ntq_store_dest(c, &instr->dest, 0,
-                                       indirect_uniform_load(c, instr));
-                }
+                ntq_emit_tmu_general(c, instr);
                 break;
 
         case nir_intrinsic_load_user_clip_plane:
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 2f32359f384..6eb346ce9fd 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -61,6 +61,16 @@ vir_has_implicit_uniform(struct qinst *inst)
                 switch (inst->dst.file) {
                 case QFILE_TLBU:
                         return true;
+                case QFILE_MAGIC:
+                        switch (inst->dst.index) {
+                        case V3D_QPU_WADDR_TLBU:
+                        case V3D_QPU_WADDR_TMUAU:
+                        case V3D_QPU_WADDR_SYNCU:
+                                return true;
+                        default:
+                                break;
+                        }
+                        break;
                 default:
                         return inst->has_implicit_uniform;
                 }