+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_ssbo:
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_uniform:
+ case nir_intrinsic_load_shared:
+ case nir_intrinsic_load_scratch:
+ return GENERAL_TMU_READ_OP_READ;
+ case nir_intrinsic_store_ssbo:
+ case nir_intrinsic_store_shared:
+ case nir_intrinsic_store_scratch:
+ return GENERAL_TMU_WRITE_OP_WRITE;
+ case nir_intrinsic_ssbo_atomic_add:
+ case nir_intrinsic_shared_atomic_add:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
+ case nir_intrinsic_ssbo_atomic_imin:
+ case nir_intrinsic_shared_atomic_imin:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
+ case nir_intrinsic_ssbo_atomic_umin:
+ case nir_intrinsic_shared_atomic_umin:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
+ case nir_intrinsic_ssbo_atomic_imax:
+ case nir_intrinsic_shared_atomic_imax:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
+ case nir_intrinsic_ssbo_atomic_umax:
+ case nir_intrinsic_shared_atomic_umax:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
+ case nir_intrinsic_ssbo_atomic_and:
+ case nir_intrinsic_shared_atomic_and:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
+ case nir_intrinsic_ssbo_atomic_or:
+ case nir_intrinsic_shared_atomic_or:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
+ case nir_intrinsic_ssbo_atomic_xor:
+ case nir_intrinsic_shared_atomic_xor:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
+ case nir_intrinsic_ssbo_atomic_exchange:
+ case nir_intrinsic_shared_atomic_exchange:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ case nir_intrinsic_shared_atomic_comp_swap:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
+ default:
+ unreachable("unknown intrinsic op");
+ }
+}
+
+/**
+ * Implements indirect uniform loads, UBO loads, and SSBO, shared and scratch
+ * memory accesses (loads, stores and, for SSBO/shared, atomics) through the
+ * TMU general memory access interface.
+ */
+static void
+ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
+ bool is_shared_or_scratch)
+{
+ /* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR
+ * wants to have support for inc/dec?
+ */
+
+ uint32_t tmu_op = v3d_general_tmu_op(instr);
+ bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
+ instr->intrinsic == nir_intrinsic_store_scratch ||
+ instr->intrinsic == nir_intrinsic_store_shared);
+ bool has_index = !is_shared_or_scratch;
+
+ int offset_src;
+ int tmu_writes = 1; /* address */
+ if (instr->intrinsic == nir_intrinsic_load_uniform) {
+ offset_src = 0;
+ } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
+ instr->intrinsic == nir_intrinsic_load_ubo ||
+ instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_load_shared) {
+ offset_src = 0 + has_index;
+ } else if (is_store) {
+ offset_src = 1 + has_index;
+ for (int i = 0; i < instr->num_components; i++) {
+ vir_MOV_dest(c,
+ vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+ ntq_get_src(c, instr->src[0], i));
+ tmu_writes++;
+ }
+ } else {
+ offset_src = 0 + has_index;
+ vir_MOV_dest(c,
+ vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+ ntq_get_src(c, instr->src[1 + has_index], 0));
+ tmu_writes++;
+ if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
+ vir_MOV_dest(c,
+ vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+ ntq_get_src(c, instr->src[2 + has_index],
+ 0));
+ tmu_writes++;
+ }
+ }
+
+ bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
+ uint32_t const_offset = 0;
+ if (!dynamic_src)
+ const_offset = nir_src_as_uint(instr->src[offset_src]);
+
+ /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
+ * storing at the same time.
+ */
+ while (tmu_writes > 16 / c->threads)
+ c->threads /= 2;
+
+ struct qreg offset;
+ if (instr->intrinsic == nir_intrinsic_load_uniform) {
+ const_offset += nir_intrinsic_base(instr);
+ offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+ v3d_unit_data_create(0, const_offset));
+ const_offset = 0;
+ } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
+ uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
+ /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
+ * 1 (0 is gallium's constant buffer 0).
+ */
+ offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+ v3d_unit_data_create(index, const_offset));
+ const_offset = 0;
+ } else if (is_shared_or_scratch) {
+ /* Shared and scratch variables have no buffer index, and all
+ * start from a common base that we set up at the start of
+ * dispatch.
+ */
+ if (instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_store_scratch) {
+ offset = c->spill_base;
+ } else {
+ offset = c->cs_shared_offset;
+ const_offset += nir_intrinsic_base(instr);