From 99e9809cab8aacb326b35b3bf50c6cd33edab6b8 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Wed, 7 Aug 2019 08:20:35 +0200
Subject: [PATCH] v3d: honor the write mask on store operations

v2:
 - Fix incremental update of the const offset when we need to emit a
   sequence with more than one write because of the writemask.
 - Do not move the tmu write emission to a separate helper.

v3:
 - Get the store writemask before the loop, use ffs to get the first
   component to write and clear writemask bits as we process the
   components (Eric).
 - Simplified the code that figured out the number of components for
   the TMU config based on the number of tmu writes for stores and
   atomics.

v4:
 - Code clean-ups (Eric).

Fixes:
KHR-GLES31.core.shader_image_load_store.advanced-cast-cs
KHR-GLES31.core.shader_image_load_store.advanced-cast-fs
KHR-GLES31.core.shader_storage_buffer_object.advanced-switchBuffers-cs
KHR-GLES31.core.shader_storage_buffer_object.advanced-switchPrograms-cs
KHR-GLES31.core.shader_storage_buffer_object.basic-operations-case1-cs

Reviewed-by: Eric Anholt
---
 src/broadcom/compiler/nir_to_vir.c | 205 +++++++++++++++++------------
 1 file changed, 120 insertions(+), 85 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 3857f96a9bd..92252b03218 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -229,19 +229,19 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
         if (!dynamic_src)
                 const_offset = nir_src_as_uint(instr->src[offset_src]);
 
-        struct qreg offset;
+        struct qreg base_offset;
         if (instr->intrinsic == nir_intrinsic_load_uniform) {
                 const_offset += nir_intrinsic_base(instr);
-                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
-                                     v3d_unit_data_create(0, const_offset));
+                base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+                                          v3d_unit_data_create(0, const_offset));
                 const_offset = 0;
         } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
                 uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
                 /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
                  * 1 (0 is gallium's constant buffer 0).
                  */
-                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
-                                     v3d_unit_data_create(index, const_offset));
+                base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+                                          v3d_unit_data_create(index, const_offset));
                 const_offset = 0;
         } else if (is_shared_or_scratch) {
                 /* Shared and scratch variables have no buffer index, and all
@@ -250,109 +250,144 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                  */
                 if (instr->intrinsic == nir_intrinsic_load_scratch ||
                     instr->intrinsic == nir_intrinsic_store_scratch) {
-                        offset = c->spill_base;
+                        base_offset = c->spill_base;
                 } else {
-                        offset = c->cs_shared_offset;
+                        base_offset = c->cs_shared_offset;
                         const_offset += nir_intrinsic_base(instr);
                 }
         } else {
-                offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
-                                     nir_src_as_uint(instr->src[is_store ?
-                                                                1 : 0]));
+                base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
+                                          nir_src_as_uint(instr->src[is_store ?
+                                                                     1 : 0]));
         }
 
-        int tmu_writes = 1; /* address */
-        if (is_store) {
-                for (int i = 0; i < instr->num_components; i++) {
-                        vir_MOV_dest(c,
-                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                                     ntq_get_src(c, instr->src[0], i));
-                        tmu_writes++;
-                }
-        } else if (!is_load && !atomic_add_replaced) {
-                vir_MOV_dest(c,
-                             vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                             ntq_get_src(c, instr->src[1 + has_index], 0));
-                tmu_writes++;
-                if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
+        unsigned writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
+        uint32_t base_const_offset = const_offset;
+        int first_component = -1;
+        int last_component = -1;
+        do {
+                int tmu_writes = 1; /* address */
+
+                if (is_store) {
+                        /* Find the first set of consecutive components that
+                         * are enabled in the writemask and emit the TMUD
+                         * instructions for them.
+                         */
+                        first_component = ffs(writemask) - 1;
+                        last_component = first_component;
+                        while (writemask & BITFIELD_BIT(last_component + 1))
+                                last_component++;
+
+                        assert(first_component >= 0 &&
+                               first_component <= last_component &&
+                               last_component < instr->num_components);
+
+                        struct qreg tmud = vir_reg(QFILE_MAGIC,
+                                                   V3D_QPU_WADDR_TMUD);
+                        for (int i = first_component; i <= last_component; i++) {
+                                struct qreg data =
+                                        ntq_get_src(c, instr->src[0], i);
+                                vir_MOV_dest(c, tmud, data);
+                                tmu_writes++;
+                        }
+
+                        /* Update the offset for the TMU write based on the
+                         * the first component we are writing.
+                         */
+                        const_offset = base_const_offset + first_component * 4;
+
+                        /* Clear these components from the writemask */
+                        uint32_t written_mask =
+                                BITFIELD_RANGE(first_component, tmu_writes - 1);
+                        writemask &= ~written_mask;
+                } else if (!is_load && !atomic_add_replaced) {
                         vir_MOV_dest(c,
                                      vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                                     ntq_get_src(c, instr->src[2 + has_index],
-                                                 0));
+                                     ntq_get_src(c, instr->src[1 + has_index], 0));
                         tmu_writes++;
+                        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
+                                vir_MOV_dest(c,
+                                             vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+                                             ntq_get_src(c, instr->src[2 + has_index],
+                                                         0));
+                                tmu_writes++;
+                        }
                 }
-        }
-
-        /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
-         * storing at the same time.
-         */
-        while (tmu_writes > 16 / c->threads)
-                c->threads /= 2;
 
-        /* The spec says that for atomics, the TYPE field is ignored, but that
-         * doesn't seem to be the case for CMPXCHG. Just use the number of
-         * tmud writes we did to decide the type (or choose "32bit" for atomic
-         * reads, which has been fine).
-         */
-        int num_components;
-        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH)
-                num_components = 2;
-        else
-                num_components = instr->num_components;
-
-        uint32_t config = (0xffffff00 |
-                           tmu_op << 3|
-                           GENERAL_TMU_LOOKUP_PER_PIXEL);
-        if (num_components == 1) {
-                config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
-        } else {
-                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2;
-        }
+                /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
+                 * storing at the same time.
+                 */
+                while (tmu_writes > 16 / c->threads)
+                        c->threads /= 2;
 
-        if (vir_in_nonuniform_control_flow(c)) {
-                vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
-                           V3D_QPU_PF_PUSHZ);
-        }
+                /* The spec says that for atomics, the TYPE field is ignored, but that
+                 * doesn't seem to be the case for CMPXCHG. Just use the number of
+                 * tmud writes we did to decide the type (or choose "32bit" for atomic
+                 * reads, which has been fine).
+                 */
+                uint32_t num_components;
+                if (is_load || atomic_add_replaced) {
+                        num_components = instr->num_components;
+                } else {
+                        assert(tmu_writes > 1);
+                        num_components = tmu_writes - 1;
+                }
 
-        struct qreg tmua;
-        if (config == ~0)
-                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
-        else
-                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+                uint32_t config = (0xffffff00 |
+                                   tmu_op << 3|
+                                   GENERAL_TMU_LOOKUP_PER_PIXEL);
+                if (num_components == 1) {
+                        config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+                } else {
+                        config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2;
+                }
 
-        struct qinst *tmu;
-        if (dynamic_src) {
-                if (const_offset != 0) {
-                        offset = vir_ADD(c, offset,
-                                         vir_uniform_ui(c, const_offset));
+                if (vir_in_nonuniform_control_flow(c)) {
+                        vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+                                   V3D_QPU_PF_PUSHZ);
                 }
-                tmu = vir_ADD_dest(c, tmua, offset,
-                                   ntq_get_src(c, instr->src[offset_src], 0));
-        } else {
-                if (const_offset != 0) {
+
+                struct qreg tmua;
+                if (config == ~0)
+                        tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+                else
+                        tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+
+                struct qinst *tmu;
+                if (dynamic_src) {
+                        struct qreg offset = base_offset;
+                        if (const_offset != 0) {
+                                offset = vir_ADD(c, offset,
+                                                 vir_uniform_ui(c, const_offset));
+                        }
                         tmu = vir_ADD_dest(c, tmua, offset,
-                                           vir_uniform_ui(c, const_offset));
+                                           ntq_get_src(c, instr->src[offset_src], 0));
                 } else {
-                        tmu = vir_MOV_dest(c, tmua, offset);
+                        if (const_offset != 0) {
+                                tmu = vir_ADD_dest(c, tmua, base_offset,
+                                                   vir_uniform_ui(c, const_offset));
+                        } else {
+                                tmu = vir_MOV_dest(c, tmua, base_offset);
+                        }
                 }
-        }
 
-        if (config != ~0) {
-                tmu->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
-                                                     config);
-        }
+                if (config != ~0) {
+                        tmu->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+                                                             config);
+                }
 
-        if (vir_in_nonuniform_control_flow(c))
-                vir_set_cond(tmu, V3D_QPU_COND_IFA);
+                if (vir_in_nonuniform_control_flow(c))
+                        vir_set_cond(tmu, V3D_QPU_COND_IFA);
 
-        vir_emit_thrsw(c);
+                vir_emit_thrsw(c);
 
-        /* Read the result, or wait for the TMU op to complete. */
-        for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
-                ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
+                /* Read the result, or wait for the TMU op to complete. */
+                for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
+                        ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
 
-        if (nir_intrinsic_dest_components(instr) == 0)
-                vir_TMUWT(c);
+                if (nir_intrinsic_dest_components(instr) == 0)
+                        vir_TMUWT(c);
+        } while (is_store && writemask != 0);
 }
 
 static struct qreg *
-- 
2.30.2