From: Alejandro Piñeiro
Date: Wed, 19 Jun 2019 11:17:41 +0000 (+0200)
Subject: v3d: use inc/dec tmu operation with atomic sub/add of 1
X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=commitdiff_plain;h=934ce48db81a1d1745919ddb8aaea9f7b638414b

v3d: use inc/dec tmu operation with atomic sub/add of 1

Among other things, this avoids the need to load 1/-1 constants (so one
less operation).

The removed comment suggested the option of adding support for inc/dec
to NIR. Intel just uses an auxiliary method to get the hw operation that
is needed, so no lowering is required, and since that method is so
small, it seems unreasonable to try to add a general one to NIR itself.
It is easier to just adapt the method here, which is what this patch
does.
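For illustration, here is a made-up GLSL ES 3.1 fragment shader (not
taken from the CTS or from shader-db) showing the kind of pattern that
now hits the new path; with a constant +1/-1 amount the compiler can
pick the inc/dec TMU operation and no longer needs to load the amount
to add/sub:

    #version 310 es
    precision highp float;

    layout(std430, binding = 0) buffer Counters {
        uint u_counter;
        int  s_counter;
    };

    layout(location = 0) out vec4 color;

    void main()
    {
        /* Constant +1: can be emitted as the TMU inc operation
         * (V3D_TMU_OP_WRITE_AND_READ_INC).
         */
        uint prev = atomicAdd(u_counter, 1u);

        /* Constant -1: can be emitted as the TMU dec operation
         * (V3D_TMU_OP_WRITE_OR_READ_DEC).
         */
        atomicAdd(s_counter, -1);

        color = vec4(float(prev));
    }

Any other constant, or a non-constant amount, keeps using
V3D_TMU_OP_WRITE_ADD_READ_PREFETCH as before.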
It is worth noting that we are not getting any change in shader-db stats
because, in the usual shader-db set, those methods are only used by
shaders needing GLSL > 4.2, and in general there aren't too many GLSL ES
3.1 tests there. As an alternative, we captured the GLES3/GLSL31/GLSL32
shaders used in vk-gl-cts, even if that is not real-life shader usage.
With those we get the following:

total instructions in shared programs: 1217022 -> 1217013 (<.01%)
instructions in affected programs: 117 -> 108 (-7.69%)
helped: 6
HURT: 0
helped stats (abs) min: 1 max: 2 x̄: 1.50 x̃: 1
helped stats (rel) min: 3.57% max: 10.00% x̄: 8.09% x̃: 9.09%
95% mean confidence interval for instructions value: -2.07 -0.93
95% mean confidence interval for instructions %-change: -10.54% -5.64%
Instructions are helped.

Note that the number of helped shaders is really low because most of the
vk-gl-cts tests using AtomicInc/Dec/Add are compute shaders. Although
there is currently a branch around with CS support, the usual practice
is to gather the stats against master.

Reviewed-by: Eric Anholt
---

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 53af9be9f74..45e34c70a49 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -115,6 +115,20 @@ vir_emit_thrsw(struct v3d_compile *c)
         c->lock_scoreboard_on_first_thrsw = true;
 }
 
+uint32_t
+v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
+{
+        if (nir_src_is_const(instr->src[src])) {
+                int64_t add_val = nir_src_as_int(instr->src[src]);
+                if (add_val == 1)
+                        return V3D_TMU_OP_WRITE_AND_READ_INC;
+                else if (add_val == -1)
+                        return V3D_TMU_OP_WRITE_OR_READ_DEC;
+        }
+
+        return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
+}
+
 static uint32_t
 v3d_general_tmu_op(nir_intrinsic_instr *instr)
 {
@@ -129,8 +143,9 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
         case nir_intrinsic_store_scratch:
                 return V3D_TMU_OP_REGULAR;
         case nir_intrinsic_ssbo_atomic_add:
+                return v3d_get_op_for_atomic_add(instr, 2);
         case nir_intrinsic_shared_atomic_add:
-                return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
+                return v3d_get_op_for_atomic_add(instr, 1);
         case nir_intrinsic_ssbo_atomic_imin:
         case nir_intrinsic_shared_atomic_imin:
                 return V3D_TMU_OP_WRITE_SMIN;
@@ -171,11 +186,16 @@ static void
 ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                      bool is_shared_or_scratch)
 {
-        /* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR
-         * wants to have support for inc/dec?
-         */
         uint32_t tmu_op = v3d_general_tmu_op(instr);
+
+        /* If we were able to replace atomic_add for an inc/dec, then we
+         * need/can to do things slightly different, like not loading the
+         * amount to add/sub, as that is implicit.
+         */
+        bool atomic_add_replaced = ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
+                                     instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
+                                    (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
+                                     tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
         bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                          instr->intrinsic == nir_intrinsic_store_scratch ||
                          instr->intrinsic == nir_intrinsic_store_shared);
@@ -188,7 +208,8 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
         } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
                    instr->intrinsic == nir_intrinsic_load_ubo ||
                    instr->intrinsic == nir_intrinsic_load_scratch ||
-                   instr->intrinsic == nir_intrinsic_load_shared) {
+                   instr->intrinsic == nir_intrinsic_load_shared ||
+                   atomic_add_replaced) {
                 offset_src = 0 + has_index;
         } else if (is_store) {
                 offset_src = 1 + has_index;
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 44840a563bb..da32d47a28d 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -826,6 +826,9 @@ bool vir_init_reg_sets(struct v3d_compiler *compiler);
 
 bool v3d_gl_format_is_return_32(GLenum format);
 
+uint32_t
+v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src);
+
 static inline bool
 quniform_contents_is_texture_p0(enum quniform_contents contents)
 {