From 991c4261604b136cac0770c7d6c7345ea134129c Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason@jlekstrand.net>
Date: Sat, 14 Dec 2019 10:44:39 -0600
Subject: [PATCH] intel/nir: Enable load/store vectorization
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This commit enables the I/O vectorization pass that was originally
written for ACO for Intel drivers.  We enable it for UBOs, SSBOs, global
memory, and SLM.  We only enable vectorization for the scalar back-end
because the vec4 back-end makes certain alignment assumptions.

Shader-db results with iris on ICL:

    total instructions in shared programs: 16077927 -> 16068236 (-0.06%)
    instructions in affected programs: 199839 -> 190148 (-4.85%)
    helped: 324
    HURT: 0
    helped stats (abs) min: 2 max: 458 x̄: 29.91 x̃: 4
    helped stats (rel) min: 0.11% max: 38.94% x̄: 4.32% x̃: 1.64%
    95% mean confidence interval for instructions value: -37.02 -22.80
    95% mean confidence interval for instructions %-change: -5.07% -3.58%
    Instructions are helped.

    total cycles in shared programs: 336806135 -> 336151501 (-0.19%)
    cycles in affected programs: 16009735 -> 15355101 (-4.09%)
    helped: 458
    HURT: 154
    helped stats (abs) min: 1 max: 77812 x̄: 1542.50 x̃: 75
    helped stats (rel) min: <.01% max: 34.46% x̄: 5.16% x̃: 2.01%
    HURT stats (abs)   min: 1 max: 22800 x̄: 336.55 x̃: 20
    HURT stats (rel)   min: <.01% max: 17.11% x̄: 2.12% x̃: 1.00%
    95% mean confidence interval for cycles value: -1596.83 -542.49
    95% mean confidence interval for cycles %-change: -3.83% -2.82%
    Cycles are helped.

    total sends in shared programs: 814177 -> 809049 (-0.63%)
    sends in affected programs: 15422 -> 10294 (-33.25%)
    helped: 324
    HURT: 0
    helped stats (abs) min: 1 max: 256 x̄: 15.83 x̃: 2
    helped stats (rel) min: 1.33% max: 67.90% x̄: 21.21% x̃: 15.38%
    95% mean confidence interval for sends value: -19.67 -11.98
    95% mean confidence interval for sends %-change: -23.03% -19.39%
    Sends are helped.

    LOST:   7
    GAINED: 2

Most of the helped shaders were in the following titles:

 - Doom
 - Deus Ex: Mankind Divided
 - Aztec Ruins
 - Shadow of Mordor
 - DiRT Showdown
 - Tomb Raider (Rise, I think)

Five of the lost programs are SIMD16 shaders we lost from DiRT Showdown.
The other two are compute shaders in Aztec Ruins which switched from
SIMD8 to SIMD16.

Vulkan pipeline-db stats on ICL:

    Instructions in all programs: 296780486 -> 293493363 (-1.1%)
    Loops in all programs: 149669 -> 149669 (+0.0%)
    Cycles in all programs: 90999206722 -> 88513844563 (-2.7%)
    Spills in all programs: 1710217 -> 1730691 (+1.2%)
    Fills in all programs: 1931235 -> 1958138 (+1.4%)

By far the most help was in the Tomb Raider games.  A couple of Batman
games with DXVK were also helped.  In Shadow of the Tomb Raider:

    Instructions in all programs: 41614336 -> 39408023 (-5.3%)
    Loops in all programs: 32200 -> 32200 (+0.0%)
    Cycles in all programs: 1875498485 -> 1667034831 (-11.1%)
    Spills in all programs: 196307 -> 214945 (+9.5%)
    Fills in all programs: 282736 -> 307113 (+8.6%)

Benchmarks of real games I've done on this patch:

 - Rise of the Tomb Raider: +3%
 - Shadow of the Tomb Raider: +10%

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4367>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4367>
---
 src/intel/compiler/brw_nir.c | 66 ++++++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 11 deletions(-)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 8a6cc8fe696..02a6f0cba9f 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -824,6 +824,31 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
    }
 }
 
+static bool
+brw_nir_should_vectorize_mem(unsigned align, unsigned bit_size,
+                             unsigned num_components, unsigned high_offset,
+                             nir_intrinsic_instr *low,
+                             nir_intrinsic_instr *high)
+{
+   /* Don't combine things to generate 64-bit loads/stores.  We have to split
+    * those back into 32-bit ones anyway and UBO loads aren't split in NIR so
+    * we don't want to make a mess for the back-end.
+    */
+   if (bit_size > 32)
+      return false;
+
+   /* We can handle at most a vec4 right now.  Anything bigger would get
+    * immediately split by brw_nir_lower_mem_access_bit_sizes anyway.
+    */
+   if (num_components > 4)
+      return false;
+
+   if (align < bit_size / 8)
+      return false;
+
+   return true;
+}
+
 static
 bool combine_all_barriers(nir_intrinsic_instr *a,
                           nir_intrinsic_instr *b,
@@ -844,6 +869,35 @@ bool combine_all_barriers(nir_intrinsic_instr *a,
    return true;
 }
 
+static void
+brw_vectorize_lower_mem_access(nir_shader *nir,
+                               const struct brw_compiler *compiler,
+                               bool is_scalar)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   bool progress = false;
+
+   if (is_scalar) {
+      OPT(nir_opt_load_store_vectorize,
+          nir_var_mem_ubo | nir_var_mem_ssbo |
+          nir_var_mem_global | nir_var_mem_shared,
+          brw_nir_should_vectorize_mem);
+   }
+
+   OPT(brw_nir_lower_mem_access_bit_sizes, devinfo);
+
+   while (progress) {
+      progress = false;
+
+      OPT(nir_lower_pack);
+      OPT(nir_copy_prop);
+      OPT(nir_opt_dce);
+      OPT(nir_opt_cse);
+      OPT(nir_opt_algebraic);
+      OPT(nir_opt_constant_folding);
+   }
+}
+
 /* Prepare the given shader for codegen
  *
  * This function is intended to be called right before going into the actual
@@ -870,17 +924,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
 
    brw_nir_optimize(nir, compiler, is_scalar, false);
 
-   if (OPT(brw_nir_lower_mem_access_bit_sizes, devinfo)) {
-      do {
-         progress = false;
-         OPT(nir_lower_pack);
-         OPT(nir_copy_prop);
-         OPT(nir_opt_dce);
-         OPT(nir_opt_cse);
-         OPT(nir_opt_algebraic);
-         OPT(nir_opt_constant_folding);
-      } while (progress);
-   }
+   brw_vectorize_lower_mem_access(nir, compiler, is_scalar);
 
    if (OPT(nir_lower_int64, nir->options->lower_int64_options))
       brw_nir_optimize(nir, compiler, is_scalar, false);
-- 
2.30.2