From 10d0f39beb20c4cd6fe6d3f23a3b1d918653127a Mon Sep 17 00:00:00 2001
From: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Date: Tue, 19 May 2020 14:37:44 -0700
Subject: [PATCH] intel/fs: Remove min_dispatch_width spilling decision from RA

Move the decision one level up, let brw_compile_*() functions use the
spilling information to decide whether or not a certain width
compilation can spill (passed via run_*() functions).

The min_dispatch_width was used to compare with the dispatch_width and
decide whether "a previous shader is already available, so don't
accept spill".

This is replaced by:

- Not calling run_*() functions if it is know beforehand a smaller width
  already spilled -- since the larger width will spill and fail;

- Explicitly passing whether or not a shader is allowed to spill.  For
  the cases where the smaller width is available and haven't spilled,
  the larger width will be compiled but is only useful if it won't
  spill.

Moving the decision to this level will be useful later for variable
group size, which is a case where we want all the widths to be allowed
to spill.

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5142>
---
 src/intel/compiler/brw_fs.cpp | 64 +++++++++++++++--------------------
 src/intel/compiler/brw_fs.h   |  4 +--
 2 files changed, 30 insertions(+), 38 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 0ff32148e0c..83702ef78b5 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -7856,7 +7856,7 @@ fs_visitor::fixup_nomask_control_flow()
 }
 
 void
-fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
+fs_visitor::allocate_registers(bool allow_spilling)
 {
    bool allocated;
 
@@ -7905,13 +7905,8 @@ fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
             fixup_3src_null_dest();
       }
 
-
-      /* We only allow spilling for the last schedule mode and only if the
-       * allow_spilling parameter and dispatch width work out ok.
-       */
       bool can_spill = allow_spilling &&
-                       (i == ARRAY_SIZE(pre_modes) - 1) &&
-                       dispatch_width == min_dispatch_width;
+                       (i == ARRAY_SIZE(pre_modes) - 1);
 
       /* We should only spill registers on the last scheduling. */
       assert(!spilled_any_registers);
@@ -7922,20 +7917,8 @@ fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
    }
 
    if (!allocated) {
-      if (!allow_spilling)
-         fail("Failure to register allocate and spilling is not allowed.");
-
-      /* We assume that any spilling is worse than just dropping back to
-       * SIMD8.  There's probably actually some intermediate point where
-       * SIMD16 with a couple of spills is still better.
-       */
-      if (dispatch_width > min_dispatch_width) {
-         fail("Failure to register allocate.  Reduce number of "
-              "live scalar values to avoid this.");
-      }
-
-      /* If we failed to allocate, we must have a reason */
-      assert(failed);
+      fail("Failure to register allocate.  Reduce number of "
+           "live scalar values to avoid this.");
    } else if (spilled_any_registers) {
       compiler->shader_perf_log(log_data,
                                 "%s shader triggered register spilling.  "
@@ -8024,7 +8007,7 @@ fs_visitor::run_vs()
    assign_vs_urb_setup();
 
    fixup_3src_null_dest();
-   allocate_registers(8, true);
+   allocate_registers(true /* allow_spilling */);
 
    return !failed;
 }
@@ -8145,7 +8128,7 @@ fs_visitor::run_tcs()
    assign_tcs_urb_setup();
 
    fixup_3src_null_dest();
-   allocate_registers(8, true);
+   allocate_registers(true /* allow_spilling */);
 
    return !failed;
 }
@@ -8179,7 +8162,7 @@ fs_visitor::run_tes()
    assign_tes_urb_setup();
 
    fixup_3src_null_dest();
-   allocate_registers(8, true);
+   allocate_registers(true /* allow_spilling */);
 
    return !failed;
 }
@@ -8228,7 +8211,7 @@ fs_visitor::run_gs()
    assign_gs_urb_setup();
 
    fixup_3src_null_dest();
-   allocate_registers(8, true);
+   allocate_registers(true /* allow_spilling */);
 
    return !failed;
 }
@@ -8337,7 +8320,8 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
       assign_urb_setup();
 
       fixup_3src_null_dest();
-      allocate_registers(8, allow_spilling);
+
+      allocate_registers(allow_spilling);
 
       if (failed)
          return false;
@@ -8347,10 +8331,9 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
 }
 
 bool
-fs_visitor::run_cs(unsigned min_dispatch_width)
+fs_visitor::run_cs(bool allow_spilling)
 {
    assert(stage == MESA_SHADER_COMPUTE);
-   assert(dispatch_width >= min_dispatch_width);
 
    setup_cs_payload();
 
@@ -8381,7 +8364,7 @@ fs_visitor::run_cs(unsigned min_dispatch_width)
    assign_curb_setup();
 
    fixup_3src_null_dest();
-   allocate_registers(min_dispatch_width, true);
+   allocate_registers(allow_spilling);
 
    if (failed)
       return false;
@@ -8710,6 +8693,7 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
    fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
    cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
    float throughput = 0;
+   bool has_spilled = false;
 
    v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                        &prog_data->base, shader, 8, shader_time_index8);
@@ -8725,6 +8709,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
       prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used);
       const performance &perf = v8->performance_analysis.require();
       throughput = MAX2(throughput, perf.throughput);
+      has_spilled = v8->spilled_any_registers;
+      allow_spilling = false;
    }
 
    /* Limit dispatch width to simd8 with dual source blending on gen8.
@@ -8737,7 +8723,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                                "using SIMD8 when dual src blending.\n");
    }
 
-   if (v8->max_dispatch_width >= 16 &&
+   if (!has_spilled &&
+       v8->max_dispatch_width >= 16 &&
        likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
       /* Try a SIMD16 compile */
       v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
@@ -8753,11 +8740,14 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
          prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used);
          const performance &perf = v16->performance_analysis.require();
          throughput = MAX2(throughput, perf.throughput);
+         has_spilled = v16->spilled_any_registers;
+         allow_spilling = false;
       }
    }
 
    /* Currently, the compiler only supports SIMD32 on SNB+ */
-   if (v8->max_dispatch_width >= 32 && !use_rep_send &&
+   if (!has_spilled &&
+       v8->max_dispatch_width >= 32 && !use_rep_send &&
        devinfo->gen >= 6 && simd16_cfg &&
        !(INTEL_DEBUG & DEBUG_NO32)) {
       /* Try a SIMD32 compile */
@@ -9085,7 +9075,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
       v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                           &prog_data->base,
                           nir8, 8, shader_time_index);
-      if (!v8->run_cs(min_dispatch_width)) {
+      if (!v8->run_cs(true /* allow_spilling */)) {
          fail_msg = v8->fail_msg;
       } else {
          /* We should always be able to do SIMD32 for compute shaders */
@@ -9097,7 +9087,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
       }
    }
 
-   if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
+   if ((!v || !v->spilled_any_registers) &&
+       likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
        !fail_msg && min_dispatch_width <= 16 && max_dispatch_width >= 16) {
       /* Try a SIMD16 compile */
       nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
@@ -9108,7 +9099,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
       if (v8)
          v16->import_uniforms(v8);
 
-      if (!v16->run_cs(min_dispatch_width)) {
+      if (!v16->run_cs(v == NULL /* allow_spilling */)) {
          compiler->shader_perf_log(log_data,
                                    "SIMD16 shader failed to compile: %s",
                                    v16->fail_msg);
@@ -9127,7 +9118,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
       }
    }
 
-   if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32)) &&
+   if ((!v || !v->spilled_any_registers) &&
+       !fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32)) &&
        max_dispatch_width >= 32) {
       /* Try a SIMD32 compile */
       nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,
@@ -9140,7 +9132,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
       else if (v16)
          v32->import_uniforms(v16);
 
-      if (!v32->run_cs(min_dispatch_width)) {
+      if (!v32->run_cs(v == NULL /* allow_spilling */)) {
          compiler->shader_perf_log(log_data,
                                    "SIMD32 shader failed to compile: %s",
                                    v32->fail_msg);
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index b9783eb2285..5bf5cdeed41 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -125,9 +125,9 @@ public:
    bool run_tcs();
    bool run_tes();
    bool run_gs();
-   bool run_cs(unsigned min_dispatch_width);
+   bool run_cs(bool allow_spilling);
    void optimize();
-   void allocate_registers(unsigned min_dispatch_width, bool allow_spilling);
+   void allocate_registers(bool allow_spilling);
    void setup_fs_payload_gen4();
    void setup_fs_payload_gen6();
    void setup_vs_payload();
-- 
2.30.2