From 5710250074a066288c0ed44fd796baa0c4d0f42b Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Tue, 17 Dec 2019 19:05:35 -0500
Subject: [PATCH] pan/midgard: Add uniform/work heuristic
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Uniform/work registers are partitioned on a shader-by-shader basis as
determined by the compiler. We add a simple heuristic here running
before scheduling that prioritizes mitigating spilling at all costs.

A more sophisticated heuristic should run *after* scheduling, doing a
dry run of the register allocator itself to determine spilling. Fitting
this into our current scheduling model is difficult, so while this
heuristic does hurt some shaders, overall the results are acceptable:

total instructions in shared programs: 50065 -> 38747 (-22.61%)
instructions in affected programs: 37187 -> 25869 (-30.44%)
helped: 59
HURT: 77
helped stats (abs) min: 1 max: 757 x̄: 198.46 x̃: 151
helped stats (rel) min: 0.48% max: 62.89% x̄: 32.95% x̃: 36.27%
HURT stats (abs)   min: 1 max: 9 x̄: 5.08 x̃: 6
HURT stats (rel)   min: 0.92% max: 14.29% x̄: 6.71% x̃: 4.60%
95% mean confidence interval for instructions value: -111.15 -55.29
95% mean confidence interval for instructions %-change: -14.33% -6.67%
Instructions are helped.

total bundles in shared programs: 30606 -> 19157 (-37.41%)
bundles in affected programs: 23907 -> 12458 (-47.89%)
helped: 58
HURT: 74
helped stats (abs) min: 6 max: 757 x̄: 203.09 x̃: 152
helped stats (rel) min: 5.19% max: 77.00% x̄: 49.38% x̃: 53.79%
HURT stats (abs)   min: 1 max: 9 x̄: 4.46 x̃: 5
HURT stats (rel)   min: 1.85% max: 26.32% x̄: 11.70% x̃: 9.57%
95% mean confidence interval for bundles value: -115.46 -58.01
95% mean confidence interval for bundles %-change: -20.87% -9.41%
Bundles are helped.

total quadwords in shared programs: 31305 -> 32027 (2.31%)
quadwords in affected programs: 20471 -> 21193 (3.53%)
helped: 0
HURT: 133
HURT stats (abs)   min: 1 max: 9 x̄: 5.43 x̃: 5
HURT stats (rel)   min: 0.76% max: 15.15% x̄: 5.47% x̃: 4.65%
95% mean confidence interval for quadwords value: 5.00 5.86
95% mean confidence interval for quadwords %-change: 4.85% 6.08%
Quadwords are HURT.

total registers in shared programs: 2256 -> 2545 (12.81%)
registers in affected programs: 708 -> 997 (40.82%)
helped: 0
HURT: 95
HURT stats (abs)   min: 1 max: 8 x̄: 3.04 x̃: 3
HURT stats (rel)   min: 12.50% max: 100.00% x̄: 39.41% x̃: 37.50%
95% mean confidence interval for registers value: 2.64 3.45
95% mean confidence interval for registers %-change: 34.62% 44.19%
Registers are HURT.

total threads in shared programs: 1776 -> 1709 (-3.77%)
threads in affected programs: 134 -> 67 (-50.00%)
helped: 0
HURT: 67
HURT stats (abs)   min: 1 max: 1 x̄: 1.00 x̃: 1
HURT stats (rel)   min: 50.00% max: 50.00% x̄: 50.00% x̃: 50.00%
95% mean confidence interval for threads value: -1.00 -1.00
95% mean confidence interval for threads %-change: -50.00% -50.00%
Threads are HURT.
total spills in shared programs: 3868 -> 2 (-99.95%)
spills in affected programs: 3868 -> 2 (-99.95%)
helped: 60
HURT: 0

total fills in shared programs: 6456 -> 4 (-99.94%)
fills in affected programs: 6456 -> 4 (-99.94%)
helped: 60
HURT: 0

Signed-off-by: Alyssa Rosenzweig
Tested-by: Marge Bot
Part-of:
---
 src/panfrost/midgard/compiler.h             |   4 +-
 src/panfrost/midgard/midgard_schedule.c     |   2 +-
 src/panfrost/midgard/mir_promote_uniforms.c | 113 +++++++++++++++++---
 3 files changed, 100 insertions(+), 19 deletions(-)

diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h
index 45a0617a840..acc1a22a866 100644
--- a/src/panfrost/midgard/compiler.h
+++ b/src/panfrost/midgard/compiler.h
@@ -655,9 +655,7 @@ void mir_invalidate_liveness(compiler_context *ctx);
 bool mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src);
 
 void mir_create_pipeline_registers(compiler_context *ctx);
-
-void
-midgard_promote_uniforms(compiler_context *ctx, unsigned promoted_count);
+void midgard_promote_uniforms(compiler_context *ctx);
 
 midgard_instruction *
 emit_ubo_read(
diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c
index 157595db501..9ee1f3b77c8 100644
--- a/src/panfrost/midgard/midgard_schedule.c
+++ b/src/panfrost/midgard/midgard_schedule.c
@@ -1141,7 +1141,7 @@ schedule_block(compiler_context *ctx, midgard_block *block)
 void
 schedule_program(compiler_context *ctx)
 {
-        midgard_promote_uniforms(ctx, 16);
+        midgard_promote_uniforms(ctx);
 
         /* Must be lowered right before scheduling */
         mir_squeeze_index(ctx);
diff --git a/src/panfrost/midgard/mir_promote_uniforms.c b/src/panfrost/midgard/mir_promote_uniforms.c
index d7b3cce36d2..12f860787c1 100644
--- a/src/panfrost/midgard/mir_promote_uniforms.c
+++ b/src/panfrost/midgard/mir_promote_uniforms.c
@@ -26,35 +26,118 @@
 #include "compiler.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 
 /* This pass promotes reads from uniforms from load/store ops to uniform
  * registers if it is beneficial to do so. Normally, this saves both
  * instructions and total register pressure, but it does take a toll on the
  * number of work registers that are available, so this is a balance.
  *
- * To cope, we take as an argument the maximum work register pressure in the
- * program so we allow that many registers through at minimum, to prevent
- * spilling. If we spill anyway, I mean, it's a lose-lose at that point. */
+ * We use a heuristic to determine the ideal count, implemented by
+ * mir_work_heuristic, which returns the ideal number of work registers.
+ */
+
+static bool
+mir_is_promoteable_ubo(midgard_instruction *ins)
+{
+        /* TODO: promote unaligned access via swizzle? */
+
+        return (ins->type == TAG_LOAD_STORE_4) &&
+                (OP_IS_UBO_READ(ins->load_store.op)) &&
+                !(ins->constants[0] & 0xF) &&
+                !(ins->load_store.arg_1) &&
+                (ins->load_store.arg_2 == 0x1E) &&
+                ((ins->constants[0] / 16) < 16);
+}
+
+static unsigned
+mir_promoteable_uniform_count(compiler_context *ctx)
+{
+        unsigned count = 0;
+
+        mir_foreach_instr_global(ctx, ins) {
+                if (mir_is_promoteable_ubo(ins))
+                        count = MAX2(count, ins->constants[0] / 16);
+        }
+
+        return count;
+}
+
+static unsigned
+mir_count_live(uint16_t *live, unsigned temp_count)
+{
+        unsigned count = 0;
+
+        for (unsigned i = 0; i < temp_count; ++i)
+                count += util_bitcount(live[i]);
+
+        return count;
+}
+
+static unsigned
+mir_estimate_pressure(compiler_context *ctx)
+{
+        mir_invalidate_liveness(ctx);
+        mir_compute_liveness(ctx);
+
+        unsigned max_live = 0;
+
+        mir_foreach_block(ctx, block) {
+                uint16_t *live = mem_dup(block->live_out, ctx->temp_count * sizeof(uint16_t));
+
+                mir_foreach_instr_in_block_rev(block, ins) {
+                        unsigned count = mir_count_live(live, ctx->temp_count);
+                        max_live = MAX2(max_live, count);
+                        mir_liveness_ins_update(live, ins, ctx->temp_count);
+                }
+
+                free(live);
+        }
+
+        return DIV_ROUND_UP(max_live, 16);
+}
+
+static unsigned
+mir_work_heuristic(compiler_context *ctx)
+{
+        unsigned uniform_count = mir_promoteable_uniform_count(ctx);
+
+        /* If there are 8 or fewer uniforms, it doesn't matter what we do, so
+         * allow as many work registers as needed */
+
+        if (uniform_count <= 8)
+                return 16;
+
+        /* Otherwise, estimate the register pressure */
+
+        unsigned pressure = mir_estimate_pressure(ctx);
+
+        /* Prioritize not spilling above all else. The relation between the
+         * pressure estimate and the actual register pressure is a little
+         * murkier than we might like (due to scheduling, pipeline registers,
+         * failure to pack vector registers, load/store registers, texture
+         * registers...), hence why this is a heuristic parameter */
+
+        if (pressure > 6)
+                return 16;
+
+        /* If there's no chance of spilling, prioritize UBOs and thread count */
+
+        return 8;
+}
 
 void
-midgard_promote_uniforms(compiler_context *ctx, unsigned promoted_count)
+midgard_promote_uniforms(compiler_context *ctx)
 {
+        unsigned work_count = mir_work_heuristic(ctx);
+        unsigned promoted_count = 24 - work_count;
+
         mir_foreach_instr_global_safe(ctx, ins) {
-                if (ins->type != TAG_LOAD_STORE_4) continue;
-                if (!OP_IS_UBO_READ(ins->load_store.op)) continue;
+                if (!mir_is_promoteable_ubo(ins)) continue;
 
-                /* TODO: promote unaligned access via swizzle? */
                 unsigned off = ins->constants[0];
-                if (off & 0xF) continue;
-
                 unsigned address = off / 16;
 
-                /* Check this is UBO 0 */
-                if (ins->load_store.arg_1) continue;
-
-                /* Check we're accessing directly */
-                if (ins->load_store.arg_2 != 0x1E) continue;
-
                 /* Check if it's a promotable range */
                 unsigned uniform_reg = 23 - address;
-- 
2.30.2
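
For readers who want the budget arithmetic in isolation: the commit message
notes that uniform/work registers are partitioned per shader, and the patch
treats the pool as 24 registers, with promoted uniforms packed downward from
r23 (uniform_reg = 23 - address) and the remainder left as work registers.
The standalone C sketch below is not part of the patch; example_work_count is
a hypothetical stand-in for mir_work_heuristic, and the sample values in main
are illustrative only. The thresholds (8 or fewer promoteable uniforms, an
estimated pressure above 6) mirror the patch.

#include <stdio.h>

/* Pick how many of the 24 registers stay as work registers; the rest
 * (24 - work_count) become slots for promoted uniforms. Thresholds
 * mirror the patch: few promoteable uniforms or high estimated pressure
 * keeps the full 16 work registers. */
static unsigned
example_work_count(unsigned promoteable_uniforms, unsigned pressure_estimate)
{
        if (promoteable_uniforms <= 8)
                return 16;   /* promotion fits in the top registers anyway */

        if (pressure_estimate > 6)
                return 16;   /* high pressure: avoid spilling at all costs */

        return 8;            /* low pressure: trade work regs for uniforms */
}

int
main(void)
{
        unsigned work_count = example_work_count(12, 4);
        unsigned promoted_count = 24 - work_count;  /* uniform slots available */
        unsigned address = 3;                       /* vec4 index within UBO 0 */
        unsigned uniform_reg = 23 - address;        /* register it would occupy */

        printf("work=%u promoted=%u uniform[%u] -> r%u\n",
               work_count, promoted_count, address, uniform_reg);
        return 0;
}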