From 3174bc9972fa812989c2bbb4be8d0651024d84f2 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Tue, 16 Jul 2019 14:10:08 -0700
Subject: [PATCH] panfrost: Promote uniform registers late

Rather than creating either a load or a uniform register read with a
fixed beginning offset, we always create a load and then promote it to
a uniform register later. This will allow us to promote in a
register-pressure-aware manner.

Signed-off-by: Alyssa Rosenzweig
---
 src/panfrost/midgard/compiler.h             | 17 ++++
 src/panfrost/midgard/helpers.h              |  6 ++
 src/panfrost/midgard/meson.build            |  1 +
 src/panfrost/midgard/midgard_compile.c      | 97 ++++-----------------
 src/panfrost/midgard/midgard_schedule.c     | 59 +++++++++++++
 src/panfrost/midgard/mir_promote_uniforms.c | 76 ++++++++++++++++
 6 files changed, 174 insertions(+), 82 deletions(-)
 create mode 100644 src/panfrost/midgard/mir_promote_uniforms.c

diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h
index bd7c11efab3..73ec4b56fb3 100644
--- a/src/panfrost/midgard/compiler.h
+++ b/src/panfrost/midgard/compiler.h
@@ -345,6 +345,11 @@ mir_next_op(struct midgard_instruction *ins)
         mir_foreach_block(ctx, v_block) \
                 mir_foreach_instr_in_block(v_block, v)
 
+#define mir_foreach_instr_global_safe(ctx, v) \
+        mir_foreach_block(ctx, v_block) \
+                mir_foreach_instr_in_block_safe(v_block, v)
+
+
 
 static inline midgard_instruction *
 mir_last_in_block(struct midgard_block *block)
@@ -440,6 +445,18 @@
 bool mir_has_multiple_writes(compiler_context *ctx, int src);
 
 void mir_create_pipeline_registers(compiler_context *ctx);
 
+void
+midgard_promote_uniforms(compiler_context *ctx, unsigned pressure);
+
+void
+emit_ubo_read(
+        compiler_context *ctx,
+        unsigned dest,
+        unsigned offset,
+        nir_src *indirect_offset,
+        unsigned index);
+
+
 /* Final emission */
 void emit_binary_bundle(
diff --git a/src/panfrost/midgard/helpers.h b/src/panfrost/midgard/helpers.h
index ef854dc60c1..13b868b1739 100644
--- a/src/panfrost/midgard/helpers.h
+++ b/src/panfrost/midgard/helpers.h
@@ -40,6 +40,12 @@
                 op == midgard_alu_op_imov \
         )
 
+#define OP_IS_UBO_READ(op) ( \
+                op == midgard_op_ld_uniform_32 || \
+                op == midgard_op_ld_uniform_16 || \
+                op == midgard_op_ld_uniform_32i \
+        )
+
 /* ALU control words are single bit fields with a lot of space */
 
 #define ALU_ENAB_VEC_MUL  (1 << 17)
diff --git a/src/panfrost/midgard/meson.build b/src/panfrost/midgard/meson.build
index cbe26004e2d..b467f835032 100644
--- a/src/panfrost/midgard/meson.build
+++ b/src/panfrost/midgard/meson.build
@@ -29,6 +29,7 @@ libpanfrost_midgard_files = files(
   'midgard_ra_pipeline.c',
   'midgard_liveness.c',
   'midgard_ops.c',
+  'mir_promote_uniforms.c',
   'cppwrap.cpp',
   'disassemble.c',
 )
diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c
index 81484078348..37436e133fc 100644
--- a/src/panfrost/midgard/midgard_compile.c
+++ b/src/panfrost/midgard/midgard_compile.c
@@ -1157,7 +1157,7 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
 /* Uniforms and UBOs use a shared code path, as uniforms are just (slightly
  * optimized) versions of UBO #0 */
 
-static void
+void
 emit_ubo_read(
         compiler_context *ctx,
         unsigned dest,
@@ -1167,36 +1167,20 @@ emit_ubo_read(
 {
         /* TODO: half-floats */
 
-        if (!indirect_offset && offset < ctx->uniform_cutoff && index == 0) {
-                /* Fast path: For the first 16 uniforms, direct accesses are
-                 * 0-cycle, since they're just a register fetch in the usual
-                 * case. So, we alias the registers while we're still in
-                 * SSA-space */
-
-                int reg_slot = 23 - offset;
-                alias_ssa(ctx, dest, SSA_FIXED_REGISTER(reg_slot));
-        } else {
-                /* Otherwise, read from the 'special' UBO to access
-                 * higher-indexed uniforms, at a performance cost. More
-                 * generally, we're emitting a UBO read instruction. */
-
-                midgard_instruction ins = m_ld_uniform_32(dest, offset);
-
-                /* TODO: Don't split */
-                ins.load_store.varying_parameters = (offset & 7) << 7;
-                ins.load_store.address = offset >> 3;
+        midgard_instruction ins = m_ld_uniform_32(dest, offset);
 
-                if (indirect_offset) {
-                        emit_indirect_offset(ctx, indirect_offset);
-                        ins.load_store.unknown = 0x8700 | index; /* xxx: what is this? */
-                } else {
-                        ins.load_store.unknown = 0x1E00 | index; /* xxx: what is this? */
-                }
-
-                /* TODO respect index */
+        /* TODO: Don't split */
+        ins.load_store.varying_parameters = (offset & 7) << 7;
+        ins.load_store.address = offset >> 3;
 
-                emit_mir_instruction(ctx, ins);
+        if (indirect_offset) {
+                emit_indirect_offset(ctx, indirect_offset);
+                ins.load_store.unknown = 0x8700 | index; /* xxx: what is this? */
+        } else {
+                ins.load_store.unknown = 0x1E00 | index; /* xxx: what is this? */
         }
+
+        emit_mir_instruction(ctx, ins);
 }
 
 static void
@@ -2228,57 +2212,6 @@ midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block)
         return progress;
 }
 
-/* The following passes reorder MIR instructions to enable better scheduling */
-
-static void
-midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
-{
-        mir_foreach_instr_in_block_safe(block, ins) {
-                if (ins->type != TAG_LOAD_STORE_4) continue;
-
-                /* We've found a load/store op. Check if next is also load/store. */
-                midgard_instruction *next_op = mir_next_op(ins);
-                if (&next_op->link != &block->instructions) {
-                        if (next_op->type == TAG_LOAD_STORE_4) {
-                                /* If so, we're done since we're a pair */
-                                ins = mir_next_op(ins);
-                                continue;
-                        }
-
-                        /* Maximum search distance to pair, to avoid register pressure disasters */
-                        int search_distance = 8;
-
-                        /* Otherwise, we have an orphaned load/store -- search for another load */
-                        mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
-                                /* Terminate search if necessary */
-                                if (!(search_distance--)) break;
-
-                                if (c->type != TAG_LOAD_STORE_4) continue;
-
-                                /* Stores cannot be reordered, since they have
-                                 * dependencies. For the same reason, indirect
-                                 * loads cannot be reordered as their index is
-                                 * loaded in r27.w */
-
-                                if (OP_IS_STORE(c->load_store.op)) continue;
-
-                                /* It appears the 0x800 bit is set whenever a
-                                 * load is direct, unset when it is indirect.
-                                 * Skip indirect loads. */
-
-                                if (!(c->load_store.unknown & 0x800)) continue;
-
-                                /* We found one! Move it up to pair and remove it from the old location */
-
-                                mir_insert_instruction_before(ins, *c);
-                                mir_remove_instruction(c);
-
-                                break;
-                        }
-                }
-        }
-}
-
 /* If there are leftovers after the below pass, emit actual fmov
  * instructions for the slow-but-correct path */
 
@@ -2358,8 +2291,6 @@ emit_block(compiler_context *ctx, nir_block *block)
         /* Perform heavylifting for aliasing */
         actualise_ssa_to_alias(ctx);
 
-        midgard_pair_load_store(ctx, this_block);
-
         /* Append fragment shader epilogue (value writeout) */
         if (ctx->stage == MESA_SHADER_FRAGMENT) {
                 if (block == nir_impl_last_block(ctx->func->impl)) {
@@ -2564,7 +2495,9 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
 
         compiler_context *ctx = &ictx;
 
-        /* TODO: Decide this at runtime */
+        /* Start off with a safe cutoff, allowing usage of all 16 work
+         * registers. Later, we'll promote uniform reads to uniform registers
+         * if we determine it is beneficial to do so */
         ctx->uniform_cutoff = 8;
 
         /* Initialize at a global (not block) level hash tables */
diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c
index ccc641c7b86..97e06d743fa 100644
--- a/src/panfrost/midgard/midgard_schedule.c
+++ b/src/panfrost/midgard/midgard_schedule.c
@@ -524,6 +524,59 @@ schedule_block(compiler_context *ctx, midgard_block *block)
         block->is_scheduled = true;
 }
 
+/* The following passes reorder MIR instructions to enable better scheduling */
+
+static void
+midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
+{
+        mir_foreach_instr_in_block_safe(block, ins) {
+                if (ins->type != TAG_LOAD_STORE_4) continue;
+
+                /* We've found a load/store op. Check if next is also load/store. */
+                midgard_instruction *next_op = mir_next_op(ins);
+                if (&next_op->link != &block->instructions) {
+                        if (next_op->type == TAG_LOAD_STORE_4) {
+                                /* If so, we're done since we're a pair */
+                                ins = mir_next_op(ins);
+                                continue;
+                        }
+
+                        /* Maximum search distance to pair, to avoid register pressure disasters */
+                        int search_distance = 8;
+
+                        /* Otherwise, we have an orphaned load/store -- search for another load */
+                        mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
+                                /* Terminate search if necessary */
+                                if (!(search_distance--)) break;
+
+                                if (c->type != TAG_LOAD_STORE_4) continue;
+
+                                /* Stores cannot be reordered, since they have
+                                 * dependencies. For the same reason, indirect
+                                 * loads cannot be reordered as their index is
+                                 * loaded in r27.w */
+
+                                if (OP_IS_STORE(c->load_store.op)) continue;
+
+                                /* It appears the 0x800 bit is set whenever a
+                                 * load is direct, unset when it is indirect.
+                                 * Skip indirect loads. */
+
+                                if (!(c->load_store.unknown & 0x800)) continue;
+
+                                /* We found one! Move it up to pair and remove it from the old location */
+
+                                mir_insert_instruction_before(ins, *c);
+                                mir_remove_instruction(c);
+
+                                break;
+                        }
+                }
+        }
+}
+
+
+
 void
 schedule_program(compiler_context *ctx)
 {
@@ -531,6 +584,12 @@
         bool spilled = false;
         int iter_count = 10; /* max iterations */
 
+        midgard_promote_uniforms(ctx, 8);
+
+        mir_foreach_block(ctx, block) {
+                midgard_pair_load_store(ctx, block);
+        }
+
         do {
                 /* We would like to run RA after scheduling, but spilling can
                  * complicate this */
diff --git a/src/panfrost/midgard/mir_promote_uniforms.c b/src/panfrost/midgard/mir_promote_uniforms.c
new file mode 100644
index 00000000000..7206cb70165
--- /dev/null
+++ b/src/panfrost/midgard/mir_promote_uniforms.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors (Collabora):
+ *    Alyssa Rosenzweig
+ */
+
+#include "compiler.h"
+
+/* This pass promotes reads from uniforms from load/store ops to uniform
+ * registers if it is beneficial to do so. Normally, this saves both
+ * instructions and total register pressure, but it does take a toll on the
+ * number of work registers that are available, so this is a balance.
+ *
+ * To cope, we take as an argument the maximum work register pressure in the
+ * program so we allow that many registers through at minimum, to prevent
+ * spilling. If we spill anyway, I mean, it's a lose-lose at that point. */
+
+void
+midgard_promote_uniforms(compiler_context *ctx, unsigned register_pressure)
+{
+        /* For our purposes, pressure is capped at the number of vec4 work
+         * registers, not live values which would consider spills */
+        register_pressure = MIN2(register_pressure, 16);
+
+        mir_foreach_instr_global_safe(ctx, ins) {
+                if (ins->type != TAG_LOAD_STORE_4) continue;
+                if (!OP_IS_UBO_READ(ins->load_store.op)) continue;
+
+                unsigned lo = ins->load_store.varying_parameters >> 7;
+                unsigned hi = ins->load_store.address;
+
+                /* TODO: Combine fields logically */
+                unsigned address = (hi << 3) | lo;
+
+                /* Check this is UBO 0 */
+                if (ins->load_store.unknown & 0xF) continue;
+
+                /* Check we're accessing directly */
+                if (ins->load_store.unknown != 0x1E00) continue;
+
+                /* Check if it's a promotable range */
+                unsigned uniform_reg = 23 - address;
+
+                if (address > 16) continue;
+                if (register_pressure > uniform_reg) continue;
+
+                /* It is, great! Let's promote */
+
+                ctx->uniform_cutoff = MAX2(ctx->uniform_cutoff, address + 1);
+
+                unsigned promoted = SSA_FIXED_REGISTER(uniform_reg);
+                mir_rewrite_index_src(ctx, ins->ssa_args.dest, promoted);
+
+                mir_remove_instruction(ins);
+        }
+}
-- 
2.30.2
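
A note on the split addressing the patch relies on: emit_ubo_read stashes
the low three bits of the UBO word offset in the varying_parameters field
(shifted up by 7) and the remaining bits in the address field; the new
pass reassembles them, hence the paired "TODO: Don't split" and
"TODO: Combine fields logically" comments. The following standalone
sketch is not part of the patch: fake_load_store and the helper names are
hypothetical stand-ins for the real midgard load/store word, and it only
demonstrates that the pack/unpack pair round-trips.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the two fields m_ld_uniform_32 uses;
     * the real instruction word has many more fields. */
    struct fake_load_store {
            uint32_t varying_parameters;
            uint32_t address;
    };

    /* Mirrors emit_ubo_read: low 3 bits of the word offset go into
     * varying_parameters (shifted up by 7), the rest into address. */
    static void
    pack_ubo_offset(struct fake_load_store *ins, unsigned offset)
    {
            ins->varying_parameters = (offset & 7) << 7;
            ins->address = offset >> 3;
    }

    /* Mirrors midgard_promote_uniforms: recover the word offset */
    static unsigned
    unpack_ubo_offset(const struct fake_load_store *ins)
    {
            unsigned lo = ins->varying_parameters >> 7;
            unsigned hi = ins->address;
            return (hi << 3) | lo;
    }

    int
    main(void)
    {
            for (unsigned offset = 0; offset < 128; ++offset) {
                    struct fake_load_store ins;
                    pack_ubo_offset(&ins, offset);
                    assert(unpack_ubo_offset(&ins) == offset);
            }
            printf("pack/unpack round-trip OK\n");
            return 0;
    }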
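
The promotion criterion itself is small enough to model in isolation:
uniform word N is pinned to register r(23 - N), counting down from r23,
so promoting a read is only allowed when at least the requested number
of registers remains free below it. The sketch below is again standalone
and hypothetical: promotable() is not a Mesa function, and it assumes
the MIN2 pressure cap above together with the budget of 8 that
schedule_program passes.

    #include <stdbool.h>
    #include <stdio.h>

    /* Models the checks in midgard_promote_uniforms: uniform word
     * "address" maps to register r(23 - address). */
    static bool
    promotable(unsigned address, unsigned register_pressure)
    {
            /* Pressure is capped at the 16 vec4 work registers */
            if (register_pressure > 16)
                    register_pressure = 16;

            /* Outside the promotable address range */
            if (address > 16)
                    return false;

            unsigned uniform_reg = 23 - address;

            /* Keep at least register_pressure registers for work */
            if (register_pressure > uniform_reg)
                    return false;

            return true;
    }

    int
    main(void)
    {
            /* With the budget of 8 used by schedule_program, words
             * 0..15 (r23 down to r8) are promoted; word 16 is not. */
            for (unsigned address = 0; address < 18; ++address) {
                    printf("uniform %2u -> %s\n", address,
                           promotable(address, 8) ? "register" : "ld_uniform");
            }
            return 0;
    }

Counting down from r23 is what lets the low uniform words coexist with a
full set of work registers: words 0-7 live entirely in r16-r23, so
promoting them costs no work registers at all, matching the safe initial
cutoff of 8 set in midgard_compile_shader_nir.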