From 5a3493c536b174030d0c62e0196955d88c74066a Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Mon, 23 Mar 2020 16:48:58 -0400
Subject: [PATCH] pan/bi: Lower combines to rewrites for scalars

This avoids unneeded moves for scalars. It still generates suboptimal
code for vectors, however.

Signed-off-by: Alyssa Rosenzweig
Part-of:
---
 src/panfrost/bifrost/bi_lower_combine.c | 148 ++++++++++++++++++++++--
 1 file changed, 141 insertions(+), 7 deletions(-)

diff --git a/src/panfrost/bifrost/bi_lower_combine.c b/src/panfrost/bifrost/bi_lower_combine.c
index e41bdb61c52..8d4bea93368 100644
--- a/src/panfrost/bifrost/bi_lower_combine.c
+++ b/src/panfrost/bifrost/bi_lower_combine.c
@@ -28,18 +28,32 @@
  *
  * v = combine x, y, z, w
  *
- * These combines need to be lowered by the pass in this file.
+ * These combines need to be lowered by the pass in this file. Fix a given
+ * source at component c.
+ *
+ * First suppose the source is SSA. If it is also scalar, then we may rewrite
+ * the destination of the generating instruction (unique by SSA+scalar) to
+ * write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
+ * (the original component, as the source is scalar). If it is a vector, there
+ * are two cases. If component c is `x`, so we are accessing v.x, and each
+ * succeeding component y, z... up to the last component of the vector is
+ * accessed sequentially, then we may perform the same rewrite. If this is not
+ * the case, rewriting would require a swizzle or writemask (TODO), so we fall
+ * back on a move.
+ *
+ * Otherwise, if the source is not SSA, we also fall back on a move. We could
+ * probably do better.
  */
 
 static void
-bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp)
+bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
 {
         unsigned bits = nir_alu_type_get_type_size(parent->dest_type);
         unsigned bytes = bits / 8;
 
         bi_instruction move = {
                 .type = BI_MOV,
-                .dest = parent->dest,
+                .dest = R,
                 .dest_type = parent->dest_type,
                 .writemask = ((1 << bytes) - 1) << (bytes * comp),
                 .src = { parent->src[comp] },
@@ -50,19 +64,139 @@ bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp)
         bi_emit_before(ctx, parent, move);
 }
 
+/* Gets the instruction generating a given source. Combine lowering is
+ * accidentally O(n^2) right now because this function is O(n) instead of
+ * O(1). If this pass is slow, this cost can be avoided in favour of better
+ * bookkeeping. */
+
+static bi_instruction *
+bi_get_parent(bi_context *ctx, unsigned idx, unsigned mask)
+{
+        bi_foreach_instr_global(ctx, ins) {
+                if (ins->dest == idx)
+                        if ((ins->writemask & mask) == mask)
+                                return ins;
+        }
+
+        return NULL;
+}
+
+/* Rewrites uses of an index. Again, this could be O(n) in the program, but
+ * is currently O(nc) in the program size and combine count, so the pass
+ * becomes effectively O(n^2). Better bookkeeping would bring it down to
+ * linear if that's an issue. */
+
+static void
+bi_rewrite_uses(bi_context *ctx,
+                unsigned old, unsigned oldc,
+                unsigned new, unsigned newc)
+{
+        bi_foreach_instr_global(ctx, ins) {
+                bi_foreach_src(ins, s) {
+                        if (ins->src[s] != old) continue;
+
+                        for (unsigned i = 0; i < 16; ++i)
+                                ins->swizzle[s][i] += (newc - oldc);
+
+                        ins->src[s] = new;
+                }
+        }
+}
+
+/* Shifts the writemask of an instruction by a specified byte count,
+ * rotating the sources to compensate. Returns true if successful, and
+ * returns false if not (nondestructive in this case).
+ */
+
+static bool
+bi_shift_mask_scalar(bi_instruction *ins, signed shift)
+{
+        unsigned sz = nir_alu_type_get_type_size(ins->dest_type);
+        unsigned bytes = sz / 8;
+
+        /* If things are misaligned, we bail. Check if shift % bytes is
+         * nonzero. Note bytes is a power-of-two. */
+        if (shift & (bytes - 1))
+                return false;
+
+        /* Ensure there are no funny types */
+        bi_foreach_src(ins, s) {
+                if (ins->src[s] && nir_alu_type_get_type_size(ins->src_types[s]) != sz)
+                        return false;
+        }
+
+        /* Otherwise, shift is divisible by bytes, and all relevant src types
+         * are the same size as the dest type. */
+        ins->writemask <<= shift;
+
+        /* Shift swizzle so old i'th component is accessed by new (i + j)'th
+         * component where j is component shift */
+        signed component_shift = shift / bytes;
+
+        bi_foreach_src(ins, s) {
+                if (!ins->src[s]) continue;
+
+                size_t overlap = sizeof(ins->swizzle[s]) - abs(component_shift);
+
+                if (component_shift > 0)
+                        memmove(ins->swizzle[s] + component_shift, ins->swizzle[s], overlap);
+                else
+                        memmove(ins->swizzle[s], ins->swizzle[s] - component_shift, overlap);
+        }
+
+        return true;
+}
+
+/* Tries to lower a given source of a combine to an appropriate rewrite,
+ * returning true if successful, and false with no changes otherwise. */
+
+static bool
+bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R)
+{
+        unsigned src = ins->src[s];
+
+        /* We currently only handle SSA */
+
+        if (!src) return false;
+        if (src & (BIR_SPECIAL | BIR_IS_REG)) return false;
+
+        /* We are SSA. Look up the generating instruction. */
+        unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;
+
+        bi_instruction *parent = bi_get_parent(ctx, src,
+                        0xF << (ins->swizzle[s][0] * bytes));
+
+        if (!parent) return false;
+
+        /* We have a parent instruction; sanity check the typesize */
+        unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
+        if (pbytes != bytes) return false;
+
+        /* Scalar? */
+        if (parent->writemask != ((1 << bytes) - 1)) return false;
+        if (!bi_shift_mask_scalar(parent, bytes * s)) return false;
+        bi_rewrite_uses(ctx, parent->dest, 0, R, s);
+        parent->dest = R;
+        return true;
+}
+
 void
 bi_lower_combine(bi_context *ctx, bi_block *block)
 {
         bi_foreach_instr_in_block_safe(block, ins) {
                 if (ins->type != BI_COMBINE) continue;
 
-                bi_foreach_src(ins, s) {
-                        if (!ins->src[s])
-                                break;
+                /* The vector itself can't be shifted */
+                assert(ins->writemask & 0x1);
+
+                unsigned R = bi_make_temp_reg(ctx);
 
-                        bi_insert_combine_mov(ctx, ins, s);
+                bi_foreach_src(ins, s) {
+                        if (!bi_lower_combine_src(ctx, ins, s, R))
+                                bi_insert_combine_mov(ctx, ins, s, R);
                 }
+
+                bi_rewrite_uses(ctx, ins->dest, 0, R, 0);
                 bi_remove_instruction(ins);
         }
 }
-- 
2.30.2
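
Aside for reviewers: the trickiest arithmetic in this patch is the
writemask/swizzle shift in bi_shift_mask_scalar. The standalone sketch below
exercises that arithmetic in isolation. struct fake_ins and shift_mask_scalar
are hypothetical stand-ins mirroring only the fields the shift touches (a
per-byte writemask and a 16-entry swizzle), not the real bi_instruction
definition.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for bi_instruction, just large enough to model the
 * byte-mask and component-rotation arithmetic from the patch. */
struct fake_ins {
        unsigned writemask;   /* one bit per destination byte */
        uint8_t swizzle[16];  /* source component read per dest component */
};

static bool
shift_mask_scalar(struct fake_ins *ins, unsigned bytes, signed shift)
{
        /* Bail on misaligned shifts; bytes is a power of two, so the
         * modulo reduces to a mask, exactly as in the patch. */
        if (shift & (bytes - 1))
                return false;

        /* writemask has one bit per byte, so a byte shift is a bit shift
         * here. (The pass only ever shifts left, i.e. shift >= 0.) */
        ins->writemask <<= shift;

        /* Rotate the swizzle so the old i'th component is read by the new
         * (i + j)'th component, j being the component shift. */
        signed component_shift = shift / (signed) bytes;
        size_t overlap = sizeof(ins->swizzle) - abs(component_shift);

        if (component_shift > 0)
                memmove(ins->swizzle + component_shift, ins->swizzle, overlap);
        else
                memmove(ins->swizzle, ins->swizzle - component_shift, overlap);

        return true;
}

int
main(void)
{
        /* A 32-bit scalar write: mask 0xF (bytes 0-3), reading component 0 */
        struct fake_ins ins = { .writemask = 0xF };

        /* Shift by 8 bytes = 2 components for a 4-byte type, as when the
         * scalar feeds component 2 (s = 2) of a combine. */
        if (shift_mask_scalar(&ins, 4, 8))
                printf("mask now 0x%X, dest component 2 reads source component %u\n",
                       ins.writemask, (unsigned) ins.swizzle[2]);

        return 0;
}

Built with any C99 compiler, this prints "mask now 0xF00, dest component 2
reads source component 0": the scalar's write moves to bytes 8-11 of the
combined vector and its swizzle rotates by two components, which is the
rewrite the pass performs in place of a move.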