* component c is `x`, we are accessing v.x, and each of the succeeding
* components y, z... up to the last component of the vector are accessed
* sequentially, then we may perform the same rewrite. If this is not the case,
- * rewriting would require a swizzle or writemask (TODO), so we fallback on a
+ * rewriting would require more complex vector features, so we fallback on a
* move.
*
 * Otherwise, if the source is not SSA, we also fallback on a move. We could
*/
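
/* Copies the comp'th 32-bit source of a combine into the comp'th 32-bit word
 * of the temporary register R. Lowering a 32-bit combine therefore emits one
 * such move per (present) source. */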
static void
-bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
+bi_combine_mov32(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
- unsigned bits = nir_alu_type_get_type_size(parent->dest_type);
- unsigned bytes = bits / 8;
-
bi_instruction move = {
.type = BI_MOV,
.dest = R,
- .dest_type = parent->dest_type,
- .writemask = ((1 << bytes) - 1) << (bytes * comp),
+ .dest_type = nir_type_uint32,
+ .dest_offset = comp,
.src = { parent->src[comp] },
- .src_types = { parent->dest_type },
+ .src_types = { nir_type_uint32 },
.swizzle = { { parent->swizzle[comp][0] } }
};
bi_emit_before(ctx, parent, move);
}
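+
+/* Packs two adjacent 16-bit sources of a combine into a single 32-bit word of
+ * R using a SELECT: source halves comp and comp + 1 become the low and high
+ * halves of destination word comp >> 1. */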
+static void
+bi_combine_sel16(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
+{
+ bi_instruction sel = {
+ .type = BI_SELECT,
+ .dest = R,
+ .dest_type = nir_type_uint32,
+ .dest_offset = comp >> 1,
+ .src = { parent->src[comp], parent->src[comp + 1] },
+ .src_types = { nir_type_uint16, nir_type_uint16 },
+                .swizzle = { {
+                        parent->swizzle[comp][0],
+                }, {
+                        parent->swizzle[comp + 1][0],
+                } }
+ };
+
+ bi_emit_before(ctx, parent, sel);
+}
+
/* Gets the instruction generating a given source. Combine lowering is
* accidentally O(n^2) right now because this function is O(n) instead of O(1).
 * If this pass is slow, this cost can be avoided in favour of better
* bookkeeping. */
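/* One way to do that bookkeeping (a hypothetical sketch, not part of this
 * pass; the table and its sizing are assumptions for illustration) would be
 * to build a map from destination index to generating instruction once per
 * invocation:
 *
 *    bi_instruction **defs = calloc(max_index, sizeof(bi_instruction *));
 *
 *    bi_foreach_instr_global(ctx, ins)
 *            defs[ins->dest] = ins;
 *
 * after which bi_get_parent reduces to a single array lookup. */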
static bi_instruction *
-bi_get_parent(bi_context *ctx, unsigned idx, unsigned mask)
+bi_get_parent(bi_context *ctx, unsigned idx)
{
bi_foreach_instr_global(ctx, ins) {
if (ins->dest == idx)
- if ((ins->writemask & mask) == mask)
- return ins;
+ return ins;
}
return NULL;
}
-/* Shifts the writemask of an instruction by a specified byte count,
- * rotating the sources to compensate. Returns true if successful, and
- * returns false if not (nondestructive in this case). */
-
-static bool
-bi_shift_mask(bi_instruction *ins, unsigned shift)
-{
- /* No op and handles the funny cases */
- if (!shift)
- return true;
-
- unsigned sz = nir_alu_type_get_type_size(ins->dest_type);
- unsigned bytes = sz / 8;
-
- /* If things are misaligned, we bail. Check if shift % bytes is
- * nonzero. Note bytes is a power-of-two. */
- if (shift & (bytes - 1))
- return false;
-
- /* Ensure there are no funny types */
- bi_foreach_src(ins, s) {
- if (ins->src[s] && nir_alu_type_get_type_size(ins->src_types[s]) != sz)
- return false;
- }
-
- /* Shift swizzle so old i'th component is accessed by new (i + j)'th
- * component where j is component shift */
- unsigned component_shift = shift / bytes;
-
- /* Sanity check to avoid memory corruption */
- if (component_shift >= sizeof(ins->swizzle[0]))
- return false;
-
- /* Otherwise, shift is divisible by bytes, and all relevant src types
- * are the same size as the dest type. */
- ins->writemask <<= shift;
-
- bi_foreach_src(ins, s) {
- if (!ins->src[s]) continue;
-
- size_t overlap = sizeof(ins->swizzle[s]) - component_shift;
- memmove(ins->swizzle[s] + component_shift, ins->swizzle[s], overlap);
- }
-
- return true;
-}
-
/* Checks if we have a nicely aligned vector prefix */
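/* For example, sources (t.x, t.y, t.z) at the start of a combine, where t is
 * written by a 32-bit BI_VECTOR instruction with vector_channels == 3, form
 * such a prefix; any remaining sources still need moves. */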
static bool
-bi_is_aligned_vec(bi_instruction *combine, unsigned s, bi_instruction *parent,
+bi_is_aligned_vec32(bi_instruction *combine, unsigned s, bi_instruction *io,
unsigned *count)
{
/* We only support prefixes */
if (s != 0)
return false;
- /* Is it a contiguous write? */
- unsigned writes = util_bitcount(parent->writemask);
- if (parent->writemask != ((1 << writes) - 1))
+ if (!(bi_class_props[io->type] & BI_VECTOR))
+ return false;
+
+ if (nir_alu_type_get_type_size(combine->dest_type) != 32)
+ return false;
+
+ if (nir_alu_type_get_type_size(io->dest_type) != 32)
return false;
- /* Okay - how many components? */
- unsigned bytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
- unsigned components = writes / bytes;
+ unsigned components = io->vector_channels;
/* Are we contiguous like that? */
for (unsigned i = 0; i < components; ++i) {
- if (combine->src[i] != parent->dest)
+ if (combine->src[i] != io->dest)
return false;
if (combine->swizzle[i][0] != i)
                        return false;
        }

        *count = components;
        return true;
}
+#if 0
/* Tries to lower a given source of a combine to an appropriate rewrite,
* returning true if successful, and false with no changes otherwise. */

static bool
bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R,
                unsigned *vec_count)
{
        unsigned src = ins->src[s];

        /* We currently only handle SSA */
        if (!src) return false;
- if (src & (BIR_SPECIAL | BIR_IS_REG)) return false;
+ if (src & (BIR_SPECIAL | PAN_IS_REG)) return false;
/* We are SSA. Lookup the generating instruction. */
        unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;

        bi_instruction *parent = bi_get_parent(ctx, src);
        if (!parent) return false;

unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
if (pbytes != bytes) return false;
- bool scalar = (parent->writemask == ((1 << bytes) - 1));
+ bool scalar = parent->vector_channels != 0;
if (!(scalar || bi_is_aligned_vec(ins, s, parent, vec_count))) return false;
if (!bi_shift_mask(parent, bytes * s)) return false;
parent->dest = R;
return true;
}
+#endif
void
bi_lower_combine(bi_context *ctx, bi_block *block)
{
bi_foreach_instr_in_block_safe(block, ins) {
if (ins->type != BI_COMBINE) continue;
- /* The vector itself can't be shifted */
- assert(ins->writemask & 0x1);
-
unsigned R = bi_make_temp_reg(ctx);
+ unsigned sz = nir_alu_type_get_type_size(ins->dest_type);
bi_foreach_src(ins, s) {
/* We're done early for vec2/3 */
if (!ins->src[s])
continue;
+#if 0
unsigned vec_count = 0;
if (bi_lower_combine_src(ctx, ins, s, R, &vec_count)) {
} else {
bi_insert_combine_mov(ctx, ins, s, R);
}
+#endif
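+                        /* 32-bit sources each get their own word of R; 16-bit
+                         * sources are packed in pairs, so the 16-bit path
+                         * consumes two sources per iteration (hence the extra
+                         * s++). */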
+ if (sz == 32)
+ bi_combine_mov32(ctx, ins, s, R);
+ else if (sz == 16) {
+ bi_combine_sel16(ctx, ins, s, R);
+ s++;
+ } else {
+ unreachable("Unknown COMBINE size");
+ }
}