* component c is `x`, we are accessing v.x, and each of the succeeding
* components y, z... up to the last component of the vector are accessed
* sequentially, then we may perform the same rewrite. If this is not the case,
- * rewriting would require a swizzle or writemask (TODO), so we fallback on a
+ * rewriting would require more complex vector features, so we fallback on a
* move.
*
 * Otherwise, if the source is not SSA, we also fallback on a move. We could
*/
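
/* Copies the comp'th 32-bit source of a combine into the comp'th 32-bit word
 * of the temporary register R. Lowering a 32-bit combine therefore emits one
 * such move per (present) source. */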
static void
-bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
+bi_combine_mov32(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
- unsigned bits = nir_alu_type_get_type_size(parent->dest_type);
- unsigned bytes = bits / 8;
-
bi_instruction move = {
.type = BI_MOV,
.dest = R,
- .dest_type = parent->dest_type,
- .writemask = ((1 << bytes) - 1) << (bytes * comp),
+ .dest_type = nir_type_uint32,
+ .dest_offset = comp,
.src = { parent->src[comp] },
- .src_types = { parent->dest_type },
+ .src_types = { nir_type_uint32 },
.swizzle = { { parent->swizzle[comp][0] } }
};
bi_emit_before(ctx, parent, move);
}
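+
+/* Packs two adjacent 16-bit sources of a combine into a single 32-bit word of
+ * R using a SELECT: source halves comp and comp + 1 become the low and high
+ * halves of destination word comp >> 1. */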
+static void
+bi_combine_sel16(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
+{
+ bi_instruction sel = {
+ .type = BI_SELECT,
+ .dest = R,
+ .dest_type = nir_type_uint32,
+ .dest_offset = comp >> 1,
+ .src = { parent->src[comp], parent->src[comp + 1] },
+ .src_types = { nir_type_uint16, nir_type_uint16 },
+                .swizzle = { {
+                        parent->swizzle[comp][0],
+                }, {
+                        parent->swizzle[comp + 1][0],
+                } }
+ };
+
+ bi_emit_before(ctx, parent, sel);
+}
+
/* Gets the instruction generating a given source. Combine lowering is
* accidentally O(n^2) right now because this function is O(n) instead of O(1).
 * If this pass is slow, this cost can be avoided in favour of better
* bookkeeping. */
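/* One way to do that bookkeeping (a hypothetical sketch, not part of this
 * pass; the table and its sizing are assumptions for illustration) would be
 * to build a map from destination index to generating instruction once per
 * invocation:
 *
 *    bi_instruction **defs = calloc(max_index, sizeof(bi_instruction *));
 *
 *    bi_foreach_instr_global(ctx, ins)
 *            defs[ins->dest] = ins;
 *
 * after which bi_get_parent reduces to a single array lookup. */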
static bi_instruction *
-bi_get_parent(bi_context *ctx, unsigned idx, unsigned mask)
+bi_get_parent(bi_context *ctx, unsigned idx)
{
bi_foreach_instr_global(ctx, ins) {
if (ins->dest == idx)
- if ((ins->writemask & mask) == mask)
- return ins;
+ return ins;
}
return NULL;
}
-/* Shifts the writemask of an instruction by a specified byte count,
- * rotating the sources to compensate. Returns true if successful, and
- * returns false if not (nondestructive in this case). */
-
-static bool
-bi_shift_mask(bi_instruction *ins, unsigned shift)
-{
- /* No op and handles the funny cases */
- if (!shift)
- return true;
-
- unsigned sz = nir_alu_type_get_type_size(ins->dest_type);
- unsigned bytes = sz / 8;
-
- /* If things are misaligned, we bail. Check if shift % bytes is
- * nonzero. Note bytes is a power-of-two. */
- if (shift & (bytes - 1))
- return false;
-
- /* Ensure there are no funny types */
- bi_foreach_src(ins, s) {
- if (ins->src[s] && nir_alu_type_get_type_size(ins->src_types[s]) != sz)
- return false;
- }
-
- /* Shift swizzle so old i'th component is accessed by new (i + j)'th
- * component where j is component shift */
- unsigned component_shift = shift / bytes;
-
- /* Sanity check to avoid memory corruption */
- if (component_shift >= sizeof(ins->swizzle[0]))
- return false;
-
- /* Otherwise, shift is divisible by bytes, and all relevant src types
- * are the same size as the dest type. */
- ins->writemask <<= shift;
-
- bi_foreach_src(ins, s) {
- if (!ins->src[s]) continue;
-
- size_t overlap = sizeof(ins->swizzle[s]) - component_shift;
- memmove(ins->swizzle[s] + component_shift, ins->swizzle[s], overlap);
- }
-
- return true;
-}
-
/* Checks if we have a nicely aligned vector prefix */
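/* For example, sources (t.x, t.y, t.z) at the start of a combine, where t is
 * written by a 32-bit BI_VECTOR instruction with vector_channels == 3, form
 * such a prefix; any remaining sources still need moves. */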
static bool
-bi_is_aligned_vec(bi_instruction *combine, unsigned s, bi_instruction *parent,
+bi_is_aligned_vec32(bi_instruction *combine, unsigned s, bi_instruction *io,
unsigned *count)
{
/* We only support prefixes */
if (s != 0)
return false;
- /* Is it a contiguous write? */
- unsigned writes = util_bitcount(parent->writemask);
- if (parent->writemask != ((1 << writes) - 1))
+ if (!(bi_class_props[io->type] & BI_VECTOR))
+ return false;
+
+ if (nir_alu_type_get_type_size(combine->dest_type) != 32)
+ return false;
+
+ if (nir_alu_type_get_type_size(io->dest_type) != 32)
return false;
- /* Okay - how many components? */
- unsigned bytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
- unsigned components = writes / bytes;
+ unsigned components = io->vector_channels;
/* Are we contiguous like that? */
for (unsigned i = 0; i < components; ++i) {
- if (combine->src[i] != parent->dest)
+ if (combine->src[i] != io->dest)
return false;
if (combine->swizzle[i][0] != i)
                        return false;
        }

        *count = components;
        return true;
}
+#if 0
/* Tries to lower a given source of a combine to an appropriate rewrite,
* returning true if successful, and false with no changes otherwise. */

static bool
bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R,
                unsigned *vec_count)
{
        unsigned src = ins->src[s];

        /* We currently only handle SSA */
        if (!src) return false;
- if (src & (BIR_SPECIAL | BIR_IS_REG)) return false;
+ if (src & (BIR_SPECIAL | PAN_IS_REG)) return false;
/* We are SSA. Lookup the generating instruction. */
        unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;

        bi_instruction *parent = bi_get_parent(ctx, src);
        if (!parent) return false;

unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
if (pbytes != bytes) return false;
- bool scalar = (parent->writemask == ((1 << bytes) - 1));
+ bool scalar = parent->vector_channels != 0;
if (!(scalar || bi_is_aligned_vec(ins, s, parent, vec_count))) return false;
if (!bi_shift_mask(parent, bytes * s)) return false;
parent->dest = R;
return true;
}
+#endif
void
bi_lower_combine(bi_context *ctx, bi_block *block)
{
bi_foreach_instr_in_block_safe(block, ins) {
if (ins->type != BI_COMBINE) continue;
- /* The vector itself can't be shifted */
- assert(ins->writemask & 0x1);
-
unsigned R = bi_make_temp_reg(ctx);
+ unsigned sz = nir_alu_type_get_type_size(ins->dest_type);
bi_foreach_src(ins, s) {
/* We're done early for vec2/3 */
if (!ins->src[s])
continue;
+#if 0
unsigned vec_count = 0;
if (bi_lower_combine_src(ctx, ins, s, R, &vec_count)) {
} else {
bi_insert_combine_mov(ctx, ins, s, R);
}
+#endif
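+                        /* 32-bit sources each get their own word of R; 16-bit
+                         * sources are packed in pairs, so the 16-bit path
+                         * consumes two sources per iteration (hence the extra
+                         * s++). */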
+ if (sz == 32)
+ bi_combine_mov32(ctx, ins, s, R);
+ else if (sz == 16) {
+ bi_combine_sel16(ctx, ins, s, R);
+ s++;
+ } else {
+ unreachable("Unknown COMBINE size");
+ }
}