From 5a3493c536b174030d0c62e0196955d88c74066a Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Mon, 23 Mar 2020 16:48:58 -0400
Subject: [PATCH] pan/bi: Lower combines to rewrites for scalars

This avoids unneeded moves for scalars. It still generates suboptimal
code for vectors, however.

Signed-off-by: Alyssa Rosenzweig
Part-of:
---
 src/panfrost/bifrost/bi_lower_combine.c | 148 ++++++++++++++++++++++--
 1 file changed, 141 insertions(+), 7 deletions(-)

diff --git a/src/panfrost/bifrost/bi_lower_combine.c b/src/panfrost/bifrost/bi_lower_combine.c
index e41bdb61c52..8d4bea93368 100644
--- a/src/panfrost/bifrost/bi_lower_combine.c
+++ b/src/panfrost/bifrost/bi_lower_combine.c
@@ -28,18 +28,32 @@
  *
  * v = combine x, y, z, w
  *
- * These combines need to be lowered by the pass in this file.
+ * These combines need to be lowered by the pass in this file. Fix a given
+ * source at component c.
+ *
+ * First suppose the source is SSA. If it is also scalar, then we may rewrite
+ * the destination of the generating instruction (unique by SSA+scalar) to
+ * write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
+ * (the original component, as the source is scalar). If it is a vector, there
+ * are two cases. If component c is `x`, so we are accessing v.x, and each
+ * succeeding component y, z... up to the last component of the vector is
+ * accessed sequentially, then we may perform the same rewrite. If this is not
+ * the case, rewriting would require a swizzle or writemask (TODO), so we fall
+ * back on a move.
+ *
+ * Otherwise, if the source is not SSA, we also fall back on a move. We could
+ * probably do better.
  */
 
 static void
-bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp)
+bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
 {
         unsigned bits = nir_alu_type_get_type_size(parent->dest_type);
         unsigned bytes = bits / 8;
 
         bi_instruction move = {
                 .type = BI_MOV,
-                .dest = parent->dest,
+                .dest = R,
                 .dest_type = parent->dest_type,
                 .writemask = ((1 << bytes) - 1) << (bytes * comp),
                 .src = { parent->src[comp] },
@@ -50,19 +64,139 @@ bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp)
         bi_emit_before(ctx, parent, move);
 }
 
+/* Gets the instruction generating a given source. Combine lowering is
+ * accidentally O(n^2) right now because this function is O(n) instead of
+ * O(1). If this pass is slow, this cost can be avoided in favour of better
+ * bookkeeping. */
+
+static bi_instruction *
+bi_get_parent(bi_context *ctx, unsigned idx, unsigned mask)
+{
+        bi_foreach_instr_global(ctx, ins) {
+                if (ins->dest == idx)
+                        if ((ins->writemask & mask) == mask)
+                                return ins;
+        }
+
+        return NULL;
+}
+
+/* Rewrites uses of an index. Again, this could be O(n) in the program, but
+ * is currently O(nc) in the program size and combine count, so the pass
+ * becomes effectively O(n^2). Better bookkeeping would bring it down to
+ * linear if that's an issue. */
+
+static void
+bi_rewrite_uses(bi_context *ctx,
+                unsigned old, unsigned oldc,
+                unsigned new, unsigned newc)
+{
+        bi_foreach_instr_global(ctx, ins) {
+                bi_foreach_src(ins, s) {
+                        if (ins->src[s] != old) continue;
+
+                        for (unsigned i = 0; i < 16; ++i)
+                                ins->swizzle[s][i] += (newc - oldc);
+
+                        ins->src[s] = new;
+                }
+        }
+}
+
+/* Shifts the writemask of an instruction by a specified byte count,
+ * rotating the sources to compensate. Returns true if successful, and
+ * returns false if not (nondestructive in this case).
+ */
+
+static bool
+bi_shift_mask_scalar(bi_instruction *ins, signed shift)
+{
+        unsigned sz = nir_alu_type_get_type_size(ins->dest_type);
+        unsigned bytes = sz / 8;
+
+        /* If things are misaligned, we bail. Check if shift % bytes is
+         * nonzero. Note bytes is a power-of-two. */
+        if (shift & (bytes - 1))
+                return false;
+
+        /* Ensure there are no funny types */
+        bi_foreach_src(ins, s) {
+                if (ins->src[s] && nir_alu_type_get_type_size(ins->src_types[s]) != sz)
+                        return false;
+        }
+
+        /* Otherwise, shift is divisible by bytes, and all relevant src types
+         * are the same size as the dest type. */
+        ins->writemask <<= shift;
+
+        /* Shift swizzle so old i'th component is accessed by new (i + j)'th
+         * component where j is component shift */
+        signed component_shift = shift / bytes;
+
+        bi_foreach_src(ins, s) {
+                if (!ins->src[s]) continue;
+
+                size_t overlap = sizeof(ins->swizzle[s]) - abs(component_shift);
+
+                if (component_shift > 0)
+                        memmove(ins->swizzle[s] + component_shift, ins->swizzle[s], overlap);
+                else
+                        memmove(ins->swizzle[s], ins->swizzle[s] - component_shift, overlap);
+        }
+
+        return true;
+}
+
+/* Tries to lower a given source of a combine to an appropriate rewrite,
+ * returning true if successful, and false with no changes otherwise. */
+
+static bool
+bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R)
+{
+        unsigned src = ins->src[s];
+
+        /* We currently only handle SSA */
+
+        if (!src) return false;
+        if (src & (BIR_SPECIAL | BIR_IS_REG)) return false;
+
+        /* We are SSA. Look up the generating instruction. */
+        unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;
+
+        bi_instruction *parent = bi_get_parent(ctx, src,
+                        0xF << (ins->swizzle[s][0] * bytes));
+
+        if (!parent) return false;
+
+        /* We have a parent instruction; sanity check the typesize */
+        unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
+        if (pbytes != bytes) return false;
+
+        /* Scalar? */
+        if (parent->writemask != ((1 << bytes) - 1)) return false;
+        if (!bi_shift_mask_scalar(parent, bytes * s)) return false;
+        bi_rewrite_uses(ctx, parent->dest, 0, R, s);
+        parent->dest = R;
+        return true;
+}
+
 void
 bi_lower_combine(bi_context *ctx, bi_block *block)
 {
         bi_foreach_instr_in_block_safe(block, ins) {
                 if (ins->type != BI_COMBINE) continue;
 
-                bi_foreach_src(ins, s) {
-                        if (!ins->src[s])
-                                break;
+                /* The vector itself can't be shifted */
+                assert(ins->writemask & 0x1);
+
+                unsigned R = bi_make_temp_reg(ctx);
 
-                        bi_insert_combine_mov(ctx, ins, s);
+                bi_foreach_src(ins, s) {
+                        if (!bi_lower_combine_src(ctx, ins, s, R))
+                                bi_insert_combine_mov(ctx, ins, s, R);
                 }
+
+                bi_rewrite_uses(ctx, ins->dest, 0, R, 0);
                 bi_remove_instruction(ins);
         }
 }
-- 
2.30.2
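
Aside for reviewers: the trickiest arithmetic in this patch is the
writemask/swizzle shift in bi_shift_mask_scalar. The standalone sketch below
exercises that arithmetic in isolation. struct fake_ins and shift_mask_scalar
are hypothetical stand-ins mirroring only the fields the shift touches (a
per-byte writemask and a 16-entry swizzle), not the real bi_instruction
definition.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for bi_instruction, just large enough to model the
 * byte-mask and component-rotation arithmetic from the patch. */
struct fake_ins {
        unsigned writemask;   /* one bit per destination byte */
        uint8_t swizzle[16];  /* source component read per dest component */
};

static bool
shift_mask_scalar(struct fake_ins *ins, unsigned bytes, signed shift)
{
        /* Bail on misaligned shifts; bytes is a power of two, so the
         * modulo reduces to a mask, exactly as in the patch. */
        if (shift & (bytes - 1))
                return false;

        /* writemask has one bit per byte, so a byte shift is a bit shift
         * here. (The pass only ever shifts left, i.e. shift >= 0.) */
        ins->writemask <<= shift;

        /* Rotate the swizzle so the old i'th component is read by the new
         * (i + j)'th component, j being the component shift. */
        signed component_shift = shift / (signed) bytes;
        size_t overlap = sizeof(ins->swizzle) - abs(component_shift);

        if (component_shift > 0)
                memmove(ins->swizzle + component_shift, ins->swizzle, overlap);
        else
                memmove(ins->swizzle, ins->swizzle - component_shift, overlap);

        return true;
}

int
main(void)
{
        /* A 32-bit scalar write: mask 0xF (bytes 0-3), reading component 0 */
        struct fake_ins ins = { .writemask = 0xF };

        /* Shift by 8 bytes = 2 components for a 4-byte type, as when the
         * scalar feeds component 2 (s = 2) of a combine. */
        if (shift_mask_scalar(&ins, 4, 8))
                printf("mask now 0x%X, dest component 2 reads source component %u\n",
                       ins.writemask, (unsigned) ins.swizzle[2]);

        return 0;
}

Built with any C99 compiler, this prints "mask now 0xF00, dest component 2
reads source component 0": the scalar's write moves to bytes 8-11 of the
combined vector and its swizzle rotates by two components, which is the
rewrite the pass performs in place of a move.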