From f98e9a2771e43efeae049c15c4ca6b579dfe125c Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 23 Oct 2019 20:19:22 -0400 Subject: [PATCH] pan/midgard: Express allocated registers as offsets Rather than supplying a mask/swizzle to compose with the original, just supply the offset of the allocated register so we can directly offset the mask/swizzle, without resorting to composition. This is simpler, cleaner, and will generalize to non-32-bit. Signed-off-by: Alyssa Rosenzweig --- src/panfrost/midgard/midgard_ra.c | 166 +++++++++++------------------- 1 file changed, 62 insertions(+), 104 deletions(-) diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c index f8355a89720..224970b8aea 100644 --- a/src/panfrost/midgard/midgard_ra.c +++ b/src/panfrost/midgard/midgard_ra.c @@ -61,64 +61,38 @@ static unsigned reg_type_to_mask[WORK_STRIDE] = { 0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3 /* x */ }; -static unsigned reg_type_to_swizzle[WORK_STRIDE] = { - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_W, COMPONENT_W), - - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_Z, COMPONENT_W), - - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Y, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Z, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_W, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), -}; - struct phys_reg { + /* Physical register: 0-31 */ unsigned reg; - unsigned mask; - unsigned swizzle; -}; -/* Given the mask/swizzle of both the register and the original source, - * compose to find the actual mask/swizzle to give the hardware */ + /* Byte offset into the physical register: 0-15 */ + unsigned offset; -static unsigned -compose_writemask(unsigned mask, 
struct phys_reg reg) -{ - /* Note: the reg mask is guaranteed to be contiguous. So we shift - * into the X place, compose via a simple AND, and shift back */ + /* Number of bytes in a component of this register */ + unsigned size; +}; - unsigned shift = __builtin_ctz(reg.mask); - return ((reg.mask >> shift) & mask) << shift; -} +/* Shift each component up by reg_offset and shift all components horizontally + * by dst_offset. TODO: Generalize to !32-bit */ static unsigned -compose_swizzle(unsigned swizzle, unsigned mask, - struct phys_reg reg, struct phys_reg dst) +offset_swizzle(unsigned swizzle, unsigned reg_offset, unsigned srcsize, unsigned dst_offset, unsigned dstsize) { - unsigned out = pan_compose_swizzle(swizzle, reg.swizzle); + unsigned out = 0; - /* Based on the register mask, we need to adjust over. E.g if we're - * writing to yz, a base swizzle of xy__ becomes _xy_. Save the - * original first component (x). But to prevent duplicate shifting - * (only applies to ALU -- mask param is set to xyzw out on L/S to - * prevent changes), we have to account for the shift inherent to the - * original writemask */ + signed reg_comp = reg_offset / srcsize; + signed dst_comp = dst_offset / dstsize; - unsigned rep = out & 0x3; - unsigned shift = __builtin_ctz(dst.mask) - __builtin_ctz(mask); - unsigned shifted = out << (2*shift); + assert(reg_comp * srcsize == reg_offset); + assert(dst_comp * dstsize == dst_offset); - /* ..but we fill in the gaps so it appears to replicate */ - - for (unsigned s = 0; s < shift; ++s) - shifted |= rep << (2*s); + for (signed c = 0; c < 4; ++c) { + signed comp = MAX2(c - dst_comp, 0); + signed s = (swizzle >> (2*comp)) & 0x3; + out |= (MIN2(s + reg_comp, 3) << (2*c)); + } - return shifted; + return out; } /* Helper to return the default phys_reg for a given register */ @@ -128,8 +102,8 @@ default_phys_reg(int reg) { struct phys_reg r = { .reg = reg, - .mask = 0xF, /* xyzw */ - .swizzle = 0xE4 /* xyzw */ + .offset = 0, + .size = 4 }; 
return r; @@ -139,7 +113,7 @@ default_phys_reg(int reg) * register corresponds to */ static struct phys_reg -index_to_reg(compiler_context *ctx, struct ra_graph *g, unsigned reg) +index_to_reg(compiler_context *ctx, struct ra_graph *g, unsigned reg, midgard_reg_mode size) { /* Check for special cases */ if (reg == ~0) @@ -163,10 +137,12 @@ index_to_reg(compiler_context *ctx, struct ra_graph *g, unsigned reg) else if (phys == SHADOW_R0) phys = 0; + unsigned bytes = mir_bytes_for_mode(size); + struct phys_reg r = { .reg = phys, - .mask = reg_type_to_mask[type], - .swizzle = reg_type_to_swizzle[type] + .offset = __builtin_ctz(reg_type_to_mask[type]) * bytes, + .size = bytes }; /* Report that we actually use this register, and return it */ @@ -702,20 +678,19 @@ install_registers_instr( if (ins->compact_branch) return; - struct phys_reg src1 = index_to_reg(ctx, g, ins->src[0]); - struct phys_reg src2 = index_to_reg(ctx, g, ins->src[1]); - struct phys_reg dest = index_to_reg(ctx, g, ins->dest); + struct phys_reg src1 = index_to_reg(ctx, g, ins->src[0], mir_srcsize(ins, 0)); + struct phys_reg src2 = index_to_reg(ctx, g, ins->src[1], mir_srcsize(ins, 1)); + struct phys_reg dest = index_to_reg(ctx, g, ins->dest, mir_typesize(ins)); - unsigned uncomposed_mask = ins->mask; - ins->mask = compose_writemask(uncomposed_mask, dest); + mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); - /* Adjust the dest mask if necessary. Mostly this is a no-op - * but it matters for dot products */ - dest.mask = effective_writemask(&ins->alu, ins->mask); + unsigned dest_offset = + GET_CHANNEL_COUNT(alu_opcode_props[ins->alu.op].props) ? 
0 : + dest.offset; midgard_vector_alu_src mod1 = vector_alu_from_unsigned(ins->alu.src1); - mod1.swizzle = compose_swizzle(mod1.swizzle, uncomposed_mask, src1, dest); + mod1.swizzle = offset_swizzle(mod1.swizzle, src1.offset, src1.size, dest_offset, dest.size); ins->alu.src1 = vector_alu_srco_unsigned(mod1); ins->registers.src1_reg = src1.reg; @@ -736,8 +711,7 @@ install_registers_instr( } else { midgard_vector_alu_src mod2 = vector_alu_from_unsigned(ins->alu.src2); - mod2.swizzle = compose_swizzle( - mod2.swizzle, uncomposed_mask, src2, dest); + mod2.swizzle = offset_swizzle(mod2.swizzle, src2.offset, src2.size, dest_offset, dest.size); ins->alu.src2 = vector_alu_srco_unsigned(mod2); ins->registers.src2_reg = src2.reg; @@ -755,54 +729,38 @@ install_registers_instr( bool encodes_src = OP_IS_STORE(ins->load_store.op); if (encodes_src) { - struct phys_reg src = index_to_reg(ctx, g, ins->src[0]); + struct phys_reg src = index_to_reg(ctx, g, ins->src[0], mir_srcsize(ins, 0)); assert(src.reg == 26 || src.reg == 27); ins->load_store.reg = src.reg - 26; - - unsigned shift = __builtin_ctz(src.mask); - unsigned adjusted_mask = src.mask >> shift; - assert(((adjusted_mask + 1) & adjusted_mask) == 0); - - unsigned new_swizzle = 0; - for (unsigned q = 0; q < 4; ++q) { - unsigned c = (ins->load_store.swizzle >> (2*q)) & 3; - new_swizzle |= (c + shift) << (2*q); - } - - ins->load_store.swizzle = compose_swizzle( - new_swizzle, src.mask, - default_phys_reg(0), src); + ins->load_store.swizzle = offset_swizzle(ins->load_store.swizzle, src.offset, src.size, 0, 4); } else { - struct phys_reg src = index_to_reg(ctx, g, ins->dest); - - ins->load_store.reg = src.reg; - - ins->load_store.swizzle = compose_swizzle( - ins->load_store.swizzle, 0xF, - default_phys_reg(0), src); + struct phys_reg dst = index_to_reg(ctx, g, ins->dest, mir_typesize(ins)); - ins->mask = compose_writemask( - ins->mask, src); + ins->load_store.reg = dst.reg; + ins->load_store.swizzle = 
offset_swizzle(ins->load_store.swizzle, 0, 4, dst.offset, dst.size); + mir_set_bytemask(ins, mir_bytemask(ins) << dst.offset); } /* We also follow up by actual arguments */ - int src2 = - encodes_src ? ins->src[1] : ins->src[0]; + unsigned src2_idx = encodes_src ? 1 : 0; + unsigned src3_idx = encodes_src ? 2 : 1; - int src3 = - encodes_src ? ins->src[2] : ins->src[1]; + unsigned src2 = ins->src[src2_idx]; + unsigned src3 = ins->src[src3_idx]; - if (src2 >= 0) { - struct phys_reg src = index_to_reg(ctx, g, src2); - unsigned component = __builtin_ctz(src.mask); + if (src2 != ~0) { + struct phys_reg src = index_to_reg(ctx, g, src2, mir_srcsize(ins, src2_idx)); + unsigned component = src.offset / src.size; + assert(component * src.size == src.offset); ins->load_store.arg_1 |= midgard_ldst_reg(src.reg, component); } - if (src3 >= 0) { - struct phys_reg src = index_to_reg(ctx, g, src3); - unsigned component = __builtin_ctz(src.mask); + if (src3 != ~0) { + struct phys_reg src = index_to_reg(ctx, g, src3, mir_srcsize(ins, src3_idx)); + unsigned component = src.offset / src.size; + assert(component * src.size == src.offset); ins->load_store.arg_2 |= midgard_ldst_reg(src.reg, component); } @@ -811,9 +769,9 @@ install_registers_instr( case TAG_TEXTURE_4: { /* Grab RA results */ - struct phys_reg dest = index_to_reg(ctx, g, ins->dest); - struct phys_reg coord = index_to_reg(ctx, g, ins->src[0]); - struct phys_reg lod = index_to_reg(ctx, g, ins->src[1]); + struct phys_reg dest = index_to_reg(ctx, g, ins->dest, mir_typesize(ins)); + struct phys_reg coord = index_to_reg(ctx, g, ins->src[0], mir_srcsize(ins, 0)); + struct phys_reg lod = index_to_reg(ctx, g, ins->src[1], mir_srcsize(ins, 1)); assert(dest.reg == 28 || dest.reg == 29); assert(coord.reg == 28 || coord.reg == 29); @@ -823,23 +781,23 @@ install_registers_instr( ins->texture.in_reg_upper = 0; ins->texture.in_reg_select = coord.reg - 28; ins->texture.in_reg_swizzle = - pan_compose_swizzle(ins->texture.in_reg_swizzle, 
coord.swizzle); + offset_swizzle(ins->texture.in_reg_swizzle, coord.offset, coord.size, 0, 4); /* Next, install the destination */ ins->texture.out_full = 1; ins->texture.out_upper = 0; ins->texture.out_reg_select = dest.reg - 28; ins->texture.swizzle = - compose_swizzle(ins->texture.swizzle, dest.mask, dest, dest); - ins->mask = - compose_writemask(ins->mask, dest); + offset_swizzle(ins->texture.swizzle, 0, 4, dest.offset, dest.size); + mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); /* If there is a register LOD/bias, use it */ if (ins->src[1] != ~0) { + assert(!(lod.offset & 3)); midgard_tex_register_select sel = { .select = lod.reg, .full = 1, - .component = lod.swizzle & 3, + .component = lod.offset / 4 }; uint8_t packed; -- 2.30.2