From f8b18a4277cc96b2048c9b74fb0d39e2112bb4c1 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Mon, 1 Jul 2019 18:51:48 -0700
Subject: [PATCH] panfrost/midgard: Hoist mask field

Share a single mask field in midgard_instruction with a unified format,
rather than using separate masks for each instruction tag with
hardware-specific formats. This eliminates quite a bit of duplicated code
and will enable vec8/vec16 masks as well (which don't map as cleanly to
the hardware as we might like).

Signed-off-by: Alyssa Rosenzweig
---
 .../drivers/panfrost/midgard/compiler.h       |  7 ++-
 .../drivers/panfrost/midgard/helpers.h        | 21 ++-------
 .../panfrost/midgard/midgard_compile.c        | 43 +++++++++----------
 .../drivers/panfrost/midgard/midgard_emit.c   | 27 +++++++++---
 .../drivers/panfrost/midgard/midgard_ops.h    |  9 ++--
 .../drivers/panfrost/midgard/midgard_ra.c     | 25 ++++-------
 .../panfrost/midgard/midgard_schedule.c       | 18 +++-----
 7 files changed, 70 insertions(+), 80 deletions(-)

diff --git a/src/gallium/drivers/panfrost/midgard/compiler.h b/src/gallium/drivers/panfrost/midgard/compiler.h
index 4c2202711b1..5fe1d80692c 100644
--- a/src/gallium/drivers/panfrost/midgard/compiler.h
+++ b/src/gallium/drivers/panfrost/midgard/compiler.h
@@ -120,6 +120,11 @@ typedef struct midgard_instruction {
         bool writeout;
         bool prepacked_branch;
 
+        /* Masks in a saneish format. One bit per channel, not packed fancy.
+         * Use this instead of the op specific ones, and switch over at emit
+         * time */
+        uint16_t mask;
+
         union {
                 midgard_load_store_word load_store;
                 midgard_vector_alu alu;
@@ -398,6 +403,7 @@ v_mov(unsigned src, midgard_vector_alu_src mod, unsigned dest)
 {
         midgard_instruction ins = {
                 .type = TAG_ALU_4,
+                .mask = 0xF,
                 .ssa_args = {
                         .src0 = SSA_UNUSED_1,
                         .src1 = src,
@@ -408,7 +414,6 @@ v_mov(unsigned src, midgard_vector_alu_src mod, unsigned dest)
                         .reg_mode = midgard_reg_mode_32,
                         .dest_override = midgard_dest_override_none,
                         .outmod = midgard_outmod_int_wrap,
-                        .mask = 0xFF,
                         .src1 = vector_alu_srco_unsigned(zero_alu_src),
                         .src2 = vector_alu_srco_unsigned(mod)
                 },
diff --git a/src/gallium/drivers/panfrost/midgard/helpers.h b/src/gallium/drivers/panfrost/midgard/helpers.h
index 25def4c85b3..4a395a4c8cd 100644
--- a/src/gallium/drivers/panfrost/midgard/helpers.h
+++ b/src/gallium/drivers/panfrost/midgard/helpers.h
@@ -217,13 +217,11 @@ struct mir_op_props {
 /* This file is common, so don't define the tables themselves. #include
  * midgard_op.h if you need that, or edit midgard_ops.c directly */
 
-/* Duplicate bits to convert standard 4-bit writemask to duplicated 8-bit
- * format (or do the inverse). The 8-bit format only really matters for
- * int8, as far as I know, where performance can be improved by using a
- * vec8 output */
+/* Duplicate bits to convert a 4-bit writemask to duplicated 8-bit format,
+ * which is used for 32-bit vector units */
 
 static inline unsigned
-expand_writemask(unsigned mask)
+expand_writemask_32(unsigned mask)
 {
         unsigned o = 0;
 
@@ -234,19 +232,6 @@
         return o;
 }
 
-static inline unsigned
-squeeze_writemask(unsigned mask)
-{
-        unsigned o = 0;
-
-        for (int i = 0; i < 4; ++i)
-                if (mask & (3 << (2 * i)))
-                        o |= (1 << i);
-
-        return o;
-
-}
-
 /* Coerce structs to integer */
 
 static inline unsigned
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.c b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
index 21197efa499..de40eeafdd5 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard_compile.c
+++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
@@ -88,6 +88,7 @@ midgard_block_add_successor(midgard_block *block, midgard_block *successor)
         static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
                 midgard_instruction i = { \
                         .type = TAG_LOAD_STORE_4, \
+                        .mask = 0xF, \
                         .ssa_args = { \
                                 .rname = ssa, \
                                 .uname = -1, \
@@ -95,7 +96,6 @@ midgard_block_add_successor(midgard_block *block, midgard_block *successor)
                 }, \
                 .load_store = { \
                         .op = midgard_op_##name, \
-                        .mask = 0xF, \
                         .swizzle = SWIZZLE_XYZW, \
                         .address = address \
                 } \
@@ -596,6 +596,7 @@ emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned co
                 /* We need to set the conditional as close as possible */
                 .precede_break = true,
                 .unit = for_branch ? UNIT_SMUL : UNIT_SADD,
+                .mask = 1 << COMPONENT_W,
 
                 .ssa_args = {
                         .src0 = condition,
@@ -608,7 +609,6 @@ emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned co
                         .outmod = midgard_outmod_int_wrap,
                         .reg_mode = midgard_reg_mode_32,
                         .dest_override = midgard_dest_override_none,
-                        .mask = (0x3 << 6), /* w */
                         .src1 = vector_alu_srco_unsigned(alu_src),
                         .src2 = vector_alu_srco_unsigned(alu_src)
                 },
@@ -637,6 +637,7 @@ emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp)
         midgard_instruction ins = {
                 .type = TAG_ALU_4,
                 .precede_break = true,
+                .mask = mask_of(nr_comp),
                 .ssa_args = {
                         .src0 = condition,
                         .src1 = condition,
@@ -647,7 +648,6 @@ emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp)
                         .outmod = midgard_outmod_int_wrap,
                         .reg_mode = midgard_reg_mode_32,
                         .dest_override = midgard_dest_override_none,
-                        .mask = expand_writemask(mask_of(nr_comp)),
                         .src1 = vector_alu_srco_unsigned(alu_src),
                         .src2 = vector_alu_srco_unsigned(alu_src)
                 },
@@ -668,6 +668,7 @@ emit_indirect_offset(compiler_context *ctx, nir_src *src)
 
         midgard_instruction ins = {
                 .type = TAG_ALU_4,
+                .mask = 1 << COMPONENT_W,
                 .ssa_args = {
                         .src0 = SSA_UNUSED_1,
                         .src1 = offset,
@@ -678,7 +679,6 @@ emit_indirect_offset(compiler_context *ctx, nir_src *src)
                         .outmod = midgard_outmod_int_wrap,
                         .reg_mode = midgard_reg_mode_32,
                         .dest_override = midgard_dest_override_none,
-                        .mask = (0x3 << 6), /* w */
                         .src1 = vector_alu_srco_unsigned(zero_alu_src),
                         .src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx)
                 },
@@ -1062,15 +1062,14 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
 
         bool is_int = midgard_is_integer_op(op);
 
+        ins.mask = mask_of(nr_components);
+
         midgard_vector_alu alu = {
                 .op = op,
                 .reg_mode = reg_mode,
                 .dest_override = dest_override,
                 .outmod = outmod,
 
-                /* Writemask only valid for non-SSA NIR */
-                .mask = expand_writemask(mask_of(nr_components)),
-
                 .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, sext_1)),
                 .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1], is_int, broadcast_swizzle, half_2, sext_2)),
         };
@@ -1078,7 +1077,7 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
         /* Apply writemask if non-SSA, keeping in mind that we can't write to
          * components that don't exist */
 
         if (!is_ssa)
-                alu.mask &= expand_writemask(instr->dest.write_mask);
+                ins.mask &= instr->dest.write_mask;
 
         ins.alu = alu;
@@ -1123,15 +1122,16 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
 
                 uint8_t original_swizzle[4];
                 memcpy(original_swizzle, nirmods[0]->swizzle, sizeof(nirmods[0]->swizzle));
+                unsigned orig_mask = ins.mask;
 
                 for (int i = 0; i < nr_components; ++i) {
                         /* Mask the associated component, dropping the
                          * instruction if needed */
 
-                        ins.alu.mask = (0x3) << (2 * i);
-                        ins.alu.mask &= alu.mask;
+                        ins.mask = 1 << i;
+                        ins.mask &= orig_mask;
 
-                        if (!ins.alu.mask)
+                        if (!ins.mask)
                                 continue;
 
                         for (int j = 0; j < 4; ++j)
@@ -1203,7 +1203,7 @@ emit_varying_read(
         /* TODO: swizzle, mask */
 
         midgard_instruction ins = m_ld_vary_32(dest, offset);
-        ins.load_store.mask = mask_of(nr_comp);
+        ins.mask = mask_of(nr_comp);
         ins.load_store.swizzle = SWIZZLE_XYZW >> (2 * component);
 
         midgard_varying_parameter p = {
@@ -1342,7 +1342,7 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
         } else if (ctx->stage == MESA_SHADER_VERTEX) {
                 midgard_instruction ins = m_ld_attr_32(reg, offset);
                 ins.load_store.unknown = 0x1E1E; /* XXX: What is this? */
-                ins.load_store.mask = mask_of(nr_comp);
+                ins.mask = mask_of(nr_comp);
 
                 /* Use the type appropriate load */
                 switch (t) {
@@ -1574,6 +1574,7 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
         /* No helper to build texture words -- we do it all here */
         midgard_instruction ins = {
                 .type = TAG_TEXTURE_4,
+                .mask = 0xF,
                 .texture = {
                         .op = midgard_texop,
                         .format = midgard_tex_format(instr->sampler_dim),
@@ -1582,7 +1583,6 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
 
                         /* TODO: Regalloc it in */
                         .swizzle = SWIZZLE_XYZW,
-                        .mask = 0xF,
 
                         /* TODO: half */
                         .in_reg_full = 1,
@@ -1616,7 +1616,7 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
 
                         midgard_instruction st = m_st_cubemap_coords(reg, 0);
                         st.load_store.unknown = 0x24; /* XXX: What is this? */
-                        st.load_store.mask = 0x3; /* xy */
+                        st.mask = 0x3; /* xy */
                         st.load_store.swizzle = alu_src.swizzle;
 
                         emit_mir_instruction(ctx, st);
@@ -1625,7 +1625,7 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
                         ins.texture.in_reg_swizzle = alu_src.swizzle = swizzle_of(nr_comp);
 
                         midgard_instruction mov = v_mov(index, alu_src, reg);
-                        mov.alu.mask = expand_writemask(mask_of(nr_comp));
+                        mov.mask = mask_of(nr_comp);
                         emit_mir_instruction(ctx, mov);
 
                         if (midgard_texop == TEXTURE_OP_TEXEL_FETCH) {
@@ -1642,7 +1642,7 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
                                 zero.ssa_args.inline_constant = true;
                                 zero.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
                                 zero.has_constants = true;
-                                zero.alu.mask = ~mov.alu.mask;
+                                zero.mask = ~mov.mask;
                                 emit_mir_instruction(ctx, zero);
 
                                 ins.texture.in_reg_swizzle = SWIZZLE_XYZZ;
@@ -1673,7 +1673,7 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
                         alu_src.swizzle = SWIZZLE_XXXX;
 
                         midgard_instruction mov = v_mov(index, alu_src, reg);
-                        mov.alu.mask = expand_writemask(1 << COMPONENT_W);
+                        mov.mask = 1 << COMPONENT_W;
                         emit_mir_instruction(ctx, mov);
 
                         ins.texture.lod_register = true;
@@ -1969,7 +1969,7 @@ embedded_to_inline_constant(compiler_context *ctx)
                         uint32_t value = cons[component];
 
                         bool is_vector = false;
-                        unsigned mask = effective_writemask(&ins->alu);
+                        unsigned mask = effective_writemask(&ins->alu, ins->mask);
 
                         for (int c = 1; c < 4; ++c) {
                                 /* We only care if this component is actually used */
@@ -2097,13 +2097,12 @@ mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
 static bool
 mir_nontrivial_source2_mod(midgard_instruction *ins)
 {
-        unsigned mask = squeeze_writemask(ins->alu.mask);
         bool is_int = midgard_is_integer_op(ins->alu.op);
 
         midgard_vector_alu_src src2 =
                 vector_alu_from_unsigned(ins->alu.src2);
 
-        return mir_nontrivial_mod(src2, is_int, mask);
+        return mir_nontrivial_mod(src2, is_int, ins->mask);
 }
 
 static bool
@@ -2250,7 +2249,7 @@ midgard_opt_copy_prop_tex(compiler_context *ctx, midgard_block *block)
 
                         if (v->ssa_args.dest == from) {
                                 /* We don't want to track partial writes ... */
-                                if (v->alu.mask == 0xF) {
+                                if (v->mask == 0xF) {
                                         v->ssa_args.dest = to;
                                         eliminated = true;
                                 }
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_emit.c b/src/gallium/drivers/panfrost/midgard/midgard_emit.c
index 701ef1074ff..2a71d1c0da1 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard_emit.c
+++ b/src/gallium/drivers/panfrost/midgard/midgard_emit.c
@@ -29,15 +29,11 @@
  * this, we just demote vector ALU payloads to scalar. */
 
 static int
-component_from_mask(unsigned mask, bool full)
+component_from_mask(unsigned mask)
 {
         for (int c = 0; c < 8; ++c) {
                 if (mask & (1 << c))
                         return c;
-
-                /* Full uses every other bit */
-                if (full)
-                        c++;
         }
 
         assert(0);
@@ -102,9 +98,15 @@ vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins)
                 .unknown = 0,
                 .outmod = v.outmod,
                 .output_full = is_full,
-                .output_component = component_from_mask(v.mask, is_full),
+                .output_component = component_from_mask(ins->mask),
         };
 
+        /* Full components are physically spaced out */
+        if (is_full) {
+                assert(s.output_component < 4);
+                s.output_component <<= 1;
+        }
+
         /* Inline constant is passed along rather than trying to extract it
          * from v */
 
@@ -156,6 +158,11 @@ emit_alu_bundle(compiler_context *ctx,
                 midgard_scalar_alu scalarized;
 
                 if (ins->unit & UNITS_ANY_VECTOR) {
+                        if (ins->alu.reg_mode == midgard_reg_mode_32)
+                                ins->alu.mask = expand_writemask_32(ins->mask);
+                        else
+                                ins->alu.mask = ins->mask;
+
                         size = sizeof(midgard_vector_alu);
                         source = &ins->alu;
                 } else if (ins->unit == ALU_ENAB_BR_COMPACT) {
@@ -209,6 +216,13 @@ emit_binary_bundle(compiler_context *ctx,
 
                 uint64_t current64, next64 = LDST_NOP;
 
+                /* Copy masks */
+
+                for (unsigned i = 0; i < bundle->instruction_count; ++i) {
+                        bundle->instructions[i]->load_store.mask =
+                                bundle->instructions[i]->mask;
+                }
+
                 memcpy(&current64, &bundle->instructions[0]->load_store, sizeof(current64));
 
                 if (bundle->instruction_count == 2)
@@ -236,6 +250,7 @@ emit_binary_bundle(compiler_context *ctx,
 
                 ins->texture.type = bundle->tag;
                 ins->texture.next_type = next_tag;
+                ins->texture.mask = ins->mask;
 
                 ctx->texture_op_count--;
 
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_ops.h b/src/gallium/drivers/panfrost/midgard/midgard_ops.h
index 78b999a8dc6..64c91a5bcac 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard_ops.h
+++ b/src/gallium/drivers/panfrost/midgard/midgard_ops.h
@@ -53,8 +53,9 @@ midgard_is_integer_out_op(int op)
 }
 
 /* Determines effective writemask, taking quirks and expansion into account */
+
 static inline unsigned
-effective_writemask(midgard_vector_alu *alu)
+effective_writemask(midgard_vector_alu *alu, unsigned existing_mask)
 {
         /* Channel count is off-by-one to fit in two-bits (0 channel makes no
          * sense) */
@@ -66,9 +67,7 @@ effective_writemask(midgard_vector_alu *alu)
         if (channel_count)
                 return (1 << channel_count) - 1;
 
-        /* Otherwise, just squeeze the existing mask */
-        return squeeze_writemask(alu->mask);
-}
-
+        return existing_mask;
+}
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_ra.c b/src/gallium/drivers/panfrost/midgard/midgard_ra.c
index 82760aa42af..9dae43dfff0 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard_ra.c
+++ b/src/gallium/drivers/panfrost/midgard/midgard_ra.c
@@ -293,16 +293,7 @@ allocate_registers(compiler_context *ctx)
                 if (ins->ssa_args.dest < 0) continue;
                 if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
 
-                /* Default to vec4 if we're not sure */
-
-                int mask = 0xF;
-
-                if (ins->type == TAG_ALU_4)
-                        mask = squeeze_writemask(ins->alu.mask);
-                else if (ins->type == TAG_LOAD_STORE_4)
-                        mask = ins->load_store.mask;
-
-                int class = util_logbase2(mask) + 1;
+                int class = util_logbase2(ins->mask) + 1;
 
                 /* Use the largest class if there's ambiguity, this
                  * handles partial writes */
@@ -430,16 +421,16 @@ install_registers_instr(
                 struct phys_reg src2 = index_to_reg(ctx, g, adjusted_src);
                 struct phys_reg dest = index_to_reg(ctx, g, args.dest);
 
-                unsigned mask = squeeze_writemask(ins->alu.mask);
-                ins->alu.mask = expand_writemask(compose_writemask(mask, dest));
+                unsigned uncomposed_mask = ins->mask;
+                ins->mask = compose_writemask(uncomposed_mask, dest);
 
                 /* Adjust the dest mask if necessary. Mostly this is a no-op
                  * but it matters for dot products */
-                dest.mask = effective_writemask(&ins->alu);
+                dest.mask = effective_writemask(&ins->alu, ins->mask);
 
                 midgard_vector_alu_src mod1 =
                         vector_alu_from_unsigned(ins->alu.src1);
-                mod1.swizzle = compose_swizzle(mod1.swizzle, mask, src1, dest);
+                mod1.swizzle = compose_swizzle(mod1.swizzle, uncomposed_mask, src1, dest);
                 ins->alu.src1 = vector_alu_srco_unsigned(mod1);
 
                 ins->registers.src1_reg = src1.reg;
@@ -461,7 +452,7 @@ install_registers_instr(
                 midgard_vector_alu_src mod2 =
                         vector_alu_from_unsigned(ins->alu.src2);
                 mod2.swizzle = compose_swizzle(
-                                       mod2.swizzle, mask, src2, dest);
+                                       mod2.swizzle, uncomposed_mask, src2, dest);
                 ins->alu.src2 = vector_alu_srco_unsigned(mod2);
 
                 ins->registers.src2_reg = src2.reg;
@@ -490,8 +481,8 @@ install_registers_instr(
                                 ins->load_store.swizzle, 0xF,
                                 default_phys_reg(0), src);
 
-                        ins->load_store.mask = compose_writemask(
-                                ins->load_store.mask, src);
+                        ins->mask = compose_writemask(
+                                ins->mask, src);
                 }
 
                 break;
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_schedule.c b/src/gallium/drivers/panfrost/midgard/midgard_schedule.c
index 77738731b8a..caa29b7a2e4 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard_schedule.c
+++ b/src/gallium/drivers/panfrost/midgard/midgard_schedule.c
@@ -44,17 +44,13 @@ swizzle_to_access_mask(unsigned swizzle)
 /* Does the mask cover more than a scalar? */
 
 static bool
-is_single_component_mask(unsigned mask, bool full)
+is_single_component_mask(unsigned mask)
 {
         int components = 0;
 
         for (int c = 0; c < 8; ++c) {
                 if (mask & (1 << c))
                         components++;
-
-                /* Full uses 2-bit components */
-                if (full)
-                        c++;
         }
 
         return components == 1;
@@ -72,7 +68,7 @@ can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
         /* Figure out where exactly we wrote to */
         int source = first->ssa_args.dest;
 
-        int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF;
+        int source_mask = first->mask;
 
         /* As long as the second doesn't read from the first, we're okay */
         if (second->ssa_args.src0 == source) {
@@ -98,9 +94,8 @@ can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
         if (second->ssa_args.dest == source) {
                 /* ...but only if the components overlap */
 
-                int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF;
 
-                if (dest_mask & source_mask)
+                if (second->mask & source_mask)
                         return false;
         }
 
@@ -198,13 +193,14 @@ schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction
 
                         bool vectorable = units & UNITS_ANY_VECTOR;
                         bool scalarable = units & UNITS_SCALAR;
-                        bool full = ains->alu.reg_mode == midgard_reg_mode_32;
-                        bool could_scalar = is_single_component_mask(ains->alu.mask, full);
-                        bool vector = vectorable && !(could_scalar && scalarable);
+                        bool could_scalar = is_single_component_mask(ains->mask);
 
                         /* Only 16/32-bit can run on a scalar unit */
                         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
                         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
+                        could_scalar &= ains->alu.dest_override == midgard_dest_override_none;
+
+                        bool vector = vectorable && !(could_scalar && scalarable);
 
                         /* TODO: Check ahead-of-time for other scalar
                          * hazards that otherwise get aborted out */
-- 
2.30.2
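
For reference, the sketch below is not part of the patch; it is a minimal,
standalone illustration of the convention the patch introduces. The compiler
now carries one bit per channel in midgard_instruction::mask and only
converts to the hardware layout at emit time. The two helpers mirror
expand_writemask_32 (helpers.h) and component_from_mask (midgard_emit.c)
from the patch; main() and the sample values are invented for the example.

    #include <assert.h>
    #include <stdio.h>

    /* One bit per channel in, duplicated 2-bit pairs out -- the layout the
     * 32-bit vector units expect (e.g. xy = 0x3 becomes 0xF). */
    static unsigned
    expand_writemask_32(unsigned mask)
    {
            unsigned o = 0;

            for (int i = 0; i < 4; ++i)
                    if (mask & (1 << i))
                            o |= (3 << (2 * i));

            return o;
    }

    /* First set channel of a unified mask; vector_to_scalar_alu() uses this
     * to pick the output component when demoting to a scalar unit. */
    static int
    component_from_mask(unsigned mask)
    {
            for (int c = 0; c < 8; ++c) {
                    if (mask & (1 << c))
                            return c;
            }

            assert(0);
            return 0;
    }

    int
    main(void)
    {
            unsigned xy = 0x3;   /* unified mask: x and y written */
            unsigned w = 1 << 3; /* unified mask: w only */

            printf("xy expands to 0x%X\n", expand_writemask_32(xy));          /* 0xF */
            printf("w scalarizes to component %d\n", component_from_mask(w)); /* 3 */

            return 0;
    }

As in emit_alu_bundle(), the expansion applies only when reg_mode is
midgard_reg_mode_32; other register modes pass the unified mask through to
the hardware field unchanged, which is what clears the way for the vec8/vec16
masks mentioned in the commit message.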