X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fpanfrost%2Fmidgard%2Fmidgard_compile.c;h=a98e2b3280517f18c1baefd83171022647cd5bc3;hb=d8c16200e9730e4f4f56dc1478dc72dccce26203;hp=a47cc9bb791ca6a34adf102e72062a2357c6e781;hpb=449e5ded9340243b68183d7fffcc838cf283c89c;p=mesa.git diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index a47cc9bb791..a98e2b32805 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -117,6 +117,7 @@ schedule_barrier(compiler_context *ctx) if (store) { \ i.src[0] = ssa; \ i.src_types[0] = T; \ + i.dest_type = T; \ } else { \ i.dest = ssa; \ i.dest_type = T; \ @@ -127,44 +128,6 @@ schedule_barrier(compiler_context *ctx) #define M_LOAD(name, T) M_LOAD_STORE(name, false, T) #define M_STORE(name, T) M_LOAD_STORE(name, true, T) -/* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs - * the corresponding Midgard source */ - -static midgard_vector_alu_src -vector_alu_modifiers(bool abs, bool neg, bool is_int, - bool half, bool sext) -{ - /* Figure out how many components there are so we can adjust. - * Specifically we want to broadcast the last channel so things like - * ball2/3 work. - */ - - midgard_vector_alu_src alu_src = { - .rep_low = 0, - .rep_high = 0, - .half = half - }; - - if (is_int) { - alu_src.mod = midgard_int_normal; - - /* Sign/zero-extend if needed */ - - if (half) { - alu_src.mod = sext ? - midgard_int_sign_extend - : midgard_int_zero_extend; - } - - /* These should have been lowered away */ - assert(!(abs || neg)); - } else { - alu_src.mod = (abs << 0) | (neg << 1); - } - - return alu_src; -} - M_LOAD(ld_attr_32, nir_type_uint32); M_LOAD(ld_vary_32, nir_type_uint32); M_LOAD(ld_ubo_int4, nir_type_uint32); @@ -572,21 +535,54 @@ nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components) assert(src_bitsize == dst_bitsize); \ break; +#define ALU_CHECK_CMP(sext) \ + assert(src_bitsize == 16 || src_bitsize == 32); \ + assert(dst_bitsize == 16 || dst_bitsize == 32); \ + #define ALU_CASE_BCAST(nir, _op, count) \ case nir_op_##nir: \ op = midgard_alu_op_##_op; \ broadcast_swizzle = count; \ - assert(src_bitsize == dst_bitsize); \ + ALU_CHECK_CMP(true); \ break; -/* Analyze the sizes of the inputs to determine which reg mode. Ops needed - * special treatment override this anyway. */ + +#define ALU_CASE_CMP(nir, _op, sext) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + ALU_CHECK_CMP(sext); \ + break; + +/* Analyze the sizes of the dest and inputs to determine reg mode. */ static midgard_reg_mode reg_mode_for_nir(nir_alu_instr *instr) { unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); + unsigned dst_bitsize = nir_dest_bit_size(instr->dest.dest); + unsigned max_bitsize = MAX2(src_bitsize, dst_bitsize); + + /* We don't have fp16 LUTs, so we'll want to emit code like: + * + * vlut.fsinr hr0, hr0 + * + * where both input and output are 16-bit but the operation is carried + * out in 32-bit + */ - switch (src_bitsize) { + switch (instr->op) { + case nir_op_fsqrt: + case nir_op_frcp: + case nir_op_frsq: + case nir_op_fsin: + case nir_op_fcos: + case nir_op_fexp2: + case nir_op_flog2: + max_bitsize = MAX2(max_bitsize, 32); + default: + break; + } + + switch (max_bitsize) { case 8: return midgard_reg_mode_8; case 16: @@ -606,7 +602,7 @@ nir_accepts_inot(nir_op op, unsigned src) { switch (op) { case nir_op_ior: - case nir_op_iand: + case nir_op_iand: /* TODO: b2f16 */ case nir_op_ixor: return true; case nir_op_b32csel: @@ -617,6 +613,18 @@ nir_accepts_inot(nir_op op, unsigned src) } } +static bool +mir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op) +{ + if (pan_has_dest_mod(dest, op)) { + assert((*dest)->is_ssa); + BITSET_SET(ctx->already_emitted, (*dest)->ssa.index); + return true; + } + + return false; +} + static void mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, bool is_int, unsigned bcast_count) { @@ -645,6 +653,57 @@ mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigne } } +/* Midgard features both fcsel and icsel, depending on whether you want int or + * float modifiers. NIR's csel is typeless, so we want a heuristic to guess if + * we should emit an int or float csel depending on what modifiers could be + * placed. In the absense of modifiers, this is probably arbitrary. */ + +static bool +mir_is_bcsel_float(nir_alu_instr *instr) +{ + nir_op intmods[] = { + nir_op_i2i8, nir_op_i2i16, + nir_op_i2i32, nir_op_i2i64 + }; + + nir_op floatmods[] = { + nir_op_fabs, nir_op_fneg, + nir_op_f2f16, nir_op_f2f32, + nir_op_f2f64 + }; + + nir_op floatdestmods[] = { + nir_op_fsat, nir_op_fsat_signed, nir_op_fclamp_pos, + nir_op_f2f16, nir_op_f2f32 + }; + + signed score = 0; + + for (unsigned i = 1; i < 3; ++i) { + nir_alu_src s = instr->src[i]; + for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) { + if (pan_has_source_mod(&s, intmods[q])) + score--; + } + } + + for (unsigned i = 1; i < 3; ++i) { + nir_alu_src s = instr->src[i]; + for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) { + if (pan_has_source_mod(&s, floatmods[q])) + score++; + } + } + + for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) { + nir_dest *dest = &instr->dest.dest; + if (pan_has_dest_mod(&dest, floatdestmods[q])) + score++; + } + + return (score > 0); +} + static void emit_alu(compiler_context *ctx, nir_alu_instr *instr) { @@ -678,17 +737,6 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) midgard_reg_mode reg_mode = reg_mode_for_nir(instr); - /* Do we need a destination override? Used for inline - * type conversion */ - - midgard_dest_override dest_override = - midgard_dest_override_none; - - /* Should we use a smaller respective source and sign-extend? */ - - bool half_1 = false, sext_1 = false; - bool half_2 = false, sext_2 = false; - /* Should we swap arguments? */ bool flip_src12 = false; @@ -719,13 +767,13 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ALU_CASE(mov, imov); - ALU_CASE(feq32, feq); - ALU_CASE(fne32, fne); - ALU_CASE(flt32, flt); - ALU_CASE(ieq32, ieq); - ALU_CASE(ine32, ine); - ALU_CASE(ilt32, ilt); - ALU_CASE(ult32, ult); + ALU_CASE_CMP(feq32, feq, false); + ALU_CASE_CMP(fne32, fne, false); + ALU_CASE_CMP(flt32, flt, false); + ALU_CASE_CMP(ieq32, ieq, true); + ALU_CASE_CMP(ine32, ine, true); + ALU_CASE_CMP(ilt32, ilt, true); + ALU_CASE_CMP(ult32, ult, false); /* We don't have a native b2f32 instruction. Instead, like many * GPUs, we exploit booleans as 0/~0 for false/true, and @@ -738,14 +786,15 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) * At the end of emit_alu (as MIR), we'll fix-up the constant */ - ALU_CASE(b2f32, iand); - ALU_CASE(b2i32, iand); + ALU_CASE_CMP(b2f32, iand, true); + ALU_CASE_CMP(b2f16, iand, true); + ALU_CASE_CMP(b2i32, iand, true); /* Likewise, we don't have a dedicated f2b32 instruction, but * we can do a "not equal to 0.0" test. */ - ALU_CASE(f2b32, fne); - ALU_CASE(i2b32, ine); + ALU_CASE_CMP(f2b32, fne, false); + ALU_CASE_CMP(i2b32, ine, true); ALU_CASE(frcp, frcp); ALU_CASE(frsq, frsqrt); @@ -783,19 +832,19 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2); ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3); - ALU_CASE(b32all_fequal4, fball_eq); + ALU_CASE_CMP(b32all_fequal4, fball_eq, true); ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2); ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3); - ALU_CASE(b32any_fnequal4, fbany_neq); + ALU_CASE_CMP(b32any_fnequal4, fbany_neq, true); ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2); ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3); - ALU_CASE(b32all_iequal4, iball_eq); + ALU_CASE_CMP(b32all_iequal4, iball_eq, true); ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2); ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3); - ALU_CASE(b32any_inequal4, ibany_neq); + ALU_CASE_CMP(b32any_inequal4, ibany_neq, true); /* Source mods will be shoved in later */ ALU_CASE(fabs, fmov); @@ -815,11 +864,6 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) case nir_op_i2i16: case nir_op_i2i32: case nir_op_i2i64: - /* If we end up upscale, we'll need a sign-extend on the - * operand (the second argument) */ - - sext_2 = true; - /* fallthrough */ case nir_op_u2u8: case nir_op_u2u16: case nir_op_u2u32: @@ -833,17 +877,6 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) else op = midgard_alu_op_imov; - if (dst_bitsize == (src_bitsize * 2)) { - /* Converting up */ - half_2 = true; - - /* Use a greater register mode */ - reg_mode++; - } else if (src_bitsize == (dst_bitsize * 2)) { - /* Converting down */ - dest_override = midgard_dest_override_lower; - } - break; } @@ -862,21 +895,16 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) 0; flip_src12 = true; + ALU_CHECK_CMP(false); break; } case nir_op_b32csel: { - /* Midgard features both fcsel and icsel, depending on - * the type of the arguments/output. However, as long - * as we're careful we can _always_ use icsel and - * _never_ need fcsel, since the latter does additional - * floating-point-specific processing whereas the - * former just moves bits on the wire. It's not obvious - * why these are separate opcodes, save for the ability - * to do things like sat/pos/abs/neg for free */ - bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components); - op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel; + bool is_float = mir_is_bcsel_float(instr); + op = is_float ? + (mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) : + (mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel); break; } @@ -887,12 +915,17 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) return; } + /* Promote imov to fmov if it might help inline a constant */ + if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src) + && nir_src_bit_size(instr->src[0].src) == 32 + && nir_is_same_comp_swizzle(instr->src[0].swizzle, + nir_src_num_components(instr->src[0].src))) { + op = midgard_alu_op_fmov; + } + /* Midgard can perform certain modifiers on output of an ALU op */ unsigned outmod = 0; - - bool abs[4] = { false }; - bool neg[4] = { false }; bool is_int = midgard_is_integer_op(op); if (midgard_is_integer_out_op(op)) { @@ -909,6 +942,33 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) unsigned opcode_props = alu_opcode_props[op].props; bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; + /* Look for floating point mods. We have the mods fsat, fsat_signed, + * and fpos. We also have the relations (note 3 * 2 = 6 cases): + * + * fsat_signed(fpos(x)) = fsat(x) + * fsat_signed(fsat(x)) = fsat(x) + * fpos(fsat_signed(x)) = fsat(x) + * fpos(fsat(x)) = fsat(x) + * fsat(fsat_signed(x)) = fsat(x) + * fsat(fpos(x)) = fsat(x) + * + * So by cases any composition of output modifiers is equivalent to + * fsat alone. + */ + + if (!is_int && !(opcode_props & OP_TYPE_CONVERT)) { + bool fpos = mir_accept_dest_mod(ctx, &dest, nir_op_fclamp_pos); + bool fsat = mir_accept_dest_mod(ctx, &dest, nir_op_fsat); + bool ssat = mir_accept_dest_mod(ctx, &dest, nir_op_fsat_signed); + bool prior = (outmod != midgard_outmod_none); + int count = (int) prior + (int) fpos + (int) ssat + (int) fsat; + + outmod = ((count > 1) || fsat) ? midgard_outmod_sat : + fpos ? midgard_outmod_pos : + ssat ? midgard_outmod_sat_signed : + outmod; + } + midgard_instruction ins = { .type = TAG_ALU_4, .dest = nir_dest_index(dest), @@ -921,7 +981,7 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) if (quirk_flipped_r24) { ins.src[0] = ~0; - mir_copy_src(&ins, instr, 0, 1, &abs[1], &neg[1], &ins.src_invert[1], is_int, broadcast_swizzle); + mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], &ins.src_invert[1], is_int, broadcast_swizzle); } else { for (unsigned i = 0; i < nr_inputs; ++i) { unsigned to = i; @@ -942,7 +1002,7 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) to = 1 - to; } - mir_copy_src(&ins, instr, i, to, &abs[to], &neg[to], &ins.src_invert[to], is_int, broadcast_swizzle); + mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], &ins.src_invert[to], is_int, broadcast_swizzle); /* (!c) ? a : b = c ? b : a */ if (instr->op == nir_op_b32csel && ins.src_invert[2]) { @@ -955,10 +1015,10 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) { /* Lowered to move */ if (instr->op == nir_op_fneg) - neg[1] = !neg[1]; + ins.src_neg[1] ^= true; if (instr->op == nir_op_fabs) - abs[1] = true; + ins.src_abs[1] = true; } ins.mask = mask_of(nr_components); @@ -966,11 +1026,7 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) midgard_vector_alu alu = { .op = op, .reg_mode = reg_mode, - .dest_override = dest_override, .outmod = outmod, - - .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(abs[0], neg[0], is_int, half_1, sext_1)), - .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(abs[1], neg[1], is_int, half_2, sext_2)), }; /* Apply writemask if non-SSA, keeping in mind that we can't write to @@ -982,13 +1038,6 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ins.alu = alu; - /* Arrange for creation of iandnot/iornot */ - if (ins.src_invert[0] && !ins.src_invert[1]) { - mir_flip(&ins); - ins.src_invert[0] = false; - ins.src_invert[1] = true; - } - /* Late fixup for emulated instructions */ if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) { @@ -1007,6 +1056,14 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) else ins.constants.i32[0] = 1; + for (unsigned c = 0; c < 16; ++c) + ins.swizzle[1][c] = 0; + } else if (instr->op == nir_op_b2f16) { + ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.src_types[1] = nir_type_float16; + ins.has_constants = true; + ins.constants.i16[0] = _mesa_float_to_half(1.0); + for (unsigned c = 0; c < 16; ++c) ins.swizzle[1][c] = 0; } else if (nr_inputs == 1 && !quirk_flipped_r24) { @@ -1021,6 +1078,13 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ins.swizzle[1][c] = 0; } + /* Arrange for creation of iandnot/iornot */ + if (ins.src_invert[0] && !ins.src_invert[1]) { + mir_flip(&ins); + ins.src_invert[0] = false; + ins.src_invert[1] = true; + } + if ((opcode_props & UNITS_ALL) == UNIT_VLUT) { /* To avoid duplicating the lookup tables (probably), true LUT * instructions can only operate as if they were scalars. Lower @@ -1071,9 +1135,7 @@ mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read) /* Once we have the NIR mask, we need to normalize to work in 32-bit space */ unsigned bytemask = pan_to_bytemask(dsize, nir_mask); mir_set_bytemask(ins, bytemask); - - if (dsize == 64) - ins->load_64 = true; + ins->dest_type = nir_type_uint | dsize; } /* Uniforms and UBOs use a shared code path, as uniforms are just (slightly @@ -1326,6 +1388,7 @@ emit_control_barrier(compiler_context *ctx) { midgard_instruction ins = { .type = TAG_TEXTURE_4, + .dest = ~0, .src = { ~0, ~0, ~0, ~0 }, .texture = { .op = TEXTURE_OP_BARRIER, @@ -1350,6 +1413,19 @@ search_var(struct exec_list *vars, unsigned driver_loc) return NULL; } +static unsigned +mir_get_branch_cond(nir_src *src, bool *invert) +{ + /* Wrap it. No swizzle since it's a scalar */ + + nir_alu_src alu = { + .src = *src + }; + + *invert = pan_has_source_mod(&alu, nir_op_inot); + return nir_src_index(NULL, &alu.src); +} + static void emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) { @@ -1363,7 +1439,8 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) discard.branch.target_type = TARGET_DISCARD; if (conditional) { - discard.src[0] = nir_src_index(ctx, &instr->src[0]); + discard.src[0] = mir_get_branch_cond(&instr->src[0], + &discard.branch.invert_conditional); discard.src_types[0] = nir_type_uint32; } @@ -1473,8 +1550,10 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) ld.load_store.arg_2 = 0x1E; } - for (unsigned c = 2; c < 16; ++c) + for (unsigned c = 4; c < 16; ++c) ld.swizzle[0][c] = 0; + + ld.dest_type = nir_type_float16; } emit_mir_instruction(ctx, ld); @@ -1693,7 +1772,7 @@ midgard_tex_format(enum glsl_sampler_dim dim) } } -/* Tries to attach an explicit LOD / bias as a constant. Returns whether this +/* Tries to attach an explicit LOD or bias as a constant. Returns whether this * was successful */ static bool @@ -2077,13 +2156,6 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) } if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { - /* Extract the source information */ - - midgard_vector_alu_src *src; - int q = ins->alu.src2; - midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; - src = m; - /* Component is from the swizzle. Take a nonzero component */ assert(ins->mask); unsigned first_comp = ffs(ins->mask) - 1; @@ -2117,12 +2189,9 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) continue; } - /* We don't know how to handle these with a constant */ - - if (mir_nontrivial_source2_mod_simple(ins) || src->rep_low || src->rep_high) { - DBG("Bailing inline constant...\n"); + /* Should've been const folded */ + if (ins->src_abs[1] || ins->src_neg[1]) continue; - } /* Make sure that the constant is not itself a vector * by checking if all accessed values are the same. */ @@ -2230,10 +2299,12 @@ emit_if(struct compiler_context *ctx, nir_if *nif) midgard_block *before_block = ctx->current_block; /* Speculatively emit the branch, but we can't fill it in until later */ + bool inv = false; EMIT(branch, true, true); midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); - then_branch->src[0] = nir_src_index(ctx, &nif->condition); + then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv); then_branch->src_types[0] = nir_type_uint32; + then_branch->branch.invert_conditional = !inv; /* Emit the two subblocks. */ midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); @@ -2513,7 +2584,6 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b mir_foreach_block(ctx, _block) { midgard_block *block = (midgard_block *) _block; inline_alu_constants(ctx, block); - midgard_opt_promote_fmov(ctx, block); embedded_to_inline_constant(ctx, block); } /* MIR-level optimizations */ @@ -2522,43 +2592,22 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b do { progress = false; + progress |= midgard_opt_dead_code_eliminate(ctx); mir_foreach_block(ctx, _block) { midgard_block *block = (midgard_block *) _block; progress |= midgard_opt_copy_prop(ctx, block); - progress |= midgard_opt_dead_code_eliminate(ctx, block); progress |= midgard_opt_combine_projection(ctx, block); progress |= midgard_opt_varying_projection(ctx, block); -#if 0 - progress |= midgard_opt_not_propagate(ctx, block); - progress |= midgard_opt_fuse_src_invert(ctx, block); - progress |= midgard_opt_fuse_dest_invert(ctx, block); - progress |= midgard_opt_csel_invert(ctx, block); - progress |= midgard_opt_drop_cmp_invert(ctx, block); - progress |= midgard_opt_invert_branch(ctx, block); -#endif } } while (progress); mir_foreach_block(ctx, _block) { midgard_block *block = (midgard_block *) _block; - //midgard_lower_invert(ctx, block); midgard_lower_derivatives(ctx, block); - } - - /* Nested control-flow can result in dead branches at the end of the - * block. This messes with our analysis and is just dead code, so cull - * them */ - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; midgard_cull_dead_branch(ctx, block); } - /* Ensure we were lowered */ - mir_foreach_instr_global(ctx, ins) { - assert(!ins->invert); - } - if (ctx->stage == MESA_SHADER_FRAGMENT) mir_add_writeout_loops(ctx);