X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fpanfrost%2Fmidgard%2Fmidgard_compile.c;h=a98e2b3280517f18c1baefd83171022647cd5bc3;hb=d8c16200e9730e4f4f56dc1478dc72dccce26203;hp=bf2dadf16b3db828d6a8fa243e0fa7ed807fd080;hpb=d4600c43402b4473a7f0bf741be65ad8835d7f83;p=mesa.git diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index bf2dadf16b3..a98e2b32805 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -100,7 +100,7 @@ schedule_barrier(compiler_context *ctx) #define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__)); -#define M_LOAD_STORE(name, store) \ +#define M_LOAD_STORE(name, store, T) \ static midgard_instruction m_##name(unsigned ssa, unsigned address) { \ midgard_instruction i = { \ .type = TAG_LOAD_STORE_4, \ @@ -114,82 +114,29 @@ schedule_barrier(compiler_context *ctx) } \ }; \ \ - if (store) \ + if (store) { \ i.src[0] = ssa; \ - else \ + i.src_types[0] = T; \ + i.dest_type = T; \ + } else { \ i.dest = ssa; \ - \ + i.dest_type = T; \ + } \ return i; \ } -#define M_LOAD(name) M_LOAD_STORE(name, false) -#define M_STORE(name) M_LOAD_STORE(name, true) +#define M_LOAD(name, T) M_LOAD_STORE(name, false, T) +#define M_STORE(name, T) M_LOAD_STORE(name, true, T) -/* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs - * the corresponding Midgard source */ - -static midgard_vector_alu_src -vector_alu_modifiers(nir_alu_src *src, bool is_int, unsigned broadcast_count, - bool half, bool sext) -{ - /* Figure out how many components there are so we can adjust. - * Specifically we want to broadcast the last channel so things like - * ball2/3 work. - */ - - if (broadcast_count && src) { - uint8_t last_component = src->swizzle[broadcast_count - 1]; - - for (unsigned c = broadcast_count; c < NIR_MAX_VEC_COMPONENTS; ++c) { - src->swizzle[c] = last_component; - } - } - - midgard_vector_alu_src alu_src = { - .rep_low = 0, - .rep_high = 0, - .half = half - }; - - if (is_int) { - alu_src.mod = midgard_int_normal; - - /* Sign/zero-extend if needed */ - - if (half) { - alu_src.mod = sext ? - midgard_int_sign_extend - : midgard_int_zero_extend; - } - - /* These should have been lowered away */ - if (src) - assert(!(src->abs || src->negate)); - } else { - if (src) - alu_src.mod = (src->abs << 0) | (src->negate << 1); - } - - return alu_src; -} - -/* load/store instructions have both 32-bit and 16-bit variants, depending on - * whether we are using vectors composed of highp or mediump. At the moment, we - * don't support half-floats -- this requires changes in other parts of the - * compiler -- therefore the 16-bit versions are commented out. 
*/ - -//M_LOAD(ld_attr_16); -M_LOAD(ld_attr_32); -//M_LOAD(ld_vary_16); -M_LOAD(ld_vary_32); -M_LOAD(ld_ubo_int4); -M_LOAD(ld_int4); -M_STORE(st_int4); -M_LOAD(ld_color_buffer_32u); -//M_STORE(st_vary_16); -M_STORE(st_vary_32); -M_LOAD(ld_cubemap_coords); -M_LOAD(ld_compute_id); +M_LOAD(ld_attr_32, nir_type_uint32); +M_LOAD(ld_vary_32, nir_type_uint32); +M_LOAD(ld_ubo_int4, nir_type_uint32); +M_LOAD(ld_int4, nir_type_uint32); +M_STORE(st_int4, nir_type_uint32); +M_LOAD(ld_color_buffer_32u, nir_type_uint32); +M_STORE(st_vary_32, nir_type_uint32); +M_LOAD(ld_cubemap_coords, nir_type_uint32); +M_LOAD(ld_compute_id, nir_type_uint32); static midgard_instruction v_branch(bool conditional, bool invert) @@ -495,18 +442,15 @@ optimise_nir(nir_shader *nir, unsigned quirks) } while (progress); NIR_PASS(progress, nir, nir_opt_algebraic_late); + NIR_PASS(progress, nir, nir_opt_algebraic_distribute_src_mods); /* We implement booleans as 32-bit 0/~0 */ NIR_PASS(progress, nir, nir_lower_bool_to_int32); /* Now that booleans are lowered, we can run out late opts */ NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late); + NIR_PASS(progress, nir, midgard_nir_cancel_inot); - /* Lower mods for float ops only. Integer ops don't support modifiers - * (saturate doesn't make sense on integers, neg/abs require dedicated - * instructions) */ - - NIR_PASS(progress, nir, nir_lower_to_source_mods, nir_lower_float_source_mods); NIR_PASS(progress, nir, nir_copy_prop); NIR_PASS(progress, nir, nir_opt_dce); @@ -591,35 +535,54 @@ nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components) assert(src_bitsize == dst_bitsize); \ break; +#define ALU_CHECK_CMP(sext) \ + assert(src_bitsize == 16 || src_bitsize == 32); \ + assert(dst_bitsize == 16 || dst_bitsize == 32); \ + #define ALU_CASE_BCAST(nir, _op, count) \ case nir_op_##nir: \ op = midgard_alu_op_##_op; \ broadcast_swizzle = count; \ - assert(src_bitsize == dst_bitsize); \ + ALU_CHECK_CMP(true); \ break; -static bool -nir_is_fzero_constant(nir_src src) -{ - if (!nir_src_is_const(src)) - return false; - - for (unsigned c = 0; c < nir_src_num_components(src); ++c) { - if (nir_src_comp_as_float(src, c) != 0.0) - return false; - } - - return true; -} -/* Analyze the sizes of the inputs to determine which reg mode. Ops needed - * special treatment override this anyway. */ +#define ALU_CASE_CMP(nir, _op, sext) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + ALU_CHECK_CMP(sext); \ + break; + +/* Analyze the sizes of the dest and inputs to determine reg mode. 
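+ *
+ * e.g. an fsin with an fp16 source and an fp16 destination still takes
+ * midgard_reg_mode_32, since the LUT ops below are forced up to at least
+ * 32-bit.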
*/

 static midgard_reg_mode
 reg_mode_for_nir(nir_alu_instr *instr)
 {
 unsigned src_bitsize = nir_src_bit_size(instr->src[0].src);
+ unsigned dst_bitsize = nir_dest_bit_size(instr->dest.dest);
+ unsigned max_bitsize = MAX2(src_bitsize, dst_bitsize);
+
+ /* We don't have fp16 LUTs, so we'll want to emit code like:
+ *
+ * vlut.fsinr hr0, hr0
+ *
+ * where both input and output are 16-bit but the operation is carried
+ * out in 32-bit
+ */

- switch (src_bitsize) {
+ switch (instr->op) {
+ case nir_op_fsqrt:
+ case nir_op_frcp:
+ case nir_op_frsq:
+ case nir_op_fsin:
+ case nir_op_fcos:
+ case nir_op_fexp2:
+ case nir_op_flog2:
+ max_bitsize = MAX2(max_bitsize, 32);
+ default:
+ break;
+ }
+
+ switch (max_bitsize) {
 case 8:
 return midgard_reg_mode_8;
 case 16:
@@ -633,9 +596,122 @@ reg_mode_for_nir(nir_alu_instr *instr)
 }
 }

+/* Compare mir_lower_invert */
+static bool
+nir_accepts_inot(nir_op op, unsigned src)
+{
+ switch (op) {
+ case nir_op_ior:
+ case nir_op_iand: /* TODO: b2f16 */
+ case nir_op_ixor:
+ return true;
+ case nir_op_b32csel:
+ /* Only the condition */
+ return (src == 0);
+ default:
+ return false;
+ }
+}
+
+static bool
+mir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op)
+{
+ if (pan_has_dest_mod(dest, op)) {
+ assert((*dest)->is_ssa);
+ BITSET_SET(ctx->already_emitted, (*dest)->ssa.index);
+ return true;
+ }
+
+ return false;
+}
+
+static void
+mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, bool is_int, unsigned bcast_count)
+{
+ nir_alu_src src = instr->src[i];
+
+ if (!is_int) {
+ if (pan_has_source_mod(&src, nir_op_fneg))
+ *neg = !(*neg);
+
+ if (pan_has_source_mod(&src, nir_op_fabs))
+ *abs = true;
+ }
+
+ if (nir_accepts_inot(instr->op, i) && pan_has_source_mod(&src, nir_op_inot))
+ *not = true;
+
+ unsigned bits = nir_src_bit_size(src.src);
+
+ ins->src[to] = nir_src_index(NULL, &src.src);
+ ins->src_types[to] = nir_op_infos[instr->op].input_types[i] | bits;
+
+ for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) {
+ ins->swizzle[to][c] = src.swizzle[
+ (!bcast_count || c < bcast_count) ? c :
+ (bcast_count - 1)];
+ }
+}
+
+/* Midgard features both fcsel and icsel, depending on whether you want int or
+ * float modifiers. NIR's csel is typeless, so we want a heuristic to guess if
+ * we should emit an int or float csel depending on what modifiers could be
+ * placed. In the absence of modifiers, this is probably arbitrary. 
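+ *
+ * e.g. a source wrapped in fneg/fabs, or a destination that can absorb an
+ * fsat, votes for fcsel, while a source wrapped in i2i16 votes for icsel;
+ * a positive score picks the float variant.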
*/ + +static bool +mir_is_bcsel_float(nir_alu_instr *instr) +{ + nir_op intmods[] = { + nir_op_i2i8, nir_op_i2i16, + nir_op_i2i32, nir_op_i2i64 + }; + + nir_op floatmods[] = { + nir_op_fabs, nir_op_fneg, + nir_op_f2f16, nir_op_f2f32, + nir_op_f2f64 + }; + + nir_op floatdestmods[] = { + nir_op_fsat, nir_op_fsat_signed, nir_op_fclamp_pos, + nir_op_f2f16, nir_op_f2f32 + }; + + signed score = 0; + + for (unsigned i = 1; i < 3; ++i) { + nir_alu_src s = instr->src[i]; + for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) { + if (pan_has_source_mod(&s, intmods[q])) + score--; + } + } + + for (unsigned i = 1; i < 3; ++i) { + nir_alu_src s = instr->src[i]; + for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) { + if (pan_has_source_mod(&s, floatmods[q])) + score++; + } + } + + for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) { + nir_dest *dest = &instr->dest.dest; + if (pan_has_dest_mod(&dest, floatdestmods[q])) + score++; + } + + return (score > 0); +} + static void emit_alu(compiler_context *ctx, nir_alu_instr *instr) { + nir_dest *dest = &instr->dest.dest; + + if (dest->is_ssa && BITSET_TEST(ctx->already_emitted, dest->ssa.index)) + return; + /* Derivatives end up emitted on the texture pipe, not the ALUs. This * is handled elsewhere */ @@ -644,20 +720,11 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) return; } - bool is_ssa = instr->dest.dest.is_ssa; + bool is_ssa = dest->is_ssa; - unsigned dest = nir_dest_index(&instr->dest.dest); - unsigned nr_components = nir_dest_num_components(instr->dest.dest); + unsigned nr_components = nir_dest_num_components(*dest); unsigned nr_inputs = nir_op_infos[instr->op].num_inputs; - - /* Most Midgard ALU ops have a 1:1 correspondance to NIR ops; these are - * supported. A few do not and are commented for now. Also, there are a - * number of NIR ops which Midgard does not support and need to be - * lowered, also TODO. This switch block emits the opcode and calling - * convention of the Midgard instruction; actual packing is done in - * emit_alu below */ - - unsigned op; + unsigned op = 0; /* Number of components valid to check for the instruction (the rest * will be forced to the last), or 0 to use as-is. Relevant as @@ -670,19 +737,11 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) midgard_reg_mode reg_mode = reg_mode_for_nir(instr); - /* Do we need a destination override? Used for inline - * type conversion */ - - midgard_dest_override dest_override = - midgard_dest_override_none; - - /* Should we use a smaller respective source and sign-extend? */ - - bool half_1 = false, sext_1 = false; - bool half_2 = false, sext_2 = false; + /* Should we swap arguments? */ + bool flip_src12 = false; unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); - unsigned dst_bitsize = nir_dest_bit_size(instr->dest.dest); + unsigned dst_bitsize = nir_dest_bit_size(*dest); switch (instr->op) { ALU_CASE(fadd, fadd); @@ -708,13 +767,13 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ALU_CASE(mov, imov); - ALU_CASE(feq32, feq); - ALU_CASE(fne32, fne); - ALU_CASE(flt32, flt); - ALU_CASE(ieq32, ieq); - ALU_CASE(ine32, ine); - ALU_CASE(ilt32, ilt); - ALU_CASE(ult32, ult); + ALU_CASE_CMP(feq32, feq, false); + ALU_CASE_CMP(fne32, fne, false); + ALU_CASE_CMP(flt32, flt, false); + ALU_CASE_CMP(ieq32, ieq, true); + ALU_CASE_CMP(ine32, ine, true); + ALU_CASE_CMP(ilt32, ilt, true); + ALU_CASE_CMP(ult32, ult, false); /* We don't have a native b2f32 instruction. 
Instead, like many * GPUs, we exploit booleans as 0/~0 for false/true, and @@ -727,14 +786,15 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) * At the end of emit_alu (as MIR), we'll fix-up the constant */ - ALU_CASE(b2f32, iand); - ALU_CASE(b2i32, iand); + ALU_CASE_CMP(b2f32, iand, true); + ALU_CASE_CMP(b2f16, iand, true); + ALU_CASE_CMP(b2i32, iand, true); /* Likewise, we don't have a dedicated f2b32 instruction, but * we can do a "not equal to 0.0" test. */ - ALU_CASE(f2b32, fne); - ALU_CASE(i2b32, ine); + ALU_CASE_CMP(f2b32, fne, false); + ALU_CASE_CMP(i2b32, ine, true); ALU_CASE(frcp, frcp); ALU_CASE(frsq, frsqrt); @@ -760,8 +820,9 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ALU_CASE(fsin, fsin); ALU_CASE(fcos, fcos); - /* We'll set invert */ - ALU_CASE(inot, imov); + /* We'll get 0 in the second arg, so: + * ~a = ~(a | 0) = nor(a, 0) */ + ALU_CASE(inot, inor); ALU_CASE(iand, iand); ALU_CASE(ior, ior); ALU_CASE(ixor, ixor); @@ -771,24 +832,26 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2); ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3); - ALU_CASE(b32all_fequal4, fball_eq); + ALU_CASE_CMP(b32all_fequal4, fball_eq, true); ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2); ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3); - ALU_CASE(b32any_fnequal4, fbany_neq); + ALU_CASE_CMP(b32any_fnequal4, fbany_neq, true); ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2); ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3); - ALU_CASE(b32all_iequal4, iball_eq); + ALU_CASE_CMP(b32all_iequal4, iball_eq, true); ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2); ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3); - ALU_CASE(b32any_inequal4, ibany_neq); + ALU_CASE_CMP(b32any_inequal4, ibany_neq, true); /* Source mods will be shoved in later */ ALU_CASE(fabs, fmov); ALU_CASE(fneg, fmov); ALU_CASE(fsat, fmov); + ALU_CASE(fsat_signed, fmov); + ALU_CASE(fclamp_pos, fmov); /* For size conversion, we use a move. Ideally though we would squash * these ops together; maybe that has to happen after in NIR as part of @@ -801,11 +864,6 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) case nir_op_i2i16: case nir_op_i2i32: case nir_op_i2i64: - /* If we end up upscale, we'll need a sign-extend on the - * operand (the second argument) */ - - sext_2 = true; - /* fallthrough */ case nir_op_u2u8: case nir_op_u2u16: case nir_op_u2u32: @@ -819,17 +877,6 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) else op = midgard_alu_op_imov; - if (dst_bitsize == (src_bitsize * 2)) { - /* Converting up */ - half_2 = true; - - /* Use a greater register mode */ - reg_mode++; - } else if (src_bitsize == (dst_bitsize * 2)) { - /* Converting down */ - dest_override = midgard_dest_override_lower; - } - break; } @@ -847,36 +894,17 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) instr->op == nir_op_uge32 ? midgard_alu_op_ule : 0; - /* Swap via temporary */ - nir_alu_src temp = instr->src[1]; - instr->src[1] = instr->src[0]; - instr->src[0] = temp; - + flip_src12 = true; + ALU_CHECK_CMP(false); break; } case nir_op_b32csel: { - /* Midgard features both fcsel and icsel, depending on - * the type of the arguments/output. However, as long - * as we're careful we can _always_ use icsel and - * _never_ need fcsel, since the latter does additional - * floating-point-specific processing whereas the - * former just moves bits on the wire. 
It's not obvious - * why these are separate opcodes, save for the ability - * to do things like sat/pos/abs/neg for free */ - bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components); - op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel; - - /* The condition is the first argument; move the other - * arguments up one to be a binary instruction for - * Midgard with the condition last */ - - nir_alu_src temp = instr->src[2]; - - instr->src[2] = instr->src[0]; - instr->src[0] = instr->src[1]; - instr->src[1] = temp; + bool is_float = mir_is_bcsel_float(instr); + op = is_float ? + (mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) : + (mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel); break; } @@ -887,121 +915,127 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) return; } + /* Promote imov to fmov if it might help inline a constant */ + if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src) + && nir_src_bit_size(instr->src[0].src) == 32 + && nir_is_same_comp_swizzle(instr->src[0].swizzle, + nir_src_num_components(instr->src[0].src))) { + op = midgard_alu_op_fmov; + } + /* Midgard can perform certain modifiers on output of an ALU op */ - unsigned outmod; + + unsigned outmod = 0; + bool is_int = midgard_is_integer_op(op); if (midgard_is_integer_out_op(op)) { outmod = midgard_outmod_int_wrap; - } else { - bool sat = instr->dest.saturate || instr->op == nir_op_fsat; - outmod = sat ? midgard_outmod_sat : midgard_outmod_none; - } - - /* fmax(a, 0.0) can turn into a .pos modifier as an optimization */ - - if (instr->op == nir_op_fmax) { - if (nir_is_fzero_constant(instr->src[0].src)) { - op = midgard_alu_op_fmov; - nr_inputs = 1; - outmod = midgard_outmod_pos; - instr->src[0] = instr->src[1]; - } else if (nir_is_fzero_constant(instr->src[1].src)) { - op = midgard_alu_op_fmov; - nr_inputs = 1; - outmod = midgard_outmod_pos; - } + } else if (instr->op == nir_op_fsat) { + outmod = midgard_outmod_sat; + } else if (instr->op == nir_op_fsat_signed) { + outmod = midgard_outmod_sat_signed; + } else if (instr->op == nir_op_fclamp_pos) { + outmod = midgard_outmod_pos; } /* Fetch unit, quirks, etc information */ unsigned opcode_props = alu_opcode_props[op].props; bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; - /* src0 will always exist afaik, but src1 will not for 1-argument - * instructions. The latter can only be fetched if the instruction - * needs it, or else we may segfault. */ + /* Look for floating point mods. We have the mods fsat, fsat_signed, + * and fpos. We also have the relations (note 3 * 2 = 6 cases): + * + * fsat_signed(fpos(x)) = fsat(x) + * fsat_signed(fsat(x)) = fsat(x) + * fpos(fsat_signed(x)) = fsat(x) + * fpos(fsat(x)) = fsat(x) + * fsat(fsat_signed(x)) = fsat(x) + * fsat(fpos(x)) = fsat(x) + * + * So by cases any composition of output modifiers is equivalent to + * fsat alone. + */ - unsigned src0 = nir_src_index(ctx, &instr->src[0].src); - unsigned src1 = nr_inputs >= 2 ? nir_src_index(ctx, &instr->src[1].src) : ~0; - unsigned src2 = nr_inputs == 3 ? 
nir_src_index(ctx, &instr->src[2].src) : ~0;
- assert(nr_inputs <= 3);
+ if (!is_int && !(opcode_props & OP_TYPE_CONVERT)) {
+ bool fpos = mir_accept_dest_mod(ctx, &dest, nir_op_fclamp_pos);
+ bool fsat = mir_accept_dest_mod(ctx, &dest, nir_op_fsat);
+ bool ssat = mir_accept_dest_mod(ctx, &dest, nir_op_fsat_signed);
+ bool prior = (outmod != midgard_outmod_none);
+ int count = (int) prior + (int) fpos + (int) ssat + (int) fsat;

- /* Rather than use the instruction generation helpers, we do it
- * ourselves here to avoid the mess */
+ outmod = ((count > 1) || fsat) ? midgard_outmod_sat :
+ fpos ? midgard_outmod_pos :
+ ssat ? midgard_outmod_sat_signed :
+ outmod;
+ }

 midgard_instruction ins = {
 .type = TAG_ALU_4,
- .src = {
- quirk_flipped_r24 ? ~0 : src0,
- quirk_flipped_r24 ? src0 : src1,
- src2,
- ~0
- },
- .dest = dest,
+ .dest = nir_dest_index(dest),
+ .dest_type = nir_op_infos[instr->op].output_type
+ | nir_dest_bit_size(*dest),
 };

- nir_alu_src *nirmods[3] = { NULL };
+ for (unsigned i = nr_inputs; i < ARRAY_SIZE(ins.src); ++i)
+ ins.src[i] = ~0;

- if (nr_inputs >= 2) {
- nirmods[0] = &instr->src[0];
- nirmods[1] = &instr->src[1];
- } else if (nr_inputs == 1) {
- nirmods[quirk_flipped_r24] = &instr->src[0];
+ if (quirk_flipped_r24) {
+ ins.src[0] = ~0;
+ mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], &ins.src_invert[1], is_int, broadcast_swizzle);
 } else {
- assert(0);
- }
+ for (unsigned i = 0; i < nr_inputs; ++i) {
+ unsigned to = i;
+
+ if (instr->op == nir_op_b32csel) {
+ /* The condition is the first argument; move
+ * the other arguments up one to be a binary
+ * instruction for Midgard with the condition
+ * last */
+
+ if (i == 0)
+ to = 2;
+ else if (flip_src12)
+ to = 2 - i;
+ else
+ to = i - 1;
+ } else if (flip_src12) {
+ to = 1 - to;
+ }

- if (nr_inputs == 3)
- nirmods[2] = &instr->src[2];
+ mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], &ins.src_invert[to], is_int, broadcast_swizzle);

- /* These were lowered to a move, so apply the corresponding mod */
+ /* (!c) ? a : b = c ? b : a */
+ if (instr->op == nir_op_b32csel && ins.src_invert[2]) {
+ ins.src_invert[2] = false;
+ flip_src12 ^= true;
+ }
+ }
+ }

 if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) {
- nir_alu_src *s = nirmods[quirk_flipped_r24];
-
+ /* Lowered to move */
 if (instr->op == nir_op_fneg)
- s->negate = !s->negate;
+ ins.src_neg[1] ^= true;

 if (instr->op == nir_op_fabs)
- s->abs = !s->abs;
+ ins.src_abs[1] = true;
 }

- bool is_int = midgard_is_integer_op(op);
-
 ins.mask = mask_of(nr_components);

 midgard_vector_alu alu = {
 .op = op,
 .reg_mode = reg_mode,
- .dest_override = dest_override,
 .outmod = outmod,
-
- .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, sext_1)),
- .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1], is_int, broadcast_swizzle, half_2, sext_2)),
 };

- /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */
+ /* Apply writemask if non-SSA, keeping in mind that we can't write to
+ * components that don't exist. Note modifier => SSA => !reg => no
+ * writemask, so we don't have to worry about writemasks here. */

 if (!is_ssa)
 ins.mask &= instr->dest.write_mask;

- for (unsigned m = 0; m < 3; ++m) {
- if (!nirmods[m])
- continue;
-
- for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c)
- ins.swizzle[m][c] = nirmods[m]->swizzle[c];
-
- /* Replicate. 
TODO: remove when vec16 lands */ - for (unsigned c = NIR_MAX_VEC_COMPONENTS; c < MIR_VEC_COMPONENTS; ++c) - ins.swizzle[m][c] = nirmods[m]->swizzle[NIR_MAX_VEC_COMPONENTS - 1]; - } - - if (nr_inputs == 3) { - /* Conditions can't have mods */ - assert(!nirmods[2]->abs); - assert(!nirmods[2]->negate); - } - ins.alu = alu; /* Late fixup for emulated instructions */ @@ -1014,6 +1048,7 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ins.has_inline_constant = false; ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.src_types[1] = nir_type_float32; ins.has_constants = true; if (instr->op == nir_op_b2f32) @@ -1021,19 +1056,33 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) else ins.constants.i32[0] = 1; + for (unsigned c = 0; c < 16; ++c) + ins.swizzle[1][c] = 0; + } else if (instr->op == nir_op_b2f16) { + ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.src_types[1] = nir_type_float16; + ins.has_constants = true; + ins.constants.i16[0] = _mesa_float_to_half(1.0); + for (unsigned c = 0; c < 16; ++c) ins.swizzle[1][c] = 0; } else if (nr_inputs == 1 && !quirk_flipped_r24) { /* Lots of instructions need a 0 plonked in */ ins.has_inline_constant = false; ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.src_types[1] = nir_type_uint32; ins.has_constants = true; ins.constants.u32[0] = 0; for (unsigned c = 0; c < 16; ++c) ins.swizzle[1][c] = 0; - } else if (instr->op == nir_op_inot) { - ins.invert = true; + } + + /* Arrange for creation of iandnot/iornot */ + if (ins.src_invert[0] && !ins.src_invert[1]) { + mir_flip(&ins); + ins.src_invert[0] = false; + ins.src_invert[1] = true; } if ((opcode_props & UNITS_ALL) == UNIT_VLUT) { @@ -1043,6 +1092,9 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) unsigned orig_mask = ins.mask; + unsigned swizzle_back[MIR_VEC_COMPONENTS]; + memcpy(&swizzle_back, ins.swizzle[0], sizeof(swizzle_back)); + for (int i = 0; i < nr_components; ++i) { /* Mask the associated component, dropping the * instruction if needed */ @@ -1054,7 +1106,7 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) continue; for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j) - ins.swizzle[0][j] = nirmods[0]->swizzle[i]; /* Pull from the correct component */ + ins.swizzle[0][j] = swizzle_back[i]; /* Pull from the correct component */ emit_mir_instruction(ctx, ins); } @@ -1083,9 +1135,7 @@ mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read) /* Once we have the NIR mask, we need to normalize to work in 32-bit space */ unsigned bytemask = pan_to_bytemask(dsize, nir_mask); mir_set_bytemask(ins, bytemask); - - if (dsize == 64) - ins->load_64 = true; + ins->dest_type = nir_type_uint | dsize; } /* Uniforms and UBOs use a shared code path, as uniforms are just (slightly @@ -1111,6 +1161,7 @@ emit_ubo_read( if (indirect_offset) { ins.src[2] = nir_src_index(ctx, indirect_offset); + ins.src_types[2] = nir_type_uint32; ins.load_store.arg_2 = (indirect_shift << 5); } else { ins.load_store.arg_2 = 0x1E; @@ -1174,9 +1225,10 @@ emit_varying_read( memcpy(&u, &p, sizeof(p)); ins.load_store.varying_parameters = u; - if (indirect_offset) + if (indirect_offset) { ins.src[2] = nir_src_index(ctx, indirect_offset); - else + ins.src_types[2] = nir_type_uint32; + } else ins.load_store.arg_2 = 0x1E; ins.load_store.arg_1 = 0x9E; @@ -1285,6 +1337,7 @@ emit_fragment_store(compiler_context *ctx, unsigned src, enum midgard_rt_id rt) /* Add dependencies */ ins.src[0] = src; + ins.src_types[0] = nir_type_uint32; ins.constants.u32[0] = rt == MIDGARD_ZS_RT ? 
0xFF : (rt - MIDGARD_COLOR_RT0) * 0x100; @@ -1335,6 +1388,7 @@ emit_control_barrier(compiler_context *ctx) { midgard_instruction ins = { .type = TAG_TEXTURE_4, + .dest = ~0, .src = { ~0, ~0, ~0, ~0 }, .texture = { .op = TEXTURE_OP_BARRIER, @@ -1359,6 +1413,19 @@ search_var(struct exec_list *vars, unsigned driver_loc) return NULL; } +static unsigned +mir_get_branch_cond(nir_src *src, bool *invert) +{ + /* Wrap it. No swizzle since it's a scalar */ + + nir_alu_src alu = { + .src = *src + }; + + *invert = pan_has_source_mod(&alu, nir_op_inot); + return nir_src_index(NULL, &alu.src); +} + static void emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) { @@ -1371,8 +1438,11 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) struct midgard_instruction discard = v_branch(conditional, false); discard.branch.target_type = TARGET_DISCARD; - if (conditional) - discard.src[0] = nir_src_index(ctx, &instr->src[0]); + if (conditional) { + discard.src[0] = mir_get_branch_cond(&instr->src[0], + &discard.branch.invert_conditional); + discard.src_types[0] = nir_type_uint32; + } emit_mir_instruction(ctx, discard); schedule_barrier(ctx); @@ -1480,8 +1550,10 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) ld.load_store.arg_2 = 0x1E; } - for (unsigned c = 2; c < 16; ++c) + for (unsigned c = 4; c < 16; ++c) ld.swizzle[0][c] = 0; + + ld.dest_type = nir_type_float16; } emit_mir_instruction(ctx, ld); @@ -1700,7 +1772,7 @@ midgard_tex_format(enum glsl_sampler_dim dim) } } -/* Tries to attach an explicit LOD / bias as a constant. Returns whether this +/* Tries to attach an explicit LOD or bias as a constant. Returns whether this * was successful */ static bool @@ -1733,21 +1805,6 @@ pan_attach_constant_bias( return true; } -static enum mali_sampler_type -midgard_sampler_type(nir_alu_type t) { - switch (nir_alu_type_get_base_type(t)) - { - case nir_type_float: - return MALI_SAMPLER_FLOAT; - case nir_type_int: - return MALI_SAMPLER_SIGNED; - case nir_type_uint: - return MALI_SAMPLER_UNSIGNED; - default: - unreachable("Unknown sampler type"); - } -} - static void emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, unsigned midgard_texop) @@ -1758,28 +1815,29 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, int texture_index = instr->texture_index; int sampler_index = texture_index; - /* No helper to build texture words -- we do it all here */ + nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type); + nir_alu_type dest_type = dest_base | nir_dest_bit_size(instr->dest); + midgard_instruction ins = { .type = TAG_TEXTURE_4, .mask = 0xF, .dest = nir_dest_index(&instr->dest), .src = { ~0, ~0, ~0, ~0 }, + .dest_type = dest_type, .swizzle = SWIZZLE_IDENTITY_4, .texture = { .op = midgard_texop, .format = midgard_tex_format(instr->sampler_dim), .texture_handle = texture_index, .sampler_handle = sampler_index, - - /* TODO: half */ - .in_reg_full = 1, - .out_full = 1, - - .sampler_type = midgard_sampler_type(instr->dest_type), .shadow = instr->is_shadow, } }; + if (instr->is_shadow && !instr->is_new_style_shadow) + for (int i = 0; i < 4; ++i) + ins.swizzle[0][i] = COMPONENT_X; + /* We may need a temporary for the coordinate */ bool needs_temp_coord = @@ -1792,6 +1850,8 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, for (unsigned i = 0; i < instr->num_srcs; ++i) { int index = nir_src_index(ctx, &instr->src[i].src); unsigned nr_components = nir_src_num_components(instr->src[i].src); + unsigned sz = 
nir_src_bit_size(instr->src[i].src); + nir_alu_type T = nir_tex_instr_src_type(instr, i) | sz; switch (instr->src[i].src_type) { case nir_tex_src_coord: { @@ -1814,6 +1874,7 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, midgard_instruction ld = m_ld_cubemap_coords(coords, 0); ld.src[1] = index; + ld.src_types[1] = T; ld.mask = 0x3; /* xy */ ld.load_store.arg_1 = 0x20; ld.swizzle[1][3] = COMPONENT_X; @@ -1836,6 +1897,7 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, } ins.src[1] = coords; + ins.src_types[1] = T; /* Texelfetch coordinates uses all four elements * (xyz/index) regardless of texture dimensionality, @@ -1886,6 +1948,7 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, ins.texture.lod_register = true; ins.src[2] = index; + ins.src_types[2] = T; for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) ins.swizzle[2][c] = COMPONENT_X; @@ -1898,6 +1961,7 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, case nir_tex_src_offset: { ins.texture.offset_register = true; ins.src[3] = index; + ins.src_types[3] = T; for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) ins.swizzle[3][c] = (c > COMPONENT_Z) ? 0 : c; @@ -1928,9 +1992,6 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, } emit_mir_instruction(ctx, ins); - - /* Used for .cont and .last hinting */ - ctx->texture_op_count++; } static void @@ -2042,15 +2103,10 @@ inline_alu_constants(compiler_context *ctx, midgard_block *block) /* Corner case: _two_ vec4 constants, for instance with a * csel. For this case, we can only use a constant * register for one, we'll have to emit a move for the - * other. Note, if both arguments are constants, then - * necessarily neither argument depends on the value of - * any particular register. As the destination register - * will be wiped, that means we can spill the constant - * to the destination register. - */ + * other. */ void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[1] + 1); - unsigned scratch = alu->dest; + unsigned scratch = make_compiler_temp(ctx); if (entry) { midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), scratch); @@ -2066,41 +2122,6 @@ inline_alu_constants(compiler_context *ctx, midgard_block *block) } } -/* Being a little silly with the names, but returns the op that is the bitwise - * inverse of the op with the argument switched. I.e. (f and g are - * contrapositives): - * - * f(a, b) = ~g(b, a) - * - * Corollary: if g is the contrapositve of f, f is the contrapositive of g: - * - * f(a, b) = ~g(b, a) - * ~f(a, b) = g(b, a) - * ~f(a, b) = ~h(a, b) where h is the contrapositive of g - * f(a, b) = h(a, b) - * - * Thus we define this function in pairs. - */ - -static inline midgard_alu_op -mir_contrapositive(midgard_alu_op op) -{ - switch (op) { - case midgard_alu_op_flt: - return midgard_alu_op_fle; - case midgard_alu_op_fle: - return midgard_alu_op_flt; - - case midgard_alu_op_ilt: - return midgard_alu_op_ile; - case midgard_alu_op_ile: - return midgard_alu_op_ilt; - - default: - unreachable("No known contrapositive"); - } -} - /* Midgard supports two types of constants, embedded constants (128-bit) and * inline constants (16-bit). 
Sometimes, especially with scalar ops, embedded * constants can be demoted to inline constants, for space savings and @@ -2129,39 +2150,12 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) int op = ins->alu.op; - if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { - bool flip = alu_opcode_props[op].props & OP_COMMUTES; - - switch (op) { - /* Conditionals can be inverted */ - case midgard_alu_op_flt: - case midgard_alu_op_ilt: - case midgard_alu_op_fle: - case midgard_alu_op_ile: - ins->alu.op = mir_contrapositive(ins->alu.op); - ins->invert = true; - flip = true; - break; - - case midgard_alu_op_fcsel: - case midgard_alu_op_icsel: - DBG("Missed non-commutative flip (%s)\n", alu_opcode_props[op].name); - default: - break; - } - - if (flip) - mir_flip(ins); + if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) && + alu_opcode_props[op].props & OP_COMMUTES) { + mir_flip(ins); } if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { - /* Extract the source information */ - - midgard_vector_alu_src *src; - int q = ins->alu.src2; - midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; - src = m; - /* Component is from the swizzle. Take a nonzero component */ assert(ins->mask); unsigned first_comp = ffs(ins->mask) - 1; @@ -2195,12 +2189,9 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) continue; } - /* We don't know how to handle these with a constant */ - - if (mir_nontrivial_source2_mod_simple(ins) || src->rep_low || src->rep_high) { - DBG("Bailing inline constant...\n"); + /* Should've been const folded */ + if (ins->src_abs[1] || ins->src_neg[1]) continue; - } /* Make sure that the constant is not itself a vector * by checking if all accessed values are the same. */ @@ -2242,7 +2233,7 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) * per block is legal semantically */ static void -midgard_opt_cull_dead_branch(compiler_context *ctx, midgard_block *block) +midgard_cull_dead_branch(compiler_context *ctx, midgard_block *block) { bool branched = false; @@ -2256,67 +2247,6 @@ midgard_opt_cull_dead_branch(compiler_context *ctx, midgard_block *block) } } -/* fmov.pos is an idiom for fpos. Propoagate the .pos up to the source, so then - * the move can be propagated away entirely */ - -static bool -mir_compose_float_outmod(midgard_outmod_float *outmod, midgard_outmod_float comp) -{ - /* Nothing to do */ - if (comp == midgard_outmod_none) - return true; - - if (*outmod == midgard_outmod_none) { - *outmod = comp; - return true; - } - - /* TODO: Compose rules */ - return false; -} - -static bool -midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block) -{ - bool progress = false; - - mir_foreach_instr_in_block_safe(block, ins) { - if (ins->type != TAG_ALU_4) continue; - if (ins->alu.op != midgard_alu_op_fmov) continue; - if (ins->alu.outmod != midgard_outmod_pos) continue; - - /* TODO: Registers? */ - unsigned src = ins->src[1]; - if (src & IS_REG) continue; - - /* There might be a source modifier, too */ - if (mir_nontrivial_source2_mod(ins)) continue; - - /* Backpropagate the modifier */ - mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) { - if (v->type != TAG_ALU_4) continue; - if (v->dest != src) continue; - - /* Can we even take a float outmod? */ - if (midgard_is_integer_out_op(v->alu.op)) continue; - - midgard_outmod_float temp = v->alu.outmod; - progress |= mir_compose_float_outmod(&temp, ins->alu.outmod); - - /* Throw in the towel.. 
*/ - if (!progress) break; - - /* Otherwise, transfer the modifier */ - v->alu.outmod = temp; - ins->alu.outmod = midgard_outmod_none; - - break; - } - } - - return progress; -} - static unsigned emit_fragment_epilogue(compiler_context *ctx, unsigned rt) { @@ -2369,9 +2299,12 @@ emit_if(struct compiler_context *ctx, nir_if *nif) midgard_block *before_block = ctx->current_block; /* Speculatively emit the branch, but we can't fill it in until later */ + bool inv = false; EMIT(branch, true, true); midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); - then_branch->src[0] = nir_src_index(ctx, &nif->condition); + then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv); + then_branch->src_types[0] = nir_type_uint32; + then_branch->branch.invert_conditional = !inv; /* Emit the two subblocks. */ midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); @@ -2546,6 +2479,7 @@ mir_add_writeout_loops(compiler_context *ctx) unsigned popped = br->branch.target_block; pan_block_add_successor(&(mir_get_block(ctx, popped - 1)->base), &ctx->current_block->base); br->branch.target_block = emit_fragment_epilogue(ctx, rt); + br->branch.target_type = TARGET_GOTO; /* If we have more RTs, we'll need to restore back after our * loop terminates */ @@ -2553,6 +2487,7 @@ mir_add_writeout_loops(compiler_context *ctx) if ((rt + 1) < ARRAY_SIZE(ctx->writeout_branch) && ctx->writeout_branch[rt + 1]) { midgard_instruction uncond = v_branch(false, false); uncond.branch.target_block = popped; + uncond.branch.target_type = TARGET_GOTO; emit_mir_instruction(ctx, uncond); pan_block_add_successor(&ctx->current_block->base, &(mir_get_block(ctx, popped)->base)); schedule_barrier(ctx); @@ -2635,8 +2570,10 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b list_inithead(&ctx->blocks); ctx->block_count = 0; ctx->func = func; + ctx->already_emitted = calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD)); emit_cf_list(ctx, &func->impl->body); + free(ctx->already_emitted); break; /* TODO: Multi-function shaders */ } @@ -2647,7 +2584,6 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b mir_foreach_block(ctx, _block) { midgard_block *block = (midgard_block *) _block; inline_alu_constants(ctx, block); - midgard_opt_promote_fmov(ctx, block); embedded_to_inline_constant(ctx, block); } /* MIR-level optimizations */ @@ -2656,45 +2592,30 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b do { progress = false; + progress |= midgard_opt_dead_code_eliminate(ctx); mir_foreach_block(ctx, _block) { midgard_block *block = (midgard_block *) _block; - progress |= midgard_opt_pos_propagate(ctx, block); progress |= midgard_opt_copy_prop(ctx, block); - progress |= midgard_opt_dead_code_eliminate(ctx, block); progress |= midgard_opt_combine_projection(ctx, block); progress |= midgard_opt_varying_projection(ctx, block); - progress |= midgard_opt_not_propagate(ctx, block); - progress |= midgard_opt_fuse_src_invert(ctx, block); - progress |= midgard_opt_fuse_dest_invert(ctx, block); - progress |= midgard_opt_csel_invert(ctx, block); - progress |= midgard_opt_drop_cmp_invert(ctx, block); - progress |= midgard_opt_invert_branch(ctx, block); } } while (progress); mir_foreach_block(ctx, _block) { midgard_block *block = (midgard_block *) _block; - midgard_lower_invert(ctx, block); midgard_lower_derivatives(ctx, block); - } - - /* Nested control-flow can result in dead branches at the end of the - * block. 
This messes with our analysis and is just dead code, so cull - * them */ - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - midgard_opt_cull_dead_branch(ctx, block); - } - - /* Ensure we were lowered */ - mir_foreach_instr_global(ctx, ins) { - assert(!ins->invert); + midgard_cull_dead_branch(ctx, block); } if (ctx->stage == MESA_SHADER_FRAGMENT) mir_add_writeout_loops(ctx); + /* Analyze now that the code is known but before scheduling creates + * pipeline registers which are harder to track */ + mir_analyze_helper_terminate(ctx); + mir_analyze_helper_requirements(ctx); + /* Schedule! */ midgard_schedule_program(ctx); mir_ra(ctx);
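For reference, the b2f32/b2i32 lowering above (booleans as 0/~0, iand'd
against an inline constant 1.0f or 1) can be sanity-checked on the CPU. A
minimal standalone sketch, not part of the patch; bits_to_float() is a local
helper, not a Mesa API:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Reinterpret a 32-bit pattern as a float, as the register file does */
static float bits_to_float(uint32_t bits)
{
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
}

int main(void)
{
        const uint32_t one = 0x3f800000;          /* bit pattern of 1.0f */
        const uint32_t booleans[2] = { 0u, ~0u }; /* false, true */

        for (int i = 0; i < 2; ++i) {
                /* The emitted op: iand <bool>, <inline constant 1.0f> */
                uint32_t out = booleans[i] & one;
                printf("b2f32(%s) = %f\n", i ? "~0" : " 0",
                       bits_to_float(out));
        }

        /* Prints 0.000000 then 1.000000 */
        return 0;
}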
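The output-modifier composition argument in emit_alu (any stack of fsat,
fsat_signed, and fclamp_pos collapses to fsat) can be checked the same way.
A minimal sketch, assuming the usual semantics fsat(x) = clamp(x, 0, 1),
fsat_signed(x) = clamp(x, -1, 1), and fclamp_pos(x) = max(x, 0):

#include <assert.h>
#include <math.h>
#include <stdio.h>

static float fsat(float x)        { return fminf(fmaxf(x,  0.0f), 1.0f); }
static float fsat_signed(float x) { return fminf(fmaxf(x, -1.0f), 1.0f); }
static float fclamp_pos(float x)  { return fmaxf(x, 0.0f); }

int main(void)
{
        /* Sweep exactly representable values across both clamp edges */
        for (float x = -2.0f; x <= 2.0f; x += 0.25f) {
                float ref = fsat(x);

                /* The six compositions noted in the comment (3 * 2 cases) */
                assert(fsat_signed(fclamp_pos(x)) == ref);
                assert(fsat_signed(fsat(x))       == ref);
                assert(fclamp_pos(fsat_signed(x)) == ref);
                assert(fclamp_pos(fsat(x))        == ref);
                assert(fsat(fsat_signed(x))       == ref);
                assert(fsat(fclamp_pos(x))        == ref);
        }

        puts("all six compositions reduce to fsat");
        return 0;
}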