X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fpanfrost%2Fmidgard%2Fmidgard_compile.c;h=1f54cfdcf39fe3a40765be6a3489559a7447889b;hp=e144dd72b2a78bed66a21f54dc6ac9cfe12e680e;hb=7dab5749c89181bc71b8d1635ab1a9b1a7928163;hpb=9a7f0e268b7baeb92d8d7d2989beb43550262406 diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index e144dd72b2a..1f54cfdcf39 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -48,6 +48,8 @@ #include "helpers.h" #include "compiler.h" #include "midgard_quirks.h" +#include "panfrost-quirks.h" +#include "panfrost/util/pan_lower_framebuffer.h" #include "disassemble.h" @@ -108,8 +110,8 @@ schedule_barrier(compiler_context *ctx) .dest = ~0, \ .src = { ~0, ~0, ~0, ~0 }, \ .swizzle = SWIZZLE_IDENTITY_4, \ + .op = midgard_op_##name, \ .load_store = { \ - .op = midgard_op_##name, \ .address = address \ } \ }; \ @@ -117,6 +119,7 @@ schedule_barrier(compiler_context *ctx) if (store) { \ i.src[0] = ssa; \ i.src_types[0] = T; \ + i.dest_type = T; \ } else { \ i.dest = ssa; \ i.dest_type = T; \ @@ -127,60 +130,14 @@ schedule_barrier(compiler_context *ctx) #define M_LOAD(name, T) M_LOAD_STORE(name, false, T) #define M_STORE(name, T) M_LOAD_STORE(name, true, T) -/* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs - * the corresponding Midgard source */ - -static midgard_vector_alu_src -vector_alu_modifiers(nir_alu_src *src, bool is_int, unsigned broadcast_count, - bool half, bool sext) -{ - /* Figure out how many components there are so we can adjust. - * Specifically we want to broadcast the last channel so things like - * ball2/3 work. - */ - - if (broadcast_count && src) { - uint8_t last_component = src->swizzle[broadcast_count - 1]; - - for (unsigned c = broadcast_count; c < NIR_MAX_VEC_COMPONENTS; ++c) { - src->swizzle[c] = last_component; - } - } - - midgard_vector_alu_src alu_src = { - .rep_low = 0, - .rep_high = 0, - .half = half - }; - - if (is_int) { - alu_src.mod = midgard_int_normal; - - /* Sign/zero-extend if needed */ - - if (half) { - alu_src.mod = sext ? - midgard_int_sign_extend - : midgard_int_zero_extend; - } - - /* These should have been lowered away */ - if (src) - assert(!(src->abs || src->negate)); - } else { - if (src) - alu_src.mod = (src->abs << 0) | (src->negate << 1); - } - - return alu_src; -} - M_LOAD(ld_attr_32, nir_type_uint32); M_LOAD(ld_vary_32, nir_type_uint32); M_LOAD(ld_ubo_int4, nir_type_uint32); M_LOAD(ld_int4, nir_type_uint32); M_STORE(st_int4, nir_type_uint32); M_LOAD(ld_color_buffer_32u, nir_type_uint32); +M_LOAD(ld_color_buffer_as_fp16, nir_type_float16); +M_LOAD(ld_color_buffer_as_fp32, nir_type_float32); M_STORE(st_vary_32, nir_type_uint32); M_LOAD(ld_cubemap_coords, nir_type_uint32); M_LOAD(ld_compute_id, nir_type_uint32); @@ -203,37 +160,6 @@ v_branch(bool conditional, bool invert) return ins; } -static midgard_branch_extended -midgard_create_branch_extended( midgard_condition cond, - midgard_jmp_writeout_op op, - unsigned dest_tag, - signed quadword_offset) -{ - /* The condition code is actually a LUT describing a function to - * combine multiple condition codes. However, we only support a single - * condition code at the moment, so we just duplicate over a bunch of - * times. 
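- * As a worked example (assuming the 2-bit condition encoding the shifts imply),
- * cond = 0b01 duplicated into every slot yields 0x5555 and cond = 0b11 yields 0xFFFF.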
*/ - - uint16_t duplicated_cond = - (cond << 14) | - (cond << 12) | - (cond << 10) | - (cond << 8) | - (cond << 6) | - (cond << 4) | - (cond << 2) | - (cond << 0); - - midgard_branch_extended branch = { - .op = op, - .dest_tag = dest_tag, - .offset = quadword_offset, - .cond = duplicated_cond - }; - - return branch; -} - static void attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name) { @@ -299,10 +225,20 @@ midgard_nir_lower_fdot2(nir_shader *shader) return progress; } -/* Midgard can't write depth and stencil separately. It has to happen in a - * single store operation containing both. Let's add a panfrost specific - * intrinsic and turn all depth/stencil stores into a packed depth+stencil - * one. +static const nir_variable * +search_var(nir_shader *nir, nir_variable_mode mode, unsigned driver_loc) +{ + nir_foreach_variable_with_modes(var, nir, mode) { + if (var->data.driver_location == driver_loc) + return var; + } + + return NULL; +} + +/* Midgard can write all of color, depth and stencil in a single writeout + * operation, so we merge depth/stencil stores with color stores. + * If there are no color stores, we add a write to the "depth RT". */ static bool midgard_nir_lower_zs_store(nir_shader *nir) @@ -312,7 +248,7 @@ midgard_nir_lower_zs_store(nir_shader *nir) nir_variable *z_var = NULL, *s_var = NULL; - nir_foreach_variable(var, &nir->outputs) { + nir_foreach_shader_out_variable(var, nir) { if (var->data.location == FRAG_RESULT_DEPTH) z_var = var; else if (var->data.location == FRAG_RESULT_STENCIL) @@ -327,7 +263,7 @@ midgard_nir_lower_zs_store(nir_shader *nir) nir_foreach_function(function, nir) { if (!function->impl) continue; - nir_intrinsic_instr *z_store = NULL, *s_store = NULL, *last_store = NULL; + nir_intrinsic_instr *z_store = NULL, *s_store = NULL; nir_foreach_block(block, function->impl) { nir_foreach_instr_safe(instr, block) { @@ -341,47 +277,132 @@ midgard_nir_lower_zs_store(nir_shader *nir) if (z_var && nir_intrinsic_base(intr) == z_var->data.driver_location) { assert(!z_store); z_store = intr; - last_store = intr; } if (s_var && nir_intrinsic_base(intr) == s_var->data.driver_location) { assert(!s_store); s_store = intr; - last_store = intr; } } } if (!z_store && !s_store) continue; - nir_builder b; - nir_builder_init(&b, function->impl); + bool replaced = false; - b.cursor = nir_before_instr(&last_store->instr); + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; - nir_ssa_def *zs_store_src; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; - if (z_store && s_store) { - nir_ssa_def *srcs[2] = { - nir_ssa_for_src(&b, z_store->src[0], 1), - nir_ssa_for_src(&b, s_store->src[0], 1), - }; + const nir_variable *var = search_var(nir, nir_var_shader_out, nir_intrinsic_base(intr)); + assert(var); - zs_store_src = nir_vec(&b, srcs, 2); - } else { - zs_store_src = nir_ssa_for_src(&b, last_store->src[0], 1); + if (var->data.location != FRAG_RESULT_COLOR && + var->data.location < FRAG_RESULT_DATA0) + continue; + + if (var->data.index) + continue; + + assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); + + nir_builder b; + nir_builder_init(&b, function->impl); + + assert(!z_store || z_store->instr.block == instr->block); + assert(!s_store || s_store->instr.block == instr->block); + b.cursor = nir_after_block_before_jump(instr->block); + + nir_intrinsic_instr 
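+                         /* Source layout used just below (from the src[] array filled in further
+                          * down): src[0] = color value, src[1] = store offset, src[2] = Z,
+                          * src[3] = S; the PAN_WRITEOUT_* mask rides in the component index. */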
*combined_store; + combined_store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_combined_output_pan); + + combined_store->num_components = intr->src[0].ssa->num_components; + + nir_intrinsic_set_base(combined_store, nir_intrinsic_base(intr)); + + unsigned writeout = PAN_WRITEOUT_C; + if (z_store) + writeout |= PAN_WRITEOUT_Z; + if (s_store) + writeout |= PAN_WRITEOUT_S; + + nir_intrinsic_set_component(combined_store, writeout); + + struct nir_ssa_def *zero = nir_imm_int(&b, 0); + + struct nir_ssa_def *src[4] = { + intr->src[0].ssa, + intr->src[1].ssa, + z_store ? z_store->src[0].ssa : zero, + s_store ? s_store->src[0].ssa : zero, + }; + + for (int i = 0; i < 4; ++i) + combined_store->src[i] = nir_src_for_ssa(src[i]); + + nir_builder_instr_insert(&b, &combined_store->instr); + + nir_instr_remove(instr); + + replaced = true; + } } - nir_intrinsic_instr *zs_store; + /* Insert a store to the depth RT (0xff) if needed */ + if (!replaced) { + nir_builder b; + nir_builder_init(&b, function->impl); + + nir_block *block = NULL; + if (z_store && s_store) + assert(z_store->instr.block == s_store->instr.block); + + if (z_store) + block = z_store->instr.block; + else + block = s_store->instr.block; + + b.cursor = nir_after_block_before_jump(block); + + nir_intrinsic_instr *combined_store; + combined_store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_combined_output_pan); - zs_store = nir_intrinsic_instr_create(b.shader, - nir_intrinsic_store_zs_output_pan); - zs_store->src[0] = nir_src_for_ssa(zs_store_src); - zs_store->num_components = z_store && s_store ? 2 : 1; - nir_intrinsic_set_component(zs_store, z_store ? 0 : 1); + combined_store->num_components = 4; + + unsigned base; + if (z_store) + base = nir_intrinsic_base(z_store); + else + base = nir_intrinsic_base(s_store); + nir_intrinsic_set_base(combined_store, base); - /* Replace the Z and S store by a ZS store */ - nir_builder_instr_insert(&b, &zs_store->instr); + unsigned writeout = 0; + if (z_store) + writeout |= PAN_WRITEOUT_Z; + if (s_store) + writeout |= PAN_WRITEOUT_S; + + nir_intrinsic_set_component(combined_store, writeout); + + struct nir_ssa_def *zero = nir_imm_int(&b, 0); + + struct nir_ssa_def *src[4] = { + nir_imm_vec4(&b, 0, 0, 0, 0), + zero, + z_store ? z_store->src[0].ssa : zero, + s_store ? s_store->src[0].ssa : zero, + }; + + for (int i = 0; i < 4; ++i) + combined_store->src[i] = nir_src_for_ssa(src[i]); + + nir_builder_instr_insert(&b, &combined_store->instr); + } if (z_store) nir_instr_remove(&z_store->instr); @@ -396,10 +417,71 @@ midgard_nir_lower_zs_store(nir_shader *nir) return progress; } +/* Real writeout stores, which break execution, need to be moved to after + * dual-source stores, which are just standard register writes. 
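+ * A sketch of the effect on one block: given source order { store RT0 color;
+ * store dual-source color (index 1) }, the RT0 store is re-inserted after the
+ * dual-source store, so the plain register write lands before the
+ * execution-breaking writeout.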
*/ +static bool +midgard_nir_reorder_writeout(nir_shader *nir) +{ + bool progress = false; + + nir_foreach_function(function, nir) { + if (!function->impl) continue; + + nir_foreach_block(block, function->impl) { + nir_instr *last_writeout = NULL; + + nir_foreach_instr_reverse_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + const nir_variable *var = search_var(nir, nir_var_shader_out, nir_intrinsic_base(intr)); + + if (var->data.index) { + if (!last_writeout) + last_writeout = instr; + continue; + } + + if (!last_writeout) + continue; + + /* This is a real store, so move it to after dual-source stores */ + exec_node_remove(&instr->node); + exec_node_insert_after(&last_writeout->node, &instr->node); + + progress = true; + } + } + } + + return progress; +} + +static bool +mdg_is_64(const nir_instr *instr, const void *_unused) +{ + const nir_alu_instr *alu = nir_instr_as_alu(instr); + + if (nir_dest_bit_size(alu->dest.dest) == 64) + return true; + + switch (alu->op) { + case nir_op_umul_high: + case nir_op_imul_high: + return true; + default: + return false; + } +} + /* Flushes undefined values to zero */ static void -optimise_nir(nir_shader *nir, unsigned quirks) +optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend) { bool progress; unsigned lower_flrp = @@ -415,6 +497,7 @@ optimise_nir(nir_shader *nir, unsigned quirks) .lower_txp = ~0, .lower_tex_without_implicit_lod = (quirks & MIDGARD_EXPLICIT_LOD), + .lower_tg4_broadcom_swizzle = true, /* TODO: we have native gradient.. */ .lower_txd = true, @@ -430,6 +513,8 @@ optimise_nir(nir_shader *nir, unsigned quirks) if (quirks & MIDGARD_BROKEN_LOD) NIR_PASS_V(nir, midgard_nir_lod_errata); + NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early); + do { progress = false; @@ -476,6 +561,12 @@ optimise_nir(nir_shader *nir, unsigned quirks) NIR_PASS(progress, nir, nir_opt_vectorize); } while (progress); + NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_is_64, NULL); + + /* Run after opts so it can hit more */ + if (!is_blend) + NIR_PASS(progress, nir, nir_fuse_io_16); + /* Must be run at the end to prevent creation of fsin/fcos ops */ NIR_PASS(progress, nir, midgard_nir_scale_trig); @@ -496,12 +587,8 @@ optimise_nir(nir_shader *nir, unsigned quirks) /* Now that booleans are lowered, we can run out late opts */ NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late); + NIR_PASS(progress, nir, midgard_nir_cancel_inot); - /* Lower mods for float ops only. 
Integer ops don't support modifiers - * (saturate doesn't make sense on integers, neg/abs require dedicated - * instructions) */ - - NIR_PASS(progress, nir, nir_lower_to_source_mods, nir_lower_float_source_mods); NIR_PASS(progress, nir, nir_copy_prop); NIR_PASS(progress, nir, nir_opt_dce); @@ -586,60 +673,187 @@ nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components) assert(src_bitsize == dst_bitsize); \ break; +#define ALU_CASE_RTZ(nir, _op) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + roundmode = MIDGARD_RTZ; \ + break; + +#define ALU_CHECK_CMP(sext) \ + assert(src_bitsize == 16 || src_bitsize == 32); \ + assert(dst_bitsize == 16 || dst_bitsize == 32); \ + #define ALU_CASE_BCAST(nir, _op, count) \ case nir_op_##nir: \ op = midgard_alu_op_##_op; \ broadcast_swizzle = count; \ - assert(src_bitsize == dst_bitsize); \ + ALU_CHECK_CMP(true); \ break; + +#define ALU_CASE_CMP(nir, _op, sext) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + ALU_CHECK_CMP(sext); \ + break; + +/* Compare mir_lower_invert */ static bool -nir_is_fzero_constant(nir_src src) +nir_accepts_inot(nir_op op, unsigned src) { - if (!nir_src_is_const(src)) + switch (op) { + case nir_op_ior: + case nir_op_iand: /* TODO: b2f16 */ + case nir_op_ixor: + return true; + case nir_op_b32csel: + /* Only the condition */ + return (src == 0); + default: return false; + } +} - for (unsigned c = 0; c < nir_src_num_components(src); ++c) { - if (nir_src_comp_as_float(src, c) != 0.0) - return false; +static bool +mir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op) +{ + if (pan_has_dest_mod(dest, op)) { + assert((*dest)->is_ssa); + BITSET_SET(ctx->already_emitted, (*dest)->ssa.index); + return true; } - return true; + return false; } -/* Analyze the sizes of the inputs to determine which reg mode. Ops needed - * special treatment override this anyway. */ +/* Look for floating point mods. We have the mods fsat, fsat_signed, + * and fpos. We also have the relations (note 3 * 2 = 6 cases): + * + * fsat_signed(fpos(x)) = fsat(x) + * fsat_signed(fsat(x)) = fsat(x) + * fpos(fsat_signed(x)) = fsat(x) + * fpos(fsat(x)) = fsat(x) + * fsat(fsat_signed(x)) = fsat(x) + * fsat(fpos(x)) = fsat(x) + * + * So by cases any composition of output modifiers is equivalent to + * fsat alone. + */ +static unsigned +mir_determine_float_outmod(compiler_context *ctx, nir_dest **dest, unsigned prior_outmod) +{ + bool fpos = mir_accept_dest_mod(ctx, dest, nir_op_fclamp_pos); + bool fsat = mir_accept_dest_mod(ctx, dest, nir_op_fsat); + bool ssat = mir_accept_dest_mod(ctx, dest, nir_op_fsat_signed); + bool prior = (prior_outmod != midgard_outmod_none); + int count = (int) prior + (int) fpos + (int) ssat + (int) fsat; + + return ((count > 1) || fsat) ? midgard_outmod_sat : + fpos ? midgard_outmod_pos : + ssat ? 
midgard_outmod_sat_signed : + prior_outmod; +} -static midgard_reg_mode -reg_mode_for_nir(nir_alu_instr *instr) +static void +mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, enum midgard_roundmode *roundmode, bool is_int, unsigned bcast_count) { - unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); + nir_alu_src src = instr->src[i]; - switch (src_bitsize) { - case 8: - return midgard_reg_mode_8; - case 16: - return midgard_reg_mode_16; - case 32: - return midgard_reg_mode_32; - case 64: - return midgard_reg_mode_64; - default: - unreachable("Invalid bit size"); + if (!is_int) { + if (pan_has_source_mod(&src, nir_op_fneg)) + *neg = !(*neg); + + if (pan_has_source_mod(&src, nir_op_fabs)) + *abs = true; + } + + if (nir_accepts_inot(instr->op, i) && pan_has_source_mod(&src, nir_op_inot)) + *not = true; + + if (roundmode) { + if (pan_has_source_mod(&src, nir_op_fround_even)) + *roundmode = MIDGARD_RTE; + + if (pan_has_source_mod(&src, nir_op_ftrunc)) + *roundmode = MIDGARD_RTZ; + + if (pan_has_source_mod(&src, nir_op_ffloor)) + *roundmode = MIDGARD_RTN; + + if (pan_has_source_mod(&src, nir_op_fceil)) + *roundmode = MIDGARD_RTP; + } + + unsigned bits = nir_src_bit_size(src.src); + + ins->src[to] = nir_src_index(NULL, &src.src); + ins->src_types[to] = nir_op_infos[instr->op].input_types[i] | bits; + + for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) { + ins->swizzle[to][c] = src.swizzle[ + (!bcast_count || c < bcast_count) ? c : + (bcast_count - 1)]; } } -static void -mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to) +/* Midgard features both fcsel and icsel, depending on whether you want int or + * float modifiers. NIR's csel is typeless, so we want a heuristic to guess if + * we should emit an int or float csel depending on what modifiers could be + * placed. In the absense of modifiers, this is probably arbitrary. */ + +static bool +mir_is_bcsel_float(nir_alu_instr *instr) { - unsigned bits = nir_src_bit_size(instr->src[i].src); + nir_op intmods[] = { + nir_op_i2i8, nir_op_i2i16, + nir_op_i2i32, nir_op_i2i64 + }; - ins->src[to] = nir_src_index(NULL, &instr->src[i].src); - ins->src_types[to] = nir_op_infos[instr->op].input_types[i] | bits; + nir_op floatmods[] = { + nir_op_fabs, nir_op_fneg, + nir_op_f2f16, nir_op_f2f32, + nir_op_f2f64 + }; + + nir_op floatdestmods[] = { + nir_op_fsat, nir_op_fsat_signed, nir_op_fclamp_pos, + nir_op_f2f16, nir_op_f2f32 + }; + + signed score = 0; + + for (unsigned i = 1; i < 3; ++i) { + nir_alu_src s = instr->src[i]; + for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) { + if (pan_has_source_mod(&s, intmods[q])) + score--; + } + } + + for (unsigned i = 1; i < 3; ++i) { + nir_alu_src s = instr->src[i]; + for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) { + if (pan_has_source_mod(&s, floatmods[q])) + score++; + } + } + + for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) { + nir_dest *dest = &instr->dest.dest; + if (pan_has_dest_mod(&dest, floatdestmods[q])) + score++; + } + + return (score > 0); } static void emit_alu(compiler_context *ctx, nir_alu_instr *instr) { + nir_dest *dest = &instr->dest.dest; + + if (dest->is_ssa && BITSET_TEST(ctx->already_emitted, dest->ssa.index)) + return; + /* Derivatives end up emitted on the texture pipe, not the ALUs. 
This * is handled elsewhere */ @@ -648,9 +862,9 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) return; } - bool is_ssa = instr->dest.dest.is_ssa; + bool is_ssa = dest->is_ssa; - unsigned nr_components = nir_dest_num_components(instr->dest.dest); + unsigned nr_components = nir_dest_num_components(*dest); unsigned nr_inputs = nir_op_infos[instr->op].num_inputs; unsigned op = 0; @@ -661,23 +875,13 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) unsigned broadcast_swizzle = 0; - /* What register mode should we operate in? */ - midgard_reg_mode reg_mode = - reg_mode_for_nir(instr); - - /* Do we need a destination override? Used for inline - * type conversion */ - - midgard_dest_override dest_override = - midgard_dest_override_none; + /* Should we swap arguments? */ + bool flip_src12 = false; - /* Should we use a smaller respective source and sign-extend? */ + ASSERTED unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); + ASSERTED unsigned dst_bitsize = nir_dest_bit_size(*dest); - bool half_1 = false, sext_1 = false; - bool half_2 = false, sext_2 = false; - - unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); - unsigned dst_bitsize = nir_dest_bit_size(instr->dest.dest); + enum midgard_roundmode roundmode = MIDGARD_RTE; switch (instr->op) { ALU_CASE(fadd, fadd); @@ -697,19 +901,21 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ALU_CASE(iadd, iadd); ALU_CASE(isub, isub); ALU_CASE(imul, imul); + ALU_CASE(imul_high, imul); + ALU_CASE(umul_high, imul); /* Zero shoved as second-arg */ ALU_CASE(iabs, iabsdiff); ALU_CASE(mov, imov); - ALU_CASE(feq32, feq); - ALU_CASE(fne32, fne); - ALU_CASE(flt32, flt); - ALU_CASE(ieq32, ieq); - ALU_CASE(ine32, ine); - ALU_CASE(ilt32, ilt); - ALU_CASE(ult32, ult); + ALU_CASE_CMP(feq32, feq, false); + ALU_CASE_CMP(fneu32, fne, false); + ALU_CASE_CMP(flt32, flt, false); + ALU_CASE_CMP(ieq32, ieq, true); + ALU_CASE_CMP(ine32, ine, true); + ALU_CASE_CMP(ilt32, ilt, true); + ALU_CASE_CMP(ult32, ult, false); /* We don't have a native b2f32 instruction. Instead, like many * GPUs, we exploit booleans as 0/~0 for false/true, and @@ -722,14 +928,15 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) * At the end of emit_alu (as MIR), we'll fix-up the constant */ - ALU_CASE(b2f32, iand); - ALU_CASE(b2i32, iand); + ALU_CASE_CMP(b2f32, iand, true); + ALU_CASE_CMP(b2f16, iand, true); + ALU_CASE_CMP(b2i32, iand, true); /* Likewise, we don't have a dedicated f2b32 instruction, but * we can do a "not equal to 0.0" test. 
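 * (Concretely, f2b32(x) is emitted as fne(x, 0.0); the zero constant is plonked
 * in by the one-source fixup later in emit_alu, giving ~0 for true and 0 for false.)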
*/ - ALU_CASE(f2b32, fne); - ALU_CASE(i2b32, ine); + ALU_CASE_CMP(f2b32, fne, false); + ALU_CASE_CMP(i2b32, ine, true); ALU_CASE(frcp, frcp); ALU_CASE(frsq, frsqrt); @@ -737,26 +944,30 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ALU_CASE(fexp2, fexp2); ALU_CASE(flog2, flog2); - ALU_CASE(f2i64, f2i_rtz); - ALU_CASE(f2u64, f2u_rtz); - ALU_CASE(i2f64, i2f_rtz); - ALU_CASE(u2f64, u2f_rtz); + ALU_CASE_RTZ(f2i64, f2i_rte); + ALU_CASE_RTZ(f2u64, f2u_rte); + ALU_CASE_RTZ(i2f64, i2f_rte); + ALU_CASE_RTZ(u2f64, u2f_rte); + + ALU_CASE_RTZ(f2i32, f2i_rte); + ALU_CASE_RTZ(f2u32, f2u_rte); + ALU_CASE_RTZ(i2f32, i2f_rte); + ALU_CASE_RTZ(u2f32, u2f_rte); - ALU_CASE(f2i32, f2i_rtz); - ALU_CASE(f2u32, f2u_rtz); - ALU_CASE(i2f32, i2f_rtz); - ALU_CASE(u2f32, u2f_rtz); + ALU_CASE_RTZ(f2i8, f2i_rte); + ALU_CASE_RTZ(f2u8, f2u_rte); - ALU_CASE(f2i16, f2i_rtz); - ALU_CASE(f2u16, f2u_rtz); - ALU_CASE(i2f16, i2f_rtz); - ALU_CASE(u2f16, u2f_rtz); + ALU_CASE_RTZ(f2i16, f2i_rte); + ALU_CASE_RTZ(f2u16, f2u_rte); + ALU_CASE_RTZ(i2f16, i2f_rte); + ALU_CASE_RTZ(u2f16, u2f_rte); ALU_CASE(fsin, fsin); ALU_CASE(fcos, fcos); - /* We'll set invert */ - ALU_CASE(inot, imov); + /* We'll get 0 in the second arg, so: + * ~a = ~(a | 0) = nor(a, 0) */ + ALU_CASE(inot, inor); ALU_CASE(iand, iand); ALU_CASE(ior, ior); ALU_CASE(ixor, ixor); @@ -766,24 +977,26 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2); ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3); - ALU_CASE(b32all_fequal4, fball_eq); + ALU_CASE_CMP(b32all_fequal4, fball_eq, true); ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2); ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3); - ALU_CASE(b32any_fnequal4, fbany_neq); + ALU_CASE_CMP(b32any_fnequal4, fbany_neq, true); ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2); ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3); - ALU_CASE(b32all_iequal4, iball_eq); + ALU_CASE_CMP(b32all_iequal4, iball_eq, true); ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2); ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3); - ALU_CASE(b32any_inequal4, ibany_neq); + ALU_CASE_CMP(b32any_inequal4, ibany_neq, true); /* Source mods will be shoved in later */ ALU_CASE(fabs, fmov); ALU_CASE(fneg, fmov); ALU_CASE(fsat, fmov); + ALU_CASE(fsat_signed, fmov); + ALU_CASE(fclamp_pos, fmov); /* For size conversion, we use a move. Ideally though we would squash * these ops together; maybe that has to happen after in NIR as part of @@ -796,11 +1009,6 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) case nir_op_i2i16: case nir_op_i2i32: case nir_op_i2i64: - /* If we end up upscale, we'll need a sign-extend on the - * operand (the second argument) */ - - sext_2 = true; - /* fallthrough */ case nir_op_u2u8: case nir_op_u2u16: case nir_op_u2u32: @@ -814,17 +1022,6 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) else op = midgard_alu_op_imov; - if (dst_bitsize == (src_bitsize * 2)) { - /* Converting up */ - half_2 = true; - - /* Use a greater register mode */ - reg_mode++; - } else if (src_bitsize == (dst_bitsize * 2)) { - /* Converting down */ - dest_override = midgard_dest_override_lower; - } - break; } @@ -842,37 +1039,26 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) instr->op == nir_op_uge32 ? 
midgard_alu_op_ule : 0; - /* Swap via temporary */ - nir_alu_src temp = instr->src[1]; - instr->src[1] = instr->src[0]; - instr->src[0] = temp; - + flip_src12 = true; + ALU_CHECK_CMP(false); break; } case nir_op_b32csel: { - /* Midgard features both fcsel and icsel, depending on - * the type of the arguments/output. However, as long - * as we're careful we can _always_ use icsel and - * _never_ need fcsel, since the latter does additional - * floating-point-specific processing whereas the - * former just moves bits on the wire. It's not obvious - * why these are separate opcodes, save for the ability - * to do things like sat/pos/abs/neg for free */ - bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components); - op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel; - - /* The condition is the first argument; move the other - * arguments up one to be a binary instruction for - * Midgard with the condition last */ + bool is_float = mir_is_bcsel_float(instr); + op = is_float ? + (mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) : + (mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel); - nir_alu_src temp = instr->src[2]; - - instr->src[2] = instr->src[0]; - instr->src[0] = instr->src[1]; - instr->src[1] = temp; + break; + } + case nir_op_unpack_32_2x16: + case nir_op_unpack_32_4x8: + case nir_op_pack_32_2x16: + case nir_op_pack_32_4x8: { + op = midgard_alu_op_imov; break; } @@ -882,117 +1068,106 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) return; } - /* Midgard can perform certain modifiers on output of an ALU op */ - unsigned outmod; - - if (midgard_is_integer_out_op(op)) { - outmod = midgard_outmod_int_wrap; - } else { - bool sat = instr->dest.saturate || instr->op == nir_op_fsat; - outmod = sat ? midgard_outmod_sat : midgard_outmod_none; + /* Promote imov to fmov if it might help inline a constant */ + if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src) + && nir_src_bit_size(instr->src[0].src) == 32 + && nir_is_same_comp_swizzle(instr->src[0].swizzle, + nir_src_num_components(instr->src[0].src))) { + op = midgard_alu_op_fmov; } - /* fmax(a, 0.0) can turn into a .pos modifier as an optimization */ + /* Midgard can perform certain modifiers on output of an ALU op */ - if (instr->op == nir_op_fmax) { - if (nir_is_fzero_constant(instr->src[0].src)) { - op = midgard_alu_op_fmov; - nr_inputs = 1; - outmod = midgard_outmod_pos; - instr->src[0] = instr->src[1]; - } else if (nir_is_fzero_constant(instr->src[1].src)) { - op = midgard_alu_op_fmov; - nr_inputs = 1; - outmod = midgard_outmod_pos; - } + unsigned outmod = 0; + bool is_int = midgard_is_integer_op(op); + + if (instr->op == nir_op_umul_high || instr->op == nir_op_imul_high) { + outmod = midgard_outmod_int_high; + } else if (midgard_is_integer_out_op(op)) { + outmod = midgard_outmod_int_wrap; + } else if (instr->op == nir_op_fsat) { + outmod = midgard_outmod_sat; + } else if (instr->op == nir_op_fsat_signed) { + outmod = midgard_outmod_sat_signed; + } else if (instr->op == nir_op_fclamp_pos) { + outmod = midgard_outmod_pos; } /* Fetch unit, quirks, etc information */ unsigned opcode_props = alu_opcode_props[op].props; bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; + if (!midgard_is_integer_out_op(op)) { + outmod = mir_determine_float_outmod(ctx, &dest, outmod); + } + midgard_instruction ins = { .type = TAG_ALU_4, - .dest = nir_dest_index(&instr->dest.dest), + .dest = nir_dest_index(dest), .dest_type = nir_op_infos[instr->op].output_type - | 
nir_dest_bit_size(instr->dest.dest), + | nir_dest_bit_size(*dest), + .roundmode = roundmode, }; + enum midgard_roundmode *roundptr = (opcode_props & MIDGARD_ROUNDS) ? + &ins.roundmode : NULL; + for (unsigned i = nr_inputs; i < ARRAY_SIZE(ins.src); ++i) ins.src[i] = ~0; if (quirk_flipped_r24) { ins.src[0] = ~0; - mir_copy_src(&ins, instr, 0, 1); + mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], &ins.src_invert[1], roundptr, is_int, broadcast_swizzle); } else { - for (unsigned i = 0; i < nr_inputs; ++i) - mir_copy_src(&ins, instr, i, quirk_flipped_r24 ? 1 : i); - } + for (unsigned i = 0; i < nr_inputs; ++i) { + unsigned to = i; + + if (instr->op == nir_op_b32csel) { + /* The condition is the first argument; move + * the other arguments up one to be a binary + * instruction for Midgard with the condition + * last */ + + if (i == 0) + to = 2; + else if (flip_src12) + to = 2 - i; + else + to = i - 1; + } else if (flip_src12) { + to = 1 - to; + } - nir_alu_src *nirmods[3] = { NULL }; + mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], &ins.src_invert[to], roundptr, is_int, broadcast_swizzle); - if (nr_inputs >= 2) { - nirmods[0] = &instr->src[0]; - nirmods[1] = &instr->src[1]; - } else if (nr_inputs == 1) { - nirmods[quirk_flipped_r24] = &instr->src[0]; - } else { - assert(0); + /* (!c) ? a : b = c ? b : a */ + if (instr->op == nir_op_b32csel && ins.src_invert[2]) { + ins.src_invert[2] = false; + flip_src12 ^= true; + } + } } - if (nr_inputs == 3) - nirmods[2] = &instr->src[2]; - - /* These were lowered to a move, so apply the corresponding mod */ - if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) { - nir_alu_src *s = nirmods[quirk_flipped_r24]; - + /* Lowered to move */ if (instr->op == nir_op_fneg) - s->negate = !s->negate; + ins.src_neg[1] ^= true; if (instr->op == nir_op_fabs) - s->abs = !s->abs; + ins.src_abs[1] = true; } - bool is_int = midgard_is_integer_op(op); - ins.mask = mask_of(nr_components); - midgard_vector_alu alu = { - .op = op, - .reg_mode = reg_mode, - .dest_override = dest_override, - .outmod = outmod, - - .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, sext_1)), - .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1], is_int, broadcast_swizzle, half_2, sext_2)), - }; - - /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */ + /* Apply writemask if non-SSA, keeping in mind that we can't write to + * components that don't exist. Note modifier => SSA => !reg => no + * writemask, so we don't have to worry about writemasks here.*/ if (!is_ssa) ins.mask &= instr->dest.write_mask; - for (unsigned m = 0; m < 3; ++m) { - if (!nirmods[m]) - continue; - - for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) - ins.swizzle[m][c] = nirmods[m]->swizzle[c]; - - /* Replicate. 
TODO: remove when vec16 lands */ - for (unsigned c = NIR_MAX_VEC_COMPONENTS; c < MIR_VEC_COMPONENTS; ++c) - ins.swizzle[m][c] = nirmods[m]->swizzle[NIR_MAX_VEC_COMPONENTS - 1]; - } - - if (nr_inputs == 3) { - /* Conditions can't have mods */ - assert(!nirmods[2]->abs); - assert(!nirmods[2]->negate); - } - - ins.alu = alu; + ins.op = op; + ins.outmod = outmod; /* Late fixup for emulated instructions */ @@ -1012,20 +1187,42 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) else ins.constants.i32[0] = 1; + for (unsigned c = 0; c < 16; ++c) + ins.swizzle[1][c] = 0; + } else if (instr->op == nir_op_b2f16) { + ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.src_types[1] = nir_type_float16; + ins.has_constants = true; + ins.constants.i16[0] = _mesa_float_to_half(1.0); + for (unsigned c = 0; c < 16; ++c) ins.swizzle[1][c] = 0; } else if (nr_inputs == 1 && !quirk_flipped_r24) { /* Lots of instructions need a 0 plonked in */ ins.has_inline_constant = false; ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); - ins.src_types[1] = nir_type_uint32; + ins.src_types[1] = ins.src_types[0]; ins.has_constants = true; ins.constants.u32[0] = 0; for (unsigned c = 0; c < 16; ++c) ins.swizzle[1][c] = 0; - } else if (instr->op == nir_op_inot) { - ins.invert = true; + } else if (instr->op == nir_op_pack_32_2x16) { + ins.dest_type = nir_type_uint16; + ins.mask = mask_of(nr_components * 2); + ins.is_pack = true; + } else if (instr->op == nir_op_pack_32_4x8) { + ins.dest_type = nir_type_uint8; + ins.mask = mask_of(nr_components * 4); + ins.is_pack = true; + } else if (instr->op == nir_op_unpack_32_2x16) { + ins.dest_type = nir_type_uint32; + ins.mask = mask_of(nr_components >> 1); + ins.is_pack = true; + } else if (instr->op == nir_op_unpack_32_4x8) { + ins.dest_type = nir_type_uint32; + ins.mask = mask_of(nr_components >> 2); + ins.is_pack = true; } if ((opcode_props & UNITS_ALL) == UNIT_VLUT) { @@ -1035,6 +1232,12 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) unsigned orig_mask = ins.mask; + unsigned swizzle_back[MIR_VEC_COMPONENTS]; + memcpy(&swizzle_back, ins.swizzle[0], sizeof(swizzle_back)); + + midgard_instruction ins_split[MIR_VEC_COMPONENTS]; + unsigned ins_count = 0; + for (int i = 0; i < nr_components; ++i) { /* Mask the associated component, dropping the * instruction if needed */ @@ -1042,13 +1245,27 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ins.mask = 1 << i; ins.mask &= orig_mask; + for (unsigned j = 0; j < ins_count; ++j) { + if (swizzle_back[i] == ins_split[j].swizzle[0][0]) { + ins_split[j].mask |= ins.mask; + ins.mask = 0; + break; + } + } + if (!ins.mask) continue; for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j) - ins.swizzle[0][j] = nirmods[0]->swizzle[i]; /* Pull from the correct component */ + ins.swizzle[0][j] = swizzle_back[i]; /* Pull from the correct component */ + + ins_split[ins_count] = ins; + + ++ins_count; + } - emit_mir_instruction(ctx, ins); + for (unsigned i = 0; i < ins_count; ++i) { + emit_mir_instruction(ctx, ins_split[i]); } } else { emit_mir_instruction(ctx, ins); @@ -1075,9 +1292,7 @@ mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read) /* Once we have the NIR mask, we need to normalize to work in 32-bit space */ unsigned bytemask = pan_to_bytemask(dsize, nir_mask); mir_set_bytemask(ins, bytemask); - - if (dsize == 64) - ins->load_64 = true; + ins->dest_type = nir_type_uint | dsize; } /* Uniforms and UBOs use a shared code path, as uniforms are just (slightly @@ -1105,6 +1320,11 @@ emit_ubo_read( ins.src[2] = 
nir_src_index(ctx, indirect_offset); ins.src_types[2] = nir_type_uint32; ins.load_store.arg_2 = (indirect_shift << 5); + + /* X component for the whole swizzle to prevent register + * pressure from ballooning from the extra components */ + for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[2]); ++i) + ins.swizzle[2][i] = 0; } else { ins.load_store.arg_2 = 0x1E; } @@ -1138,6 +1358,15 @@ emit_global( mir_set_offset(ctx, &ins, offset, is_shared); mir_set_intr_mask(instr, &ins, is_read); + /* Set a valid swizzle for masked out components */ + assert(ins.mask); + unsigned first_component = __builtin_ffs(ins.mask) - 1; + + for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) { + if (!(ins.mask & (1 << i))) + ins.swizzle[0][i] = first_component; + } + emit_mir_instruction(ctx, ins); } @@ -1153,6 +1382,12 @@ emit_varying_read( midgard_instruction ins = m_ld_vary_32(dest, offset); ins.mask = mask_of(nr_comp); + ins.dest_type = type; + + if (type == nir_type_float16) { + /* Ensure we are aligned so we can pack it later */ + ins.mask = mask_of(ALIGN_POT(nr_comp, 2)); + } for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) ins.swizzle[0][i] = MIN2(i + component, COMPONENT_W); @@ -1177,15 +1412,18 @@ emit_varying_read( /* Use the type appropriate load */ switch (type) { - case nir_type_uint: - case nir_type_bool: - ins.load_store.op = midgard_op_ld_vary_32u; + case nir_type_uint32: + case nir_type_bool32: + ins.op = midgard_op_ld_vary_32u; break; - case nir_type_int: - ins.load_store.op = midgard_op_ld_vary_32i; + case nir_type_int32: + ins.op = midgard_op_ld_vary_32i; break; - case nir_type_float: - ins.load_store.op = midgard_op_ld_vary_32; + case nir_type_float32: + ins.op = midgard_op_ld_vary_32; + break; + case nir_type_float16: + ins.op = midgard_op_ld_vary_16; break; default: unreachable("Attempted to load unknown type"); @@ -1210,13 +1448,13 @@ emit_attr_read( switch (t) { case nir_type_uint: case nir_type_bool: - ins.load_store.op = midgard_op_ld_attr_32u; + ins.op = midgard_op_ld_attr_32u; break; case nir_type_int: - ins.load_store.op = midgard_op_ld_attr_32i; + ins.op = midgard_op_ld_attr_32i; break; case nir_type_float: - ins.load_store.op = midgard_op_ld_attr_32; + ins.op = midgard_op_ld_attr_32; break; default: unreachable("Attempted to load unknown type"); @@ -1262,7 +1500,7 @@ compute_builtin_arg(nir_op op) } static void -emit_fragment_store(compiler_context *ctx, unsigned src, enum midgard_rt_id rt) +emit_fragment_store(compiler_context *ctx, unsigned src, unsigned src_z, unsigned src_s, enum midgard_rt_id rt) { assert(rt < ARRAY_SIZE(ctx->writeout_branch)); @@ -1275,13 +1513,29 @@ emit_fragment_store(compiler_context *ctx, unsigned src, enum midgard_rt_id rt) struct midgard_instruction ins = v_branch(false, false); - ins.writeout = true; + bool depth_only = (rt == MIDGARD_ZS_RT); + + ins.writeout = depth_only ? 0 : PAN_WRITEOUT_C; /* Add dependencies */ ins.src[0] = src; ins.src_types[0] = nir_type_uint32; - ins.constants.u32[0] = rt == MIDGARD_ZS_RT ? - 0xFF : (rt - MIDGARD_COLOR_RT0) * 0x100; + ins.constants.u32[0] = depth_only ? 
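+                /* Reading the constant straight off this expression: 0xFF selects the
+                 * depth/stencil-only writeout, while color RT n gets n * 0x100. */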
0xFF : (rt - MIDGARD_COLOR_RT0) * 0x100; + for (int i = 0; i < 4; ++i) + ins.swizzle[0][i] = i; + + if (~src_z) { + emit_explicit_constant(ctx, src_z, src_z); + ins.src[2] = src_z; + ins.src_types[2] = nir_type_uint32; + ins.writeout |= PAN_WRITEOUT_Z; + } + if (~src_s) { + emit_explicit_constant(ctx, src_s, src_s); + ins.src[3] = src_s; + ins.src_types[3] = nir_type_uint32; + ins.writeout |= PAN_WRITEOUT_S; + } /* Emit the branch */ br = emit_mir_instruction(ctx, ins); @@ -1325,33 +1579,72 @@ emit_vertex_builtin(compiler_context *ctx, nir_intrinsic_instr *instr) emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, nir_type_int); } +static void +emit_special(compiler_context *ctx, nir_intrinsic_instr *instr, unsigned idx) +{ + unsigned reg = nir_dest_index(&instr->dest); + + midgard_instruction ld = m_ld_color_buffer_32u(reg, 0); + ld.op = midgard_op_ld_color_buffer_32u_old; + ld.load_store.address = idx; + ld.load_store.arg_2 = 0x1E; + + for (int i = 0; i < 4; ++i) + ld.swizzle[0][i] = COMPONENT_X; + + emit_mir_instruction(ctx, ld); +} + static void emit_control_barrier(compiler_context *ctx) { midgard_instruction ins = { .type = TAG_TEXTURE_4, + .dest = ~0, .src = { ~0, ~0, ~0, ~0 }, - .texture = { - .op = TEXTURE_OP_BARRIER, - - /* TODO: optimize */ - .barrier_buffer = 1, - .barrier_shared = 1 - } + .op = TEXTURE_OP_BARRIER, }; emit_mir_instruction(ctx, ins); } -static const nir_variable * -search_var(struct exec_list *vars, unsigned driver_loc) +static unsigned +mir_get_branch_cond(nir_src *src, bool *invert) { - nir_foreach_variable(var, vars) { - if (var->data.driver_location == driver_loc) - return var; - } + /* Wrap it. No swizzle since it's a scalar */ - return NULL; + nir_alu_src alu = { + .src = *src + }; + + *invert = pan_has_source_mod(&alu, nir_op_inot); + return nir_src_index(NULL, &alu.src); +} + +static uint8_t +output_load_rt_addr(compiler_context *ctx, nir_intrinsic_instr *instr) +{ + if (ctx->is_blend) + return ctx->blend_rt; + + const nir_variable *var; + var = search_var(ctx->nir, nir_var_shader_out, nir_intrinsic_base(instr)); + assert(var); + + unsigned loc = var->data.location; + + if (loc == FRAG_RESULT_COLOR) + loc = FRAG_RESULT_DATA0; + + if (loc >= FRAG_RESULT_DATA0) + return loc - FRAG_RESULT_DATA0; + + if (loc == FRAG_RESULT_DEPTH) + return 0x1F; + if (loc == FRAG_RESULT_STENCIL) + return 0x1E; + + unreachable("Invalid RT to load from"); } static void @@ -1367,7 +1660,8 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) discard.branch.target_type = TARGET_DISCARD; if (conditional) { - discard.src[0] = nir_src_index(ctx, &instr->src[0]); + discard.src[0] = mir_get_branch_cond(&instr->src[0], + &discard.branch.invert_conditional); discard.src_types[0] = nir_type_uint32; } @@ -1431,14 +1725,17 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) } else if (is_global || is_shared) { emit_global(ctx, &instr->instr, true, reg, src_offset, is_shared); } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) { - emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t, is_flat); + emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t | nir_dest_bit_size(instr->dest), is_flat); } else if (ctx->is_blend) { - /* For blend shaders, load the input color, which is - * preloaded to r0 */ + /* ctx->blend_input will be precoloured to r0/r2, where + * the input is preloaded */ - midgard_instruction move = v_mov(SSA_FIXED_REGISTER(0), reg); - emit_mir_instruction(ctx, move); - 
schedule_barrier(ctx); + unsigned *input = offset ? &ctx->blend_src1 : &ctx->blend_input; + + if (*input == ~0) + *input = reg; + else + emit_mir_instruction(ctx, v_mov(*input, reg)); } else if (ctx->stage == MESA_SHADER_VERTEX) { emit_attr_read(ctx, reg, offset, nr_comp, t); } else { @@ -1456,33 +1753,62 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) /* Reads 128-bit value raw off the tilebuffer during blending, tasty */ - case nir_intrinsic_load_raw_output_pan: - case nir_intrinsic_load_output_u8_as_fp16_pan: + case nir_intrinsic_load_raw_output_pan: { reg = nir_dest_index(&instr->dest); - assert(ctx->is_blend); /* T720 and below use different blend opcodes with slightly * different semantics than T760 and up */ midgard_instruction ld = m_ld_color_buffer_32u(reg, 0); - bool old_blend = ctx->quirks & MIDGARD_OLD_BLEND; - if (instr->intrinsic == nir_intrinsic_load_output_u8_as_fp16_pan) { - ld.load_store.op = old_blend ? - midgard_op_ld_color_buffer_u8_as_fp16_old : - midgard_op_ld_color_buffer_u8_as_fp16; + ld.load_store.arg_2 = output_load_rt_addr(ctx, instr); - if (old_blend) { - ld.load_store.address = 1; - ld.load_store.arg_2 = 0x1E; - } + if (nir_src_is_const(instr->src[0])) { + ld.load_store.arg_1 = nir_src_as_uint(instr->src[0]); + } else { + ld.load_store.varying_parameters = 2; + ld.src[1] = nir_src_index(ctx, &instr->src[0]); + ld.src_types[1] = nir_type_int32; + } + + if (ctx->quirks & MIDGARD_OLD_BLEND) { + ld.op = midgard_op_ld_color_buffer_32u_old; + ld.load_store.address = 16; + ld.load_store.arg_2 = 0x1E; + } + + emit_mir_instruction(ctx, ld); + break; + } - for (unsigned c = 2; c < 16; ++c) - ld.swizzle[0][c] = 0; + case nir_intrinsic_load_output: { + reg = nir_dest_index(&instr->dest); + + unsigned bits = nir_dest_bit_size(instr->dest); + + midgard_instruction ld; + if (bits == 16) + ld = m_ld_color_buffer_as_fp16(reg, 0); + else + ld = m_ld_color_buffer_as_fp32(reg, 0); + + ld.load_store.arg_2 = output_load_rt_addr(ctx, instr); + + for (unsigned c = 4; c < 16; ++c) + ld.swizzle[0][c] = 0; + + if (ctx->quirks & MIDGARD_OLD_BLEND) { + if (bits == 16) + ld.op = midgard_op_ld_color_buffer_as_fp16_old; + else + ld.op = midgard_op_ld_color_buffer_as_fp32_old; + ld.load_store.address = 1; + ld.load_store.arg_2 = 0x1E; } emit_mir_instruction(ctx, ld); break; + } case nir_intrinsic_load_blend_const_color_rgba: { assert(ctx->is_blend); @@ -1498,23 +1824,8 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) break; } - case nir_intrinsic_store_zs_output_pan: { - assert(ctx->stage == MESA_SHADER_FRAGMENT); - emit_fragment_store(ctx, nir_src_index(ctx, &instr->src[0]), - MIDGARD_ZS_RT); - - midgard_instruction *br = ctx->writeout_branch[MIDGARD_ZS_RT]; - - if (!nir_intrinsic_component(instr)) - br->writeout_depth = true; - if (nir_intrinsic_component(instr) || - instr->num_components) - br->writeout_stencil = true; - assert(br->writeout_depth | br->writeout_stencil); - break; - } - case nir_intrinsic_store_output: + case nir_intrinsic_store_combined_output_pan: assert(nir_src_is_const(instr->src[1]) && "no indirect outputs"); offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]); @@ -1522,22 +1833,57 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) reg = nir_src_index(ctx, &instr->src[0]); if (ctx->stage == MESA_SHADER_FRAGMENT) { - const nir_variable *var; - enum midgard_rt_id rt; + bool combined = instr->intrinsic == + nir_intrinsic_store_combined_output_pan; - var = search_var(&ctx->nir->outputs, + const 
nir_variable *var; + var = search_var(ctx->nir, nir_var_shader_out, nir_intrinsic_base(instr)); assert(var); + + /* Dual-source blend writeout is done by leaving the + * value in r2 for the blend shader to use. */ + if (var->data.index) { + if (instr->src[0].is_ssa) { + emit_explicit_constant(ctx, reg, reg); + + unsigned out = make_compiler_temp(ctx); + + midgard_instruction ins = v_mov(reg, out); + emit_mir_instruction(ctx, ins); + + ctx->blend_src1 = out; + } else { + ctx->blend_src1 = reg; + } + + break; + } + + enum midgard_rt_id rt; if (var->data.location == FRAG_RESULT_COLOR) rt = MIDGARD_COLOR_RT0; else if (var->data.location >= FRAG_RESULT_DATA0) rt = MIDGARD_COLOR_RT0 + var->data.location - FRAG_RESULT_DATA0; + else if (combined) + rt = MIDGARD_ZS_RT; else - assert(0); + unreachable("bad rt"); + + unsigned reg_z = ~0, reg_s = ~0; + if (combined) { + unsigned writeout = nir_intrinsic_component(instr); + if (writeout & PAN_WRITEOUT_Z) + reg_z = nir_src_index(ctx, &instr->src[2]); + if (writeout & PAN_WRITEOUT_S) + reg_s = nir_src_index(ctx, &instr->src[3]); + } - emit_fragment_store(ctx, reg, rt); + emit_fragment_store(ctx, reg, reg_z, reg_s, rt); } else if (ctx->stage == MESA_SHADER_VERTEX) { + assert(instr->intrinsic == nir_intrinsic_store_output); + /* We should have been vectorized, though we don't * currently check that st_vary is emitted only once * per slot (this is relevant, since there's not a mask @@ -1558,13 +1904,13 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) switch (nir_alu_type_get_base_type(nir_intrinsic_type(instr))) { case nir_type_uint: case nir_type_bool: - st.load_store.op = midgard_op_st_vary_32u; + st.op = midgard_op_st_vary_32u; break; case nir_type_int: - st.load_store.op = midgard_op_st_vary_32i; + st.op = midgard_op_st_vary_32i; break; case nir_type_float: - st.load_store.op = midgard_op_st_vary_32; + st.op = midgard_op_st_vary_32; break; default: unreachable("Attempted to store unknown type"); @@ -1598,27 +1944,7 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_store_raw_output_pan: assert (ctx->stage == MESA_SHADER_FRAGMENT); reg = nir_src_index(ctx, &instr->src[0]); - - if (ctx->quirks & MIDGARD_OLD_BLEND) { - /* Suppose reg = qr0.xyzw. That means 4 8-bit ---> 1 32-bit. So - * reg = r0.x. We want to splatter. 
So we can do a 32-bit move - * of: - * - * imov r0.xyzw, r0.xxxx - */ - - unsigned expanded = make_compiler_temp(ctx); - - midgard_instruction splatter = v_mov(reg, expanded); - - for (unsigned c = 0; c < 16; ++c) - splatter.swizzle[1][c] = 0; - - emit_mir_instruction(ctx, splatter); - emit_fragment_store(ctx, expanded, ctx->blend_rt); - } else - emit_fragment_store(ctx, reg, ctx->blend_rt); - + emit_fragment_store(ctx, reg, ~0, ~0, ctx->blend_rt); break; case nir_intrinsic_store_global: @@ -1654,6 +1980,14 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) emit_vertex_builtin(ctx, instr); break; + case nir_intrinsic_load_sample_mask_in: + emit_special(ctx, instr, 96); + break; + + case nir_intrinsic_load_sample_id: + emit_special(ctx, instr, 97); + break; + case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_shared: break; @@ -1671,24 +2005,26 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) } } +/* Returns dimension with 0 special casing cubemaps */ static unsigned midgard_tex_format(enum glsl_sampler_dim dim) { switch (dim) { case GLSL_SAMPLER_DIM_1D: case GLSL_SAMPLER_DIM_BUF: - return MALI_TEX_1D; + return 1; case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_MS: case GLSL_SAMPLER_DIM_EXTERNAL: case GLSL_SAMPLER_DIM_RECT: - return MALI_TEX_2D; + return 2; case GLSL_SAMPLER_DIM_3D: - return MALI_TEX_3D; + return 3; case GLSL_SAMPLER_DIM_CUBE: - return MALI_TEX_CUBE; + return 0; default: DBG("Unknown sampler dim type\n"); @@ -1697,7 +2033,7 @@ midgard_tex_format(enum glsl_sampler_dim dim) } } -/* Tries to attach an explicit LOD / bias as a constant. Returns whether this +/* Tries to attach an explicit LOD or bias as a constant. Returns whether this * was successful */ static bool @@ -1730,6 +2066,19 @@ pan_attach_constant_bias( return true; } +static enum mali_texture_mode +mdg_texture_mode(nir_tex_instr *instr) +{ + if (instr->op == nir_texop_tg4 && instr->is_shadow) + return TEXTURE_GATHER_SHADOW; + else if (instr->op == nir_texop_tg4) + return TEXTURE_GATHER_X + instr->component; + else if (instr->is_shadow) + return TEXTURE_SHADOW; + else + return TEXTURE_NORMAL; +} + static void emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, unsigned midgard_texop) @@ -1737,29 +2086,38 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, /* TODO */ //assert (!instr->sampler); + nir_dest *dest = &instr->dest; + int texture_index = instr->texture_index; int sampler_index = texture_index; nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type); - nir_alu_type dest_type = dest_base | nir_dest_bit_size(instr->dest); + nir_alu_type dest_type = dest_base | nir_dest_bit_size(*dest); + + /* texture instructions support float outmods */ + unsigned outmod = midgard_outmod_none; + if (dest_base == nir_type_float) { + outmod = mir_determine_float_outmod(ctx, &dest, 0); + } midgard_instruction ins = { .type = TAG_TEXTURE_4, .mask = 0xF, - .dest = nir_dest_index(&instr->dest), + .dest = nir_dest_index(dest), .src = { ~0, ~0, ~0, ~0 }, .dest_type = dest_type, .swizzle = SWIZZLE_IDENTITY_4, + .outmod = outmod, + .op = midgard_texop, .texture = { - .op = midgard_texop, .format = midgard_tex_format(instr->sampler_dim), .texture_handle = texture_index, .sampler_handle = sampler_index, - .shadow = instr->is_shadow, + .mode = mdg_texture_mode(instr) } }; - if (instr->is_shadow && !instr->is_new_style_shadow) + if (instr->is_shadow && !instr->is_new_style_shadow && instr->op != nir_texop_tg4) for (int i = 0; i < 4; 
++i) ins.swizzle[0][i] = COMPONENT_X; @@ -1895,7 +2253,8 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, break; }; - case nir_tex_src_comparator: { + case nir_tex_src_comparator: + case nir_tex_src_ms_index: { unsigned comp = COMPONENT_Z; /* mov coord_temp.foo, coords */ @@ -1917,9 +2276,6 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, } emit_mir_instruction(ctx, ins); - - /* Used for .cont and .last hinting */ - ctx->texture_op_count++; } static void @@ -1931,9 +2287,11 @@ emit_tex(compiler_context *ctx, nir_tex_instr *instr) emit_texop_native(ctx, instr, TEXTURE_OP_NORMAL); break; case nir_texop_txl: + case nir_texop_tg4: emit_texop_native(ctx, instr, TEXTURE_OP_LOD); break; case nir_texop_txf: + case nir_texop_txf_ms: emit_texop_native(ctx, instr, TEXTURE_OP_TEXEL_FETCH); break; case nir_texop_txs: @@ -2050,6 +2408,68 @@ inline_alu_constants(compiler_context *ctx, midgard_block *block) } } +unsigned +max_bitsize_for_alu(midgard_instruction *ins) +{ + unsigned max_bitsize = 0; + for (int i = 0; i < MIR_SRC_COUNT; i++) { + if (ins->src[i] == ~0) continue; + unsigned src_bitsize = nir_alu_type_get_type_size(ins->src_types[i]); + max_bitsize = MAX2(src_bitsize, max_bitsize); + } + unsigned dst_bitsize = nir_alu_type_get_type_size(ins->dest_type); + max_bitsize = MAX2(dst_bitsize, max_bitsize); + + /* We don't have fp16 LUTs, so we'll want to emit code like: + * + * vlut.fsinr hr0, hr0 + * + * where both input and output are 16-bit but the operation is carried + * out in 32-bit + */ + + switch (ins->op) { + case midgard_alu_op_fsqrt: + case midgard_alu_op_frcp: + case midgard_alu_op_frsqrt: + case midgard_alu_op_fsin: + case midgard_alu_op_fcos: + case midgard_alu_op_fexp2: + case midgard_alu_op_flog2: + max_bitsize = MAX2(max_bitsize, 32); + break; + + default: + break; + } + + /* High implies computing at a higher bitsize, e.g umul_high of 32-bit + * requires computing at 64-bit */ + if (midgard_is_integer_out_op(ins->op) && ins->outmod == midgard_outmod_int_high) { + max_bitsize *= 2; + assert(max_bitsize <= 64); + } + + return max_bitsize; +} + +midgard_reg_mode +reg_mode_for_bitsize(unsigned bitsize) +{ + switch (bitsize) { + /* use 16 pipe for 8 since we don't support vec16 yet */ + case 8: + case 16: + return midgard_reg_mode_16; + case 32: + return midgard_reg_mode_32; + case 64: + return midgard_reg_mode_64; + default: + unreachable("invalid bit size"); + } +} + /* Midgard supports two types of constants, embedded constants (128-bit) and * inline constants (16-bit). Sometimes, especially with scalar ops, embedded * constants can be demoted to inline constants, for space savings and @@ -2065,9 +2485,11 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) /* Blend constants must not be inlined by definition */ if (ins->has_blend_constant) continue; + unsigned max_bitsize = max_bitsize_for_alu(ins); + /* We can inline 32-bit (sometimes) or 16-bit (usually) */ - bool is_16 = ins->alu.reg_mode == midgard_reg_mode_16; - bool is_32 = ins->alu.reg_mode == midgard_reg_mode_32; + bool is_16 = max_bitsize == 16; + bool is_32 = max_bitsize == 32; if (!(is_16 || is_32)) continue; @@ -2076,7 +2498,7 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) * restrictions. 
So, if possible we try to flip the arguments * in that case */ - int op = ins->alu.op; + int op = ins->op; if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) && alu_opcode_props[op].props & OP_COMMUTES) { @@ -2084,20 +2506,13 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) } if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { - /* Extract the source information */ - - midgard_vector_alu_src *src; - int q = ins->alu.src2; - midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; - src = m; - /* Component is from the swizzle. Take a nonzero component */ assert(ins->mask); unsigned first_comp = ffs(ins->mask) - 1; unsigned component = ins->swizzle[1][first_comp]; /* Scale constant appropriately, if we can legally */ - uint16_t scaled_constant = 0; + int16_t scaled_constant = 0; if (is_16) { scaled_constant = ins->constants.u16[component]; @@ -2124,12 +2539,9 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) continue; } - /* We don't know how to handle these with a constant */ - - if (mir_nontrivial_source2_mod_simple(ins) || src->rep_low || src->rep_high) { - DBG("Bailing inline constant...\n"); + /* Should've been const folded */ + if (ins->src_abs[1] || ins->src_neg[1]) continue; - } /* Make sure that the constant is not itself a vector * by checking if all accessed values are the same. */ @@ -2138,7 +2550,7 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) uint32_t value = is_16 ? cons->u16[component] : cons->u32[component]; bool is_vector = false; - unsigned mask = effective_writemask(&ins->alu, ins->mask); + unsigned mask = effective_writemask(ins->op, ins->mask); for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) { /* We only care if this component is actually used */ @@ -2185,65 +2597,36 @@ midgard_cull_dead_branch(compiler_context *ctx, midgard_block *block) } } -/* fmov.pos is an idiom for fpos. Propoagate the .pos up to the source, so then - * the move can be propagated away entirely */ - -static bool -mir_compose_float_outmod(midgard_outmod_float *outmod, midgard_outmod_float comp) -{ - /* Nothing to do */ - if (comp == midgard_outmod_none) - return true; - - if (*outmod == midgard_outmod_none) { - *outmod = comp; - return true; - } - - /* TODO: Compose rules */ - return false; -} +/* We want to force the invert on AND/OR to the second slot to legalize into + * iandnot/iornot. The relevant patterns are for AND (and OR respectively) + * + * ~a & #b = ~a & ~(#~b) + * ~a & b = b & ~a + */ -static bool -midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block) +static void +midgard_legalize_invert(compiler_context *ctx, midgard_block *block) { - bool progress = false; - - mir_foreach_instr_in_block_safe(block, ins) { + mir_foreach_instr_in_block(block, ins) { if (ins->type != TAG_ALU_4) continue; - if (ins->alu.op != midgard_alu_op_fmov) continue; - if (ins->alu.outmod != midgard_outmod_pos) continue; - - /* TODO: Registers? */ - unsigned src = ins->src[1]; - if (src & PAN_IS_REG) continue; - - /* There might be a source modifier, too */ - if (mir_nontrivial_source2_mod(ins)) continue; - /* Backpropagate the modifier */ - mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) { - if (v->type != TAG_ALU_4) continue; - if (v->dest != src) continue; + if (ins->op != midgard_alu_op_iand && + ins->op != midgard_alu_op_ior) continue; - /* Can we even take a float outmod? 
*/ - if (midgard_is_integer_out_op(v->alu.op)) continue; + if (ins->src_invert[1] || !ins->src_invert[0]) continue; - midgard_outmod_float temp = v->alu.outmod; - progress |= mir_compose_float_outmod(&temp, ins->alu.outmod); - - /* Throw in the towel.. */ - if (!progress) break; - - /* Otherwise, transfer the modifier */ - v->alu.outmod = temp; - ins->alu.outmod = midgard_outmod_none; - - break; + if (ins->has_inline_constant) { + /* ~(#~a) = ~(~#a) = a, so valid, and forces both + * inverts on */ + ins->inline_constant = ~ins->inline_constant; + ins->src_invert[1] = true; + } else { + /* Flip to the right invert order. Note + * has_inline_constant false by assumption on the + * branch, so flipping makes sense. */ + mir_flip(ins); } } - - return progress; } static unsigned @@ -2252,11 +2635,10 @@ emit_fragment_epilogue(compiler_context *ctx, unsigned rt) /* Loop to ourselves */ midgard_instruction *br = ctx->writeout_branch[rt]; struct midgard_instruction ins = v_branch(false, false); - ins.writeout = true; - ins.writeout_depth = br->writeout_depth; - ins.writeout_stencil = br->writeout_stencil; + ins.writeout = br->writeout; ins.branch.target_block = ctx->block_count - 1; ins.constants.u32[0] = br->constants.u32[0]; + memcpy(&ins.src_types, &br->src_types, sizeof(ins.src_types)); emit_mir_instruction(ctx, ins); ctx->current_block->epilogue = true; @@ -2265,7 +2647,7 @@ emit_fragment_epilogue(compiler_context *ctx, unsigned rt) } static midgard_block * -emit_block(compiler_context *ctx, nir_block *block) +emit_block_init(compiler_context *ctx) { midgard_block *this_block = ctx->after_block; ctx->after_block = NULL; @@ -2282,6 +2664,14 @@ emit_block(compiler_context *ctx, nir_block *block) list_inithead(&this_block->base.instructions); ctx->current_block = this_block; + return this_block; +} + +static midgard_block * +emit_block(compiler_context *ctx, nir_block *block) +{ + midgard_block *this_block = emit_block_init(ctx); + nir_foreach_instr(instr, block) { emit_instr(ctx, instr); ++ctx->instruction_count; @@ -2298,10 +2688,12 @@ emit_if(struct compiler_context *ctx, nir_if *nif) midgard_block *before_block = ctx->current_block; /* Speculatively emit the branch, but we can't fill it in until later */ + bool inv = false; EMIT(branch, true, true); midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); - then_branch->src[0] = nir_src_index(ctx, &nif->condition); + then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv); then_branch->src_types[0] = nir_type_uint32; + then_branch->branch.invert_conditional = !inv; /* Emit the two subblocks. */ midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); @@ -2442,7 +2834,7 @@ emit_cf_list(struct compiler_context *ctx, struct exec_list *list) * stream and in branch targets. 
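The "speculatively emit the branch" comment above describes a common pattern: the conditional branch is created before its target exists and is patched once both arms have been emitted. The sketch below illustrates only that pattern with toy types; the actual target bookkeeping, and the way the inversion flag folds in whether the NIR condition was itself inverted, live outside this hunk.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for this sketch only. A branch records the index of the
 * block it jumps to, which is unknown when the branch is created. */
struct sketch_branch {
        bool conditional;
        bool invert;      /* branch taken when the condition is false */
        int target_block; /* -1 until patched */
};

struct sketch_ctx {
        unsigned block_count;
};

/* Pretend to emit a control-flow region and return the index of its first
 * block; the real emitter builds actual blocks here. */
static unsigned
sketch_emit_region(struct sketch_ctx *ctx, unsigned num_blocks)
{
        unsigned first = ctx->block_count;
        ctx->block_count += num_blocks;
        return first;
}

int
main(void)
{
        struct sketch_ctx ctx = { .block_count = 1 };

        /* Speculatively emit the conditional branch with no target yet. */
        struct sketch_branch then_branch = {
                .conditional = true,
                .invert = true,
                .target_block = -1,
        };

        /* Emit both arms; only afterwards is it known where the else
         * region starts. */
        sketch_emit_region(&ctx, 2);                       /* then blocks */
        unsigned else_first = sketch_emit_region(&ctx, 1); /* else blocks */

        /* Patch the placeholder so a false condition skips the then arm. */
        then_branch.target_block = (int)else_first;

        printf("conditional branch -> block %d\n", then_branch.target_block);
        return 0;
}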
An initial block might be empty, so iterate * until we find one that 'works' */ -static unsigned +unsigned midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx) { midgard_block *initial_block = mir_get_block(ctx, block_idx); @@ -2496,7 +2888,7 @@ mir_add_writeout_loops(compiler_context *ctx) } int -midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_blend, unsigned blend_rt, unsigned gpu_id, bool shaderdb) +midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_blend, unsigned blend_rt, unsigned gpu_id, bool shaderdb, bool silent) { struct util_dynarray *compiled = &program->compiled; @@ -2508,8 +2900,9 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b ctx->nir = nir; ctx->stage = nir->info.stage; ctx->is_blend = is_blend; - ctx->alpha_ref = program->alpha_ref; ctx->blend_rt = MIDGARD_COLOR_RT0 + blend_rt; + ctx->blend_input = ~0; + ctx->blend_src1 = ~0; ctx->quirks = midgard_get_quirks(gpu_id); /* Start off with a safe cutoff, allowing usage of all 16 work @@ -2520,7 +2913,6 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b /* Initialize at a global (not block) level hash tables */ ctx->ssa_constants = _mesa_hash_table_u64_create(NULL); - ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL); /* Lower gl_Position pre-optimisation, but after lowering vars to ssa * (so we don't accidentally duplicate the epilogue since mesa/st has @@ -2541,22 +2933,29 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b NIR_PASS_V(nir, nir_lower_var_copies); NIR_PASS_V(nir, nir_lower_vars_to_ssa); - NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0); + unsigned pan_quirks = panfrost_get_quirks(gpu_id); + NIR_PASS_V(nir, pan_lower_framebuffer, + program->rt_formats, is_blend, pan_quirks); + + NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + glsl_type_size, 0); NIR_PASS_V(nir, nir_lower_ssbo); NIR_PASS_V(nir, midgard_nir_lower_zs_store); /* Optimisation passes */ - optimise_nir(nir, ctx->quirks); + optimise_nir(nir, ctx->quirks, is_blend); + + NIR_PASS_V(nir, midgard_nir_reorder_writeout); - if (midgard_debug & MIDGARD_DBG_SHADERS) { + if ((midgard_debug & MIDGARD_DBG_SHADERS) && !silent) { nir_print_shader(nir, stdout); } /* Assign sysvals and counts, now that we're sure * (post-optimisation) */ - panfrost_nir_assign_sysvals(&ctx->sysvals, nir); + panfrost_nir_assign_sysvals(&ctx->sysvals, ctx, nir); program->sysval_count = ctx->sysvals.sysval_count; memcpy(program->sysvals, ctx->sysvals.sysvals, sizeof(ctx->sysvals.sysvals[0]) * ctx->sysvals.sysval_count); @@ -2567,8 +2966,21 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b list_inithead(&ctx->blocks); ctx->block_count = 0; ctx->func = func; + ctx->already_emitted = calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD)); + + if (nir->info.outputs_read && !is_blend) { + emit_block_init(ctx); + + struct midgard_instruction wait = v_branch(false, false); + wait.branch.target_type = TARGET_TILEBUF_WAIT; + + emit_mir_instruction(ctx, wait); + + ++ctx->instruction_count; + } emit_cf_list(ctx, &func->impl->body); + free(ctx->already_emitted); break; /* TODO: Multi-function shaders */ } @@ -2579,7 +2991,6 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b mir_foreach_block(ctx, _block) { midgard_block *block = (midgard_block *) _block; inline_alu_constants(ctx, block); - 
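The already_emitted allocation above is a plain one-bit-per-SSA-definition set (BITSET_WORDS and BITSET_WORD are Mesa's util/bitset.h helpers). A self-contained equivalent using raw 32-bit words, for illustration only, looks like this:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* One bit per SSA definition, 32 definitions per word. */
struct sketch_bitset {
        uint32_t *words;
        unsigned num_bits;
};

static bool
sketch_bitset_init(struct sketch_bitset *set, unsigned num_bits)
{
        unsigned num_words = (num_bits + 31) / 32;

        set->words = calloc(num_words, sizeof(uint32_t));
        set->num_bits = num_bits;
        return set->words != NULL;
}

static bool
sketch_bitset_test(const struct sketch_bitset *set, unsigned bit)
{
        return (set->words[bit / 32] >> (bit % 32)) & 1u;
}

static void
sketch_bitset_set(struct sketch_bitset *set, unsigned bit)
{
        set->words[bit / 32] |= 1u << (bit % 32);
}

int
main(void)
{
        struct sketch_bitset emitted;

        if (!sketch_bitset_init(&emitted, 128))
                return 1;

        /* Mark definition 42 as emitted the first time it is visited;
         * later visits test the bit and skip re-emission. */
        if (!sketch_bitset_test(&emitted, 42))
                sketch_bitset_set(&emitted, 42);

        bool ok = sketch_bitset_test(&emitted, 42);
        free(emitted.words);
        return ok ? 0 : 1;
}

Marking a definition when it is first emitted and skipping it on later visits is then a test-and-set per index, with a single free at the end, matching the calloc/free pair wrapped around emit_cf_list above.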
midgard_opt_promote_fmov(ctx, block); embedded_to_inline_constant(ctx, block); } /* MIR-level optimizations */ @@ -2588,42 +2999,23 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b do { progress = false; + progress |= midgard_opt_dead_code_eliminate(ctx); mir_foreach_block(ctx, _block) { midgard_block *block = (midgard_block *) _block; - progress |= midgard_opt_pos_propagate(ctx, block); progress |= midgard_opt_copy_prop(ctx, block); - progress |= midgard_opt_dead_code_eliminate(ctx, block); progress |= midgard_opt_combine_projection(ctx, block); progress |= midgard_opt_varying_projection(ctx, block); - progress |= midgard_opt_not_propagate(ctx, block); - progress |= midgard_opt_fuse_src_invert(ctx, block); - progress |= midgard_opt_fuse_dest_invert(ctx, block); - progress |= midgard_opt_csel_invert(ctx, block); - progress |= midgard_opt_drop_cmp_invert(ctx, block); - progress |= midgard_opt_invert_branch(ctx, block); } } while (progress); mir_foreach_block(ctx, _block) { midgard_block *block = (midgard_block *) _block; - midgard_lower_invert(ctx, block); midgard_lower_derivatives(ctx, block); - } - - /* Nested control-flow can result in dead branches at the end of the - * block. This messes with our analysis and is just dead code, so cull - * them */ - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; + midgard_legalize_invert(ctx, block); midgard_cull_dead_branch(ctx, block); } - /* Ensure we were lowered */ - mir_foreach_instr_global(ctx, ins) { - assert(!ins->invert); - } - if (ctx->stage == MESA_SHADER_FRAGMENT) mir_add_writeout_loops(ctx); @@ -2636,117 +3028,6 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b midgard_schedule_program(ctx); mir_ra(ctx); - /* Now that all the bundles are scheduled and we can calculate block - * sizes, emit actual branch instructions rather than placeholders */ - - int br_block_idx = 0; - - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { - for (int c = 0; c < bundle->instruction_count; ++c) { - midgard_instruction *ins = bundle->instructions[c]; - - if (!midgard_is_branch_unit(ins->unit)) continue; - - /* Parse some basic branch info */ - bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT; - bool is_conditional = ins->branch.conditional; - bool is_inverted = ins->branch.invert_conditional; - bool is_discard = ins->branch.target_type == TARGET_DISCARD; - bool is_writeout = ins->writeout; - - /* Determine the block we're jumping to */ - int target_number = ins->branch.target_block; - - /* Report the destination tag */ - int dest_tag = is_discard ? 
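For reference while reading the fixup loop being deleted here: the quadword offset it computes is just a sum of block sizes, taken strictly between the two blocks when jumping forward, and from the current block back through the target (inclusive) when jumping backward, with discards ignoring the offset entirely. In isolation that arithmetic is:

#include <assert.h>

/* block_sizes[i] is the size of block i in quadwords. */
static int
sketch_branch_offset(const unsigned *block_sizes, int current, int target)
{
        int offset = 0;

        if (target > current) {
                /* Forward: skip every block strictly between the two. */
                for (int i = current + 1; i < target; ++i)
                        offset += (int)block_sizes[i];
        } else {
                /* Backward: step back over the current block and every block
                 * down to, and including, the target. */
                for (int i = current; i >= target; --i)
                        offset -= (int)block_sizes[i];
        }

        return offset;
}

int
main(void)
{
        unsigned sizes[] = { 2, 3, 4 };

        assert(sketch_branch_offset(sizes, 0, 2) == 3);
        assert(sketch_branch_offset(sizes, 2, 0) == -9);
        return 0;
}

With block sizes {2, 3, 4}, a branch from block 0 to block 2 yields +3 and a branch from block 2 back to block 0 yields -9, as the asserts check.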
0 : midgard_get_first_tag_from_block(ctx, target_number); - - /* Count up the number of quadwords we're - * jumping over = number of quadwords until - * (br_block_idx, target_number) */ - - int quadword_offset = 0; - - if (is_discard) { - /* Ignored */ - } else if (target_number > br_block_idx) { - /* Jump forward */ - - for (int idx = br_block_idx + 1; idx < target_number; ++idx) { - midgard_block *blk = mir_get_block(ctx, idx); - assert(blk); - - quadword_offset += blk->quadword_count; - } - } else { - /* Jump backwards */ - - for (int idx = br_block_idx; idx >= target_number; --idx) { - midgard_block *blk = mir_get_block(ctx, idx); - assert(blk); - - quadword_offset -= blk->quadword_count; - } - } - - /* Unconditional extended branches (far jumps) - * have issues, so we always use a conditional - * branch, setting the condition to always for - * unconditional. For compact unconditional - * branches, cond isn't used so it doesn't - * matter what we pick. */ - - midgard_condition cond = - !is_conditional ? midgard_condition_always : - is_inverted ? midgard_condition_false : - midgard_condition_true; - - midgard_jmp_writeout_op op = - is_discard ? midgard_jmp_writeout_op_discard : - is_writeout ? midgard_jmp_writeout_op_writeout : - (is_compact && !is_conditional) ? midgard_jmp_writeout_op_branch_uncond : - midgard_jmp_writeout_op_branch_cond; - - if (!is_compact) { - midgard_branch_extended branch = - midgard_create_branch_extended( - cond, op, - dest_tag, - quadword_offset); - - memcpy(&ins->branch_extended, &branch, sizeof(branch)); - } else if (is_conditional || is_discard) { - midgard_branch_cond branch = { - .op = op, - .dest_tag = dest_tag, - .offset = quadword_offset, - .cond = cond - }; - - assert(branch.offset == quadword_offset); - - memcpy(&ins->br_compact, &branch, sizeof(branch)); - } else { - assert(op == midgard_jmp_writeout_op_branch_uncond); - - midgard_branch_uncond branch = { - .op = op, - .dest_tag = dest_tag, - .offset = quadword_offset, - .unknown = 1 - }; - - assert(branch.offset == quadword_offset); - - memcpy(&ins->br_compact, &branch, sizeof(branch)); - } - } - } - - ++br_block_idx; - } - /* Emit flat binary from the instruction arrays. Iterate each block in * sequence. Save instruction boundaries such that lookahead tags can * be assigned easily */ @@ -2781,7 +3062,7 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b if (!bundle->last_writeout && (current_bundle + 1 < bundle_count)) lookahead = source_order_bundles[current_bundle + 1]->tag; - emit_binary_bundle(ctx, bundle, compiled, lookahead); + emit_binary_bundle(ctx, block, bundle, compiled, lookahead); ++current_bundle; } @@ -2801,10 +3082,10 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b program->blend_patch_offset = ctx->blend_constant_offset; program->tls_size = ctx->tls_size; - if (midgard_debug & MIDGARD_DBG_SHADERS) + if ((midgard_debug & MIDGARD_DBG_SHADERS) && !silent) disassemble_midgard(stdout, program->compiled.data, program->compiled.size, gpu_id, ctx->stage); - if (midgard_debug & MIDGARD_DBG_SHADERDB || shaderdb) { + if ((midgard_debug & MIDGARD_DBG_SHADERDB || shaderdb) && !silent) { unsigned nr_bundles = 0, nr_ins = 0; /* Count instructions and bundles */ @@ -2835,6 +3116,7 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b "%u registers, %u threads, %u loops, " "%u:%u spills:fills\n", SHADER_DB_COUNT++, + ctx->is_blend ? 
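The emission loop above pairs each bundle with the tag of the bundle that follows it in source order, presumably so the type of the next word is known before it is fetched. A trimmed sketch of that pairing follows; the fallback of 1 for the final bundle and the example tag values are assumptions of this sketch, and the last_writeout special case is skipped here.

#include <stdio.h>

/* Pair each bundle with the tag of the bundle that follows it in source
 * order. The fallback of 1 for the final bundle is an assumption of this
 * sketch; the real value, and the last_writeout case, are handled outside
 * the hunk above. */
static void
sketch_emit_with_lookahead(const unsigned *tags, unsigned count)
{
        for (unsigned i = 0; i < count; ++i) {
                unsigned lookahead = (i + 1 < count) ? tags[i + 1] : 1;

                printf("bundle %u: tag %u, lookahead %u\n",
                       i, tags[i], lookahead);
        }
}

int
main(void)
{
        unsigned tags[] = { 3, 8, 5 }; /* arbitrary example tag values */

        sketch_emit_with_lookahead(tags, 3);
        return 0;
}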
"PAN_SHADER_BLEND" : gl_shader_stage_name(ctx->stage), nr_ins, nr_bundles, ctx->quadword_count, nr_registers, nr_threads,