X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fpanfrost%2Fmidgard%2Fmidgard_compile.c;h=2ffdee3f16267ace149f53b409d4f62cce94b6c6;hb=608c87afdddb9524bce3d2ccd95c2297b453072b;hp=56918951ce53bf32bc36f1b2ffde20687544b198;hpb=797fa87ec9891384fee7588fda36c80dfed1a39d;p=mesa.git diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index 56918951ce5..2ffdee3f162 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -48,6 +48,8 @@ #include "helpers.h" #include "compiler.h" #include "midgard_quirks.h" +#include "panfrost-quirks.h" +#include "panfrost/util/pan_lower_framebuffer.h" #include "disassemble.h" @@ -108,8 +110,8 @@ schedule_barrier(compiler_context *ctx) .dest = ~0, \ .src = { ~0, ~0, ~0, ~0 }, \ .swizzle = SWIZZLE_IDENTITY_4, \ + .op = midgard_op_##name, \ .load_store = { \ - .op = midgard_op_##name, \ .address = address \ } \ }; \ @@ -135,6 +137,7 @@ M_LOAD(ld_int4, nir_type_uint32); M_STORE(st_int4, nir_type_uint32); M_LOAD(ld_color_buffer_32u, nir_type_uint32); M_LOAD(ld_color_buffer_as_fp16, nir_type_float16); +M_LOAD(ld_color_buffer_as_fp32, nir_type_float32); M_STORE(st_vary_32, nir_type_uint32); M_LOAD(ld_cubemap_coords, nir_type_uint32); M_LOAD(ld_compute_id, nir_type_uint32); @@ -157,37 +160,6 @@ v_branch(bool conditional, bool invert) return ins; } -static midgard_branch_extended -midgard_create_branch_extended( midgard_condition cond, - midgard_jmp_writeout_op op, - unsigned dest_tag, - signed quadword_offset) -{ - /* The condition code is actually a LUT describing a function to - * combine multiple condition codes. However, we only support a single - * condition code at the moment, so we just duplicate over a bunch of - * times. */ - - uint16_t duplicated_cond = - (cond << 14) | - (cond << 12) | - (cond << 10) | - (cond << 8) | - (cond << 6) | - (cond << 4) | - (cond << 2) | - (cond << 0); - - midgard_branch_extended branch = { - .op = op, - .dest_tag = dest_tag, - .offset = quadword_offset, - .cond = duplicated_cond - }; - - return branch; -} - static void attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name) { @@ -254,9 +226,9 @@ midgard_nir_lower_fdot2(nir_shader *shader) } static const nir_variable * -search_var(struct exec_list *vars, unsigned driver_loc) +search_var(nir_shader *nir, nir_variable_mode mode, unsigned driver_loc) { - nir_foreach_variable(var, vars) { + nir_foreach_variable_with_modes(var, nir, mode) { if (var->data.driver_location == driver_loc) return var; } @@ -276,7 +248,7 @@ midgard_nir_lower_zs_store(nir_shader *nir) nir_variable *z_var = NULL, *s_var = NULL; - nir_foreach_variable(var, &nir->outputs) { + nir_foreach_shader_out_variable(var, nir) { if (var->data.location == FRAG_RESULT_DEPTH) z_var = var; else if (var->data.location == FRAG_RESULT_STENCIL) @@ -327,13 +299,16 @@ midgard_nir_lower_zs_store(nir_shader *nir) if (intr->intrinsic != nir_intrinsic_store_output) continue; - const nir_variable *var = search_var(&nir->outputs, nir_intrinsic_base(intr)); + const nir_variable *var = search_var(nir, nir_var_shader_out, nir_intrinsic_base(intr)); assert(var); if (var->data.location != FRAG_RESULT_COLOR && var->data.location < FRAG_RESULT_DATA0) continue; + if (var->data.index) + continue; + assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); nir_builder b; @@ -399,7 +374,12 @@ midgard_nir_lower_zs_store(nir_shader *nir) combined_store->num_components = 4; - nir_intrinsic_set_base(combined_store, 0); 
+ unsigned base; + if (z_store) + base = nir_intrinsic_base(z_store); + else + base = nir_intrinsic_base(s_store); + nir_intrinsic_set_base(combined_store, base); unsigned writeout = 0; if (z_store) @@ -437,6 +417,50 @@ midgard_nir_lower_zs_store(nir_shader *nir) return progress; } +/* Real writeout stores, which break execution, need to be moved to after + * dual-source stores, which are just standard register writes. */ +static bool +midgard_nir_reorder_writeout(nir_shader *nir) +{ + bool progress = false; + + nir_foreach_function(function, nir) { + if (!function->impl) continue; + + nir_foreach_block(block, function->impl) { + nir_instr *last_writeout = NULL; + + nir_foreach_instr_reverse_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + const nir_variable *var = search_var(nir, nir_var_shader_out, nir_intrinsic_base(intr)); + + if (var->data.index) { + if (!last_writeout) + last_writeout = instr; + continue; + } + + if (!last_writeout) + continue; + + /* This is a real store, so move it to after dual-source stores */ + exec_node_remove(&instr->node); + exec_node_insert_after(&last_writeout->node, &instr->node); + + progress = true; + } + } + } + + return progress; +} + /* Flushes undefined values to zero */ static void @@ -651,60 +675,6 @@ nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components) op = midgard_alu_op_##_op; \ ALU_CHECK_CMP(sext); \ break; - -/* Analyze the sizes of the dest and inputs to determine reg mode. */ - -static midgard_reg_mode -reg_mode_for_nir(nir_alu_instr *instr) -{ - unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); - unsigned dst_bitsize = nir_dest_bit_size(instr->dest.dest); - unsigned max_bitsize = MAX2(src_bitsize, dst_bitsize); - - /* We don't have fp16 LUTs, so we'll want to emit code like: - * - * vlut.fsinr hr0, hr0 - * - * where both input and output are 16-bit but the operation is carried - * out in 32-bit - */ - - switch (instr->op) { - case nir_op_fsqrt: - case nir_op_frcp: - case nir_op_frsq: - case nir_op_fsin: - case nir_op_fcos: - case nir_op_fexp2: - case nir_op_flog2: - max_bitsize = MAX2(max_bitsize, 32); - break; - - /* These get lowered to moves */ - case nir_op_pack_32_4x8: - max_bitsize = 8; - break; - case nir_op_pack_32_2x16: - max_bitsize = 16; - break; - default: - break; - } - - - switch (max_bitsize) { - /* Use 16 pipe for 8 since we don't support vec16 yet */ - case 8: - case 16: - return midgard_reg_mode_16; - case 32: - return midgard_reg_mode_32; - case 64: - return midgard_reg_mode_64; - default: - unreachable("Invalid bit size"); - } -} /* Compare mir_lower_invert */ static bool @@ -735,6 +705,34 @@ mir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op) return false; } +/* Look for floating point mods. We have the mods fsat, fsat_signed, + * and fpos. We also have the relations (note 3 * 2 = 6 cases): + * + * fsat_signed(fpos(x)) = fsat(x) + * fsat_signed(fsat(x)) = fsat(x) + * fpos(fsat_signed(x)) = fsat(x) + * fpos(fsat(x)) = fsat(x) + * fsat(fsat_signed(x)) = fsat(x) + * fsat(fpos(x)) = fsat(x) + * + * So by cases any composition of output modifiers is equivalent to + * fsat alone. 
+ */ +static unsigned +mir_determine_float_outmod(compiler_context *ctx, nir_dest **dest, unsigned prior_outmod) +{ + bool fpos = mir_accept_dest_mod(ctx, dest, nir_op_fclamp_pos); + bool fsat = mir_accept_dest_mod(ctx, dest, nir_op_fsat); + bool ssat = mir_accept_dest_mod(ctx, dest, nir_op_fsat_signed); + bool prior = (prior_outmod != midgard_outmod_none); + int count = (int) prior + (int) fpos + (int) ssat + (int) fsat; + + return ((count > 1) || fsat) ? midgard_outmod_sat : + fpos ? midgard_outmod_pos : + ssat ? midgard_outmod_sat_signed : + prior_outmod; +} + static void mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, enum midgard_roundmode *roundmode, bool is_int, unsigned bcast_count) { @@ -857,10 +855,6 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) unsigned broadcast_swizzle = 0; - /* What register mode should we operate in? */ - midgard_reg_mode reg_mode = - reg_mode_for_nir(instr); - /* Should we swap arguments? */ bool flip_src12 = false; @@ -894,7 +888,7 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ALU_CASE(mov, imov); ALU_CASE_CMP(feq32, feq, false); - ALU_CASE_CMP(fne32, fne, false); + ALU_CASE_CMP(fneu32, fne, false); ALU_CASE_CMP(flt32, flt, false); ALU_CASE_CMP(ieq32, ieq, true); ALU_CASE_CMP(ine32, ine, true); @@ -1079,31 +1073,8 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) unsigned opcode_props = alu_opcode_props[op].props; bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; - /* Look for floating point mods. We have the mods fsat, fsat_signed, - * and fpos. We also have the relations (note 3 * 2 = 6 cases): - * - * fsat_signed(fpos(x)) = fsat(x) - * fsat_signed(fsat(x)) = fsat(x) - * fpos(fsat_signed(x)) = fsat(x) - * fpos(fsat(x)) = fsat(x) - * fsat(fsat_signed(x)) = fsat(x) - * fsat(fpos(x)) = fsat(x) - * - * So by cases any composition of output modifiers is equivalent to - * fsat alone. - */ - - if (!is_int && !(opcode_props & OP_TYPE_CONVERT)) { - bool fpos = mir_accept_dest_mod(ctx, &dest, nir_op_fclamp_pos); - bool fsat = mir_accept_dest_mod(ctx, &dest, nir_op_fsat); - bool ssat = mir_accept_dest_mod(ctx, &dest, nir_op_fsat_signed); - bool prior = (outmod != midgard_outmod_none); - int count = (int) prior + (int) fpos + (int) ssat + (int) fsat; - - outmod = ((count > 1) || fsat) ? midgard_outmod_sat : - fpos ? midgard_outmod_pos : - ssat ? midgard_outmod_sat_signed : - outmod; + if (!midgard_is_integer_out_op(op)) { + outmod = mir_determine_float_outmod(ctx, &dest, outmod); } midgard_instruction ins = { @@ -1164,12 +1135,6 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) ins.mask = mask_of(nr_components); - midgard_vector_alu alu = { - .op = op, - .reg_mode = reg_mode, - .outmod = outmod, - }; - /* Apply writemask if non-SSA, keeping in mind that we can't write to * components that don't exist. 
Note modifier => SSA => !reg => no * writemask, so we don't have to worry about writemasks here.*/ @@ -1177,7 +1142,8 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) if (!is_ssa) ins.mask &= instr->dest.write_mask; - ins.alu = alu; + ins.op = op; + ins.outmod = outmod; /* Late fixup for emulated instructions */ @@ -1211,7 +1177,7 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr) /* Lots of instructions need a 0 plonked in */ ins.has_inline_constant = false; ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); - ins.src_types[1] = nir_type_uint32; + ins.src_types[1] = ins.src_types[0]; ins.has_constants = true; ins.constants.u32[0] = 0; @@ -1415,16 +1381,16 @@ emit_varying_read( switch (type) { case nir_type_uint32: case nir_type_bool32: - ins.load_store.op = midgard_op_ld_vary_32u; + ins.op = midgard_op_ld_vary_32u; break; case nir_type_int32: - ins.load_store.op = midgard_op_ld_vary_32i; + ins.op = midgard_op_ld_vary_32i; break; case nir_type_float32: - ins.load_store.op = midgard_op_ld_vary_32; + ins.op = midgard_op_ld_vary_32; break; case nir_type_float16: - ins.load_store.op = midgard_op_ld_vary_16; + ins.op = midgard_op_ld_vary_16; break; default: unreachable("Attempted to load unknown type"); @@ -1449,13 +1415,13 @@ emit_attr_read( switch (t) { case nir_type_uint: case nir_type_bool: - ins.load_store.op = midgard_op_ld_attr_32u; + ins.op = midgard_op_ld_attr_32u; break; case nir_type_int: - ins.load_store.op = midgard_op_ld_attr_32i; + ins.op = midgard_op_ld_attr_32i; break; case nir_type_float: - ins.load_store.op = midgard_op_ld_attr_32; + ins.op = midgard_op_ld_attr_32; break; default: unreachable("Attempted to load unknown type"); @@ -1580,6 +1546,22 @@ emit_vertex_builtin(compiler_context *ctx, nir_intrinsic_instr *instr) emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, nir_type_int); } +static void +emit_msaa_builtin(compiler_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned reg = nir_dest_index(&instr->dest); + + midgard_instruction ld = m_ld_color_buffer_32u(reg, 0); + ld.op = midgard_op_ld_color_buffer_32u_old; + ld.load_store.address = 97; + ld.load_store.arg_2 = 0x1E; + + for (int i = 0; i < 4; ++i) + ld.swizzle[0][i] = COMPONENT_X; + + emit_mir_instruction(ctx, ld); +} + static void emit_control_barrier(compiler_context *ctx) { @@ -1587,13 +1569,7 @@ emit_control_barrier(compiler_context *ctx) .type = TAG_TEXTURE_4, .dest = ~0, .src = { ~0, ~0, ~0, ~0 }, - .texture = { - .op = TEXTURE_OP_BARRIER, - - /* TODO: optimize */ - .out_of_order = MIDGARD_BARRIER_BUFFER | - MIDGARD_BARRIER_SHARED , - } + .op = TEXTURE_OP_BARRIER, }; emit_mir_instruction(ctx, ins); @@ -1612,6 +1588,32 @@ mir_get_branch_cond(nir_src *src, bool *invert) return nir_src_index(NULL, &alu.src); } +static uint8_t +output_load_rt_addr(compiler_context *ctx, nir_intrinsic_instr *instr) +{ + if (ctx->is_blend) + return ctx->blend_rt; + + const nir_variable *var; + var = search_var(ctx->nir, nir_var_shader_out, nir_intrinsic_base(instr)); + assert(var); + + unsigned loc = var->data.location; + + if (loc == FRAG_RESULT_COLOR) + loc = FRAG_RESULT_DATA0; + + if (loc >= FRAG_RESULT_DATA0) + return loc - FRAG_RESULT_DATA0; + + if (loc == FRAG_RESULT_DEPTH) + return 0x1F; + if (loc == FRAG_RESULT_STENCIL) + return 0x1E; + + unreachable("Invalid RT to load from"); +} + static void emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) { @@ -1692,13 +1694,15 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) } else if (ctx->stage == 
MESA_SHADER_FRAGMENT && !ctx->is_blend) { emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t | nir_dest_bit_size(instr->dest), is_flat); } else if (ctx->is_blend) { - /* ctx->blend_input will be precoloured to r0, where + /* ctx->blend_input will be precoloured to r0/r2, where * the input is preloaded */ - if (ctx->blend_input == ~0) - ctx->blend_input = reg; + unsigned *input = offset ? &ctx->blend_src1 : &ctx->blend_input; + + if (*input == ~0) + *input = reg; else - emit_mir_instruction(ctx, v_mov(ctx->blend_input, reg)); + emit_mir_instruction(ctx, v_mov(*input, reg)); } else if (ctx->stage == MESA_SHADER_VERTEX) { emit_attr_read(ctx, reg, offset, nr_comp, t); } else { @@ -1718,15 +1722,24 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_load_raw_output_pan: { reg = nir_dest_index(&instr->dest); - assert(ctx->is_blend); /* T720 and below use different blend opcodes with slightly * different semantics than T760 and up */ midgard_instruction ld = m_ld_color_buffer_32u(reg, 0); + ld.load_store.arg_2 = output_load_rt_addr(ctx, instr); + + if (nir_src_is_const(instr->src[0])) { + ld.load_store.arg_1 = nir_src_as_uint(instr->src[0]); + } else { + ld.load_store.varying_parameters = 2; + ld.src[1] = nir_src_index(ctx, &instr->src[0]); + ld.src_types[1] = nir_type_int32; + } + if (ctx->quirks & MIDGARD_OLD_BLEND) { - ld.load_store.op = midgard_op_ld_color_buffer_32u_old; + ld.op = midgard_op_ld_color_buffer_32u_old; ld.load_store.address = 16; ld.load_store.arg_2 = 0x1E; } @@ -1737,15 +1750,25 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_load_output: { reg = nir_dest_index(&instr->dest); - assert(ctx->is_blend); - midgard_instruction ld = m_ld_color_buffer_as_fp16(reg, 0); + unsigned bits = nir_dest_bit_size(instr->dest); + + midgard_instruction ld; + if (bits == 16) + ld = m_ld_color_buffer_as_fp16(reg, 0); + else + ld = m_ld_color_buffer_as_fp32(reg, 0); + + ld.load_store.arg_2 = output_load_rt_addr(ctx, instr); for (unsigned c = 4; c < 16; ++c) ld.swizzle[0][c] = 0; if (ctx->quirks & MIDGARD_OLD_BLEND) { - ld.load_store.op = midgard_op_ld_color_buffer_as_fp16_old; + if (bits == 16) + ld.op = midgard_op_ld_color_buffer_as_fp16_old; + else + ld.op = midgard_op_ld_color_buffer_as_fp32_old; ld.load_store.address = 1; ld.load_store.arg_2 = 0x1E; } @@ -1781,11 +1804,30 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) nir_intrinsic_store_combined_output_pan; const nir_variable *var; - enum midgard_rt_id rt; - - var = search_var(&ctx->nir->outputs, + var = search_var(ctx->nir, nir_var_shader_out, nir_intrinsic_base(instr)); assert(var); + + /* Dual-source blend writeout is done by leaving the + * value in r2 for the blend shader to use. 
*/ + if (var->data.index) { + if (instr->src[0].is_ssa) { + emit_explicit_constant(ctx, reg, reg); + + unsigned out = make_compiler_temp(ctx); + + midgard_instruction ins = v_mov(reg, out); + emit_mir_instruction(ctx, ins); + + ctx->blend_src1 = out; + } else { + ctx->blend_src1 = reg; + } + + break; + } + + enum midgard_rt_id rt; if (var->data.location == FRAG_RESULT_COLOR) rt = MIDGARD_COLOR_RT0; else if (var->data.location >= FRAG_RESULT_DATA0) @@ -1829,13 +1871,13 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) switch (nir_alu_type_get_base_type(nir_intrinsic_type(instr))) { case nir_type_uint: case nir_type_bool: - st.load_store.op = midgard_op_st_vary_32u; + st.op = midgard_op_st_vary_32u; break; case nir_type_int: - st.load_store.op = midgard_op_st_vary_32i; + st.op = midgard_op_st_vary_32i; break; case nir_type_float: - st.load_store.op = midgard_op_st_vary_32; + st.op = midgard_op_st_vary_32; break; default: unreachable("Attempted to store unknown type"); @@ -1905,6 +1947,10 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) emit_vertex_builtin(ctx, instr); break; + case nir_intrinsic_load_sample_id: + emit_msaa_builtin(ctx, instr); + break; + case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_shared: break; @@ -1922,24 +1968,26 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) } } +/* Returns dimension with 0 special casing cubemaps */ static unsigned midgard_tex_format(enum glsl_sampler_dim dim) { switch (dim) { case GLSL_SAMPLER_DIM_1D: case GLSL_SAMPLER_DIM_BUF: - return MALI_TEX_1D; + return 1; case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_MS: case GLSL_SAMPLER_DIM_EXTERNAL: case GLSL_SAMPLER_DIM_RECT: - return MALI_TEX_2D; + return 2; case GLSL_SAMPLER_DIM_3D: - return MALI_TEX_3D; + return 3; case GLSL_SAMPLER_DIM_CUBE: - return MALI_TEX_CUBE; + return 0; default: DBG("Unknown sampler dim type\n"); @@ -1988,21 +2036,30 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, /* TODO */ //assert (!instr->sampler); + nir_dest *dest = &instr->dest; + int texture_index = instr->texture_index; int sampler_index = texture_index; nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type); - nir_alu_type dest_type = dest_base | nir_dest_bit_size(instr->dest); + nir_alu_type dest_type = dest_base | nir_dest_bit_size(*dest); + + /* texture instructions support float outmods */ + unsigned outmod = midgard_outmod_none; + if (dest_base == nir_type_float) { + outmod = mir_determine_float_outmod(ctx, &dest, 0); + } midgard_instruction ins = { .type = TAG_TEXTURE_4, .mask = 0xF, - .dest = nir_dest_index(&instr->dest), + .dest = nir_dest_index(dest), .src = { ~0, ~0, ~0, ~0 }, .dest_type = dest_type, .swizzle = SWIZZLE_IDENTITY_4, + .outmod = outmod, + .op = midgard_texop, .texture = { - .op = midgard_texop, .format = midgard_tex_format(instr->sampler_dim), .texture_handle = texture_index, .sampler_handle = sampler_index, @@ -2146,7 +2203,8 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, break; }; - case nir_tex_src_comparator: { + case nir_tex_src_comparator: + case nir_tex_src_ms_index: { unsigned comp = COMPONENT_Z; /* mov coord_temp.foo, coords */ @@ -2182,6 +2240,7 @@ emit_tex(compiler_context *ctx, nir_tex_instr *instr) emit_texop_native(ctx, instr, TEXTURE_OP_LOD); break; case nir_texop_txf: + case nir_texop_txf_ms: emit_texop_native(ctx, instr, TEXTURE_OP_TEXEL_FETCH); break; case nir_texop_txs: @@ -2298,6 +2357,61 @@ inline_alu_constants(compiler_context 
*ctx, midgard_block *block) } } +unsigned +max_bitsize_for_alu(midgard_instruction *ins) +{ + unsigned max_bitsize = 0; + for (int i = 0; i < MIR_SRC_COUNT; i++) { + if (ins->src[i] == ~0) continue; + unsigned src_bitsize = nir_alu_type_get_type_size(ins->src_types[i]); + max_bitsize = MAX2(src_bitsize, max_bitsize); + } + unsigned dst_bitsize = nir_alu_type_get_type_size(ins->dest_type); + max_bitsize = MAX2(dst_bitsize, max_bitsize); + + /* We don't have fp16 LUTs, so we'll want to emit code like: + * + * vlut.fsinr hr0, hr0 + * + * where both input and output are 16-bit but the operation is carried + * out in 32-bit + */ + + switch (ins->op) { + case midgard_alu_op_fsqrt: + case midgard_alu_op_frcp: + case midgard_alu_op_frsqrt: + case midgard_alu_op_fsin: + case midgard_alu_op_fcos: + case midgard_alu_op_fexp2: + case midgard_alu_op_flog2: + max_bitsize = MAX2(max_bitsize, 32); + break; + + default: + break; + } + + return max_bitsize; +} + +midgard_reg_mode +reg_mode_for_bitsize(unsigned bitsize) +{ + switch (bitsize) { + /* use 16 pipe for 8 since we don't support vec16 yet */ + case 8: + case 16: + return midgard_reg_mode_16; + case 32: + return midgard_reg_mode_32; + case 64: + return midgard_reg_mode_64; + default: + unreachable("invalid bit size"); + } +} + /* Midgard supports two types of constants, embedded constants (128-bit) and * inline constants (16-bit). Sometimes, especially with scalar ops, embedded * constants can be demoted to inline constants, for space savings and @@ -2313,9 +2427,11 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) /* Blend constants must not be inlined by definition */ if (ins->has_blend_constant) continue; + unsigned max_bitsize = max_bitsize_for_alu(ins); + /* We can inline 32-bit (sometimes) or 16-bit (usually) */ - bool is_16 = ins->alu.reg_mode == midgard_reg_mode_16; - bool is_32 = ins->alu.reg_mode == midgard_reg_mode_32; + bool is_16 = max_bitsize == 16; + bool is_32 = max_bitsize == 32; if (!(is_16 || is_32)) continue; @@ -2324,7 +2440,7 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) * restrictions. So, if possible we try to flip the arguments * in that case */ - int op = ins->alu.op; + int op = ins->op; if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) && alu_opcode_props[op].props & OP_COMMUTES) { @@ -2376,7 +2492,7 @@ embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) uint32_t value = is_16 ? 
cons->u16[component] : cons->u32[component]; bool is_vector = false; - unsigned mask = effective_writemask(&ins->alu, ins->mask); + unsigned mask = effective_writemask(ins->op, ins->mask); for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) { /* We only care if this component is actually used */ @@ -2436,8 +2552,8 @@ midgard_legalize_invert(compiler_context *ctx, midgard_block *block) mir_foreach_instr_in_block(block, ins) { if (ins->type != TAG_ALU_4) continue; - if (ins->alu.op != midgard_alu_op_iand && - ins->alu.op != midgard_alu_op_ior) continue; + if (ins->op != midgard_alu_op_iand && + ins->op != midgard_alu_op_ior) continue; if (ins->src_invert[1] || !ins->src_invert[0]) continue; @@ -2473,7 +2589,7 @@ emit_fragment_epilogue(compiler_context *ctx, unsigned rt) } static midgard_block * -emit_block(compiler_context *ctx, nir_block *block) +emit_block_init(compiler_context *ctx) { midgard_block *this_block = ctx->after_block; ctx->after_block = NULL; @@ -2490,6 +2606,14 @@ emit_block(compiler_context *ctx, nir_block *block) list_inithead(&this_block->base.instructions); ctx->current_block = this_block; + return this_block; +} + +static midgard_block * +emit_block(compiler_context *ctx, nir_block *block) +{ + midgard_block *this_block = emit_block_init(ctx); + nir_foreach_instr(instr, block) { emit_instr(ctx, instr); ++ctx->instruction_count; @@ -2652,7 +2776,7 @@ emit_cf_list(struct compiler_context *ctx, struct exec_list *list) * stream and in branch targets. An initial block might be empty, so iterate * until we find one that 'works' */ -static unsigned +unsigned midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx) { midgard_block *initial_block = mir_get_block(ctx, block_idx); @@ -2706,7 +2830,7 @@ mir_add_writeout_loops(compiler_context *ctx) } int -midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_blend, unsigned blend_rt, unsigned gpu_id, bool shaderdb) +midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_blend, unsigned blend_rt, unsigned gpu_id, bool shaderdb, bool silent) { struct util_dynarray *compiled = &program->compiled; @@ -2718,9 +2842,9 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b ctx->nir = nir; ctx->stage = nir->info.stage; ctx->is_blend = is_blend; - ctx->alpha_ref = program->alpha_ref; ctx->blend_rt = MIDGARD_COLOR_RT0 + blend_rt; ctx->blend_input = ~0; + ctx->blend_src1 = ~0; ctx->quirks = midgard_get_quirks(gpu_id); /* Start off with a safe cutoff, allowing usage of all 16 work @@ -2731,7 +2855,6 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b /* Initialize at a global (not block) level hash tables */ ctx->ssa_constants = _mesa_hash_table_u64_create(NULL); - ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL); /* Lower gl_Position pre-optimisation, but after lowering vars to ssa * (so we don't accidentally duplicate the epilogue since mesa/st has @@ -2752,6 +2875,10 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b NIR_PASS_V(nir, nir_lower_var_copies); NIR_PASS_V(nir, nir_lower_vars_to_ssa); + unsigned pan_quirks = panfrost_get_quirks(gpu_id); + NIR_PASS_V(nir, pan_lower_framebuffer, + program->rt_formats, is_blend, pan_quirks); + NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, glsl_type_size, 0); NIR_PASS_V(nir, nir_lower_ssbo); @@ -2761,14 +2888,16 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b optimise_nir(nir, 
ctx->quirks, is_blend); - if (midgard_debug & MIDGARD_DBG_SHADERS) { + NIR_PASS_V(nir, midgard_nir_reorder_writeout); + + if ((midgard_debug & MIDGARD_DBG_SHADERS) && !silent) { nir_print_shader(nir, stdout); } /* Assign sysvals and counts, now that we're sure * (post-optimisation) */ - panfrost_nir_assign_sysvals(&ctx->sysvals, nir); + panfrost_nir_assign_sysvals(&ctx->sysvals, ctx, nir); program->sysval_count = ctx->sysvals.sysval_count; memcpy(program->sysvals, ctx->sysvals.sysvals, sizeof(ctx->sysvals.sysvals[0]) * ctx->sysvals.sysval_count); @@ -2781,6 +2910,17 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b ctx->func = func; ctx->already_emitted = calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD)); + if (nir->info.outputs_read && !is_blend) { + emit_block_init(ctx); + + struct midgard_instruction wait = v_branch(false, false); + wait.branch.target_type = TARGET_TILEBUF_WAIT; + + emit_mir_instruction(ctx, wait); + + ++ctx->instruction_count; + } + emit_cf_list(ctx, &func->impl->body); free(ctx->already_emitted); break; /* TODO: Multi-function shaders */ @@ -2830,117 +2970,6 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b midgard_schedule_program(ctx); mir_ra(ctx); - /* Now that all the bundles are scheduled and we can calculate block - * sizes, emit actual branch instructions rather than placeholders */ - - int br_block_idx = 0; - - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *) _block; - util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { - for (int c = 0; c < bundle->instruction_count; ++c) { - midgard_instruction *ins = bundle->instructions[c]; - - if (!midgard_is_branch_unit(ins->unit)) continue; - - /* Parse some basic branch info */ - bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT; - bool is_conditional = ins->branch.conditional; - bool is_inverted = ins->branch.invert_conditional; - bool is_discard = ins->branch.target_type == TARGET_DISCARD; - bool is_writeout = ins->writeout; - - /* Determine the block we're jumping to */ - int target_number = ins->branch.target_block; - - /* Report the destination tag */ - int dest_tag = is_discard ? 0 : midgard_get_first_tag_from_block(ctx, target_number); - - /* Count up the number of quadwords we're - * jumping over = number of quadwords until - * (br_block_idx, target_number) */ - - int quadword_offset = 0; - - if (is_discard) { - /* Ignored */ - } else if (target_number > br_block_idx) { - /* Jump forward */ - - for (int idx = br_block_idx + 1; idx < target_number; ++idx) { - midgard_block *blk = mir_get_block(ctx, idx); - assert(blk); - - quadword_offset += blk->quadword_count; - } - } else { - /* Jump backwards */ - - for (int idx = br_block_idx; idx >= target_number; --idx) { - midgard_block *blk = mir_get_block(ctx, idx); - assert(blk); - - quadword_offset -= blk->quadword_count; - } - } - - /* Unconditional extended branches (far jumps) - * have issues, so we always use a conditional - * branch, setting the condition to always for - * unconditional. For compact unconditional - * branches, cond isn't used so it doesn't - * matter what we pick. */ - - midgard_condition cond = - !is_conditional ? midgard_condition_always : - is_inverted ? midgard_condition_false : - midgard_condition_true; - - midgard_jmp_writeout_op op = - is_discard ? midgard_jmp_writeout_op_discard : - is_writeout ? midgard_jmp_writeout_op_writeout : - (is_compact && !is_conditional) ? 
midgard_jmp_writeout_op_branch_uncond : - midgard_jmp_writeout_op_branch_cond; - - if (!is_compact) { - midgard_branch_extended branch = - midgard_create_branch_extended( - cond, op, - dest_tag, - quadword_offset); - - memcpy(&ins->branch_extended, &branch, sizeof(branch)); - } else if (is_conditional || is_discard) { - midgard_branch_cond branch = { - .op = op, - .dest_tag = dest_tag, - .offset = quadword_offset, - .cond = cond - }; - - assert(branch.offset == quadword_offset); - - memcpy(&ins->br_compact, &branch, sizeof(branch)); - } else { - assert(op == midgard_jmp_writeout_op_branch_uncond); - - midgard_branch_uncond branch = { - .op = op, - .dest_tag = dest_tag, - .offset = quadword_offset, - .unknown = 1 - }; - - assert(branch.offset == quadword_offset); - - memcpy(&ins->br_compact, &branch, sizeof(branch)); - } - } - } - - ++br_block_idx; - } - /* Emit flat binary from the instruction arrays. Iterate each block in * sequence. Save instruction boundaries such that lookahead tags can * be assigned easily */ @@ -2995,10 +3024,10 @@ midgard_compile_shader_nir(nir_shader *nir, panfrost_program *program, bool is_b program->blend_patch_offset = ctx->blend_constant_offset; program->tls_size = ctx->tls_size; - if (midgard_debug & MIDGARD_DBG_SHADERS) + if ((midgard_debug & MIDGARD_DBG_SHADERS) && !silent) disassemble_midgard(stdout, program->compiled.data, program->compiled.size, gpu_id, ctx->stage); - if (midgard_debug & MIDGARD_DBG_SHADERDB || shaderdb) { + if ((midgard_debug & MIDGARD_DBG_SHADERDB || shaderdb) && !silent) { unsigned nr_bundles = 0, nr_ins = 0; /* Count instructions and bundles */
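
/* Editor's note, not part of the patch above: the midgard_create_branch_extended()
 * helper deleted near the top of this diff fills the 16-bit branch condition
 * LUT by replicating one 2-bit condition code into all eight 2-bit slots. As
 * its comment says, the hardware LUT can describe a function combining several
 * condition codes, but the compiler only ever uses one, hence the duplication.
 * The standalone sketch below just demonstrates the resulting bit pattern;
 * duplicate_cond() is an illustrative name, not compiler API. */

#include <assert.h>
#include <stdint.h>

static uint16_t
duplicate_cond(uint16_t cond)
{
        uint16_t duplicated = 0;

        /* Equivalent to (cond << 14) | (cond << 12) | ... | (cond << 0) */
        for (unsigned slot = 0; slot < 16; slot += 2)
                duplicated |= cond << slot;

        return duplicated;
}

int main(void)
{
        assert(duplicate_cond(0x1) == 0x5555); /* 0b01 in every 2-bit slot */
        assert(duplicate_cond(0x2) == 0xAAAA); /* 0b10 in every 2-bit slot */
        assert(duplicate_cond(0x3) == 0xFFFF); /* 0b11 in every 2-bit slot */
        return 0;
}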
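
/* Editor's note, not part of the patch above: the six composition identities
 * quoted in the mir_determine_float_outmod() comment are easy to sanity-check
 * numerically, and they are what justifies the "(count > 1) || fsat" test
 * collapsing any stack of modifiers to a single saturate. The sketch below
 * models the three Midgard float output modifiers as plain clamps; the helper
 * names are illustrative, not Midgard compiler API. */

#include <assert.h>
#include <stdio.h>

static float fsat(float x)        { return x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x); }   /* clamp to [0, 1]   */
static float fsat_signed(float x) { return x < -1.0f ? -1.0f : (x > 1.0f ? 1.0f : x); } /* clamp to [-1, 1]  */
static float fpos(float x)        { return x < 0.0f ? 0.0f : x; }                       /* clamp to [0, inf) */

int main(void)
{
        const float samples[] = { -2.0f, -1.0f, -0.25f, 0.0f, 0.5f, 1.0f, 3.0f };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); ++i) {
                float x = samples[i];

                /* Any composition of two modifiers equals fsat(x), so seeing
                 * more than one modifier on a value means plain fsat. */
                assert(fsat_signed(fpos(x)) == fsat(x));
                assert(fsat_signed(fsat(x)) == fsat(x));
                assert(fpos(fsat_signed(x)) == fsat(x));
                assert(fpos(fsat(x))        == fsat(x));
                assert(fsat(fsat_signed(x)) == fsat(x));
                assert(fsat(fpos(x))        == fsat(x));
        }

        printf("all six compositions collapse to fsat\n");
        return 0;
}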
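
/* Editor's note, not part of the patch above: a minimal mimic of the
 * max_bitsize_for_alu() / reg_mode_for_bitsize() pairing this diff adds.
 * As the original comment explains, the LUT-backed ops (fsqrt, frcp, frsqrt,
 * fsin, fcos, fexp2, flog2) have no fp16 variant, so the operation is
 * promoted to 32-bit register mode even when both input and output are
 * 16-bit, as in "vlut.fsinr hr0, hr0". promoted_bitsize() and its arguments
 * are illustrative names, not compiler API. */

#include <assert.h>
#include <stdbool.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

static unsigned
promoted_bitsize(unsigned src_bits, unsigned dst_bits, bool is_lut_op)
{
        unsigned max_bitsize = MAX2(src_bits, dst_bits);

        /* No fp16 LUTs: the op itself must run at 32-bit or wider */
        if (is_lut_op)
                max_bitsize = MAX2(max_bitsize, 32);

        return max_bitsize;
}

int main(void)
{
        /* fp16 fsin: data stays 16-bit, operation is carried out in 32-bit */
        assert(promoted_bitsize(16, 16, true) == 32);

        /* a plain fp16 add stays in the 16-bit pipe */
        assert(promoted_bitsize(16, 16, false) == 16);

        /* conversions follow the widest operand */
        assert(promoted_bitsize(32, 64, false) == 64);

        return 0;
}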