From: Rhys Perry
Date: Mon, 14 Oct 2019 16:46:02 +0000 (+0100)
Subject: radv/aco,aco: implement GS on GFX9+
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=40bb81c9ddceaade7d12f90de087afd9882121a2;p=mesa.git

radv/aco,aco: implement GS on GFX9+

v2: implement GFX10
v3: rebase
v7: rebase after shader args MR
v8: fix gs_vtx_offset usage on GFX9/GFX10
v8: use unreachable() instead of printing intrinsic
v8: rename output_state to ge_output_state
v8: fix formatting around nir_foreach_variable()
v8: rename some helpers in the scheduler
v8: rename p_memory_barrier_all to p_memory_barrier_common
v8: fix assertion comparing ctx.stage against vertex_geometry_gs

Signed-off-by: Rhys Perry
Reviewed-by: Daniel Schürmann
Part-of:
---

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 2cfd029bdc4..7aafdc21917 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -24,6 +24,7 @@
 
 #include "aco_ir.h"
 #include "aco_builder.h"
+#include "util/u_math.h"
 
 namespace aco {
 
@@ -355,6 +356,12 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       Temp exec_mask = startpgm->definitions.back().getTemp();
       bld.insert(std::move(startpgm));
 
+      /* exec seems to need to be manually initialized with combined shaders */
+      if (util_bitcount(ctx.program->stage & sw_mask) > 1) {
+         bld.sop1(Builder::s_mov, bld.exec(Definition(exec_mask)), bld.lm == s2 ? Operand(UINT64_MAX) : Operand(UINT32_MAX));
+         instructions[0]->definitions.pop_back();
+      }
+
       if (ctx.handle_wqm) {
          ctx.info[0].exec.emplace_back(exec_mask, mask_type_global | mask_type_exact | mask_type_initial);
          /* if this block only needs WQM, initialize already */
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index b74d5f57c25..8e1b64bfcd0 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -396,15 +396,18 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
    }
 
    if (instr->format == Format::PSEUDO_BARRIER) {
-      unsigned* bsize = ctx.program->info->cs.block_size;
-      unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
+      uint32_t workgroup_size = UINT32_MAX;
+      if (ctx.program->stage & sw_cs) {
+         unsigned* bsize = ctx.program->info->cs.block_size;
+         workgroup_size = bsize[0] * bsize[1] * bsize[2];
+      }
       switch (instr->opcode) {
-         case aco_opcode::p_memory_barrier_all:
-            for (unsigned i = 0; i < barrier_count; i++) {
-               if ((1 << i) == barrier_shared && workgroup_size <= ctx.program->wave_size)
-                  continue;
-               imm.combine(ctx.barrier_imm[i]);
-            }
+         case aco_opcode::p_memory_barrier_common:
+            imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
+            imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
+            imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
+            if (workgroup_size > ctx.program->wave_size)
+               imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
            break;
         case aco_opcode::p_memory_barrier_atomic:
            imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
@@ -419,6 +422,12 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
            if (workgroup_size > ctx.program->wave_size)
               imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
            break;
+         case aco_opcode::p_memory_barrier_gs_data:
+            imm.combine(ctx.barrier_imm[ffs(barrier_gs_data) - 1]);
+            break;
+         case aco_opcode::p_memory_barrier_gs_sendmsg:
+            imm.combine(ctx.barrier_imm[ffs(barrier_gs_sendmsg) - 1]);
+            break;
         default:
            assert(false);
            break;
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 897dbcb3655..09b751caf22 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -2655,8 +2655,8 @@ void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
 
    for (unsigned i = 0; i < 8; ++i) {
       if (write_mask & (1 << i)) {
-         ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
-         ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
+         ctx->vsgs_output.mask[idx / 4u] |= 1 << (idx % 4u);
+         ctx->vsgs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
       }
       idx++;
    }
@@ -3104,12 +3104,78 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t
    return;
 }
 
+void visit_store_vsgs_output(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   unsigned write_mask = nir_intrinsic_write_mask(instr);
+   unsigned component = nir_intrinsic_component(instr);
+   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+   unsigned idx = (nir_intrinsic_base(instr) + component) * 4u;
+   Operand offset(s1);
+   Builder bld(ctx->program, ctx->block);
+
+   nir_instr *off_instr = instr->src[1].ssa->parent_instr;
+   if (off_instr->type != nir_instr_type_load_const)
+      offset = bld.v_mul24_imm(bld.def(v1), get_ssa_temp(ctx, instr->src[1].ssa), 16u);
+   else
+      idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 16u;
+
+   unsigned itemsize = ctx->program->info->vs.es_info.esgs_itemsize;
+
+   Temp vertex_idx = emit_mbcnt(ctx, bld.def(v1));
+   Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
+   vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), vertex_idx,
+                         bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
+
+   Temp lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
+   if (!offset.isUndefined())
+      lds_base = bld.vadd32(bld.def(v1), offset, lds_base);
+
+   unsigned align = 1 << (ffs(itemsize) - 1);
+   if (idx)
+      align = std::min(align, 1u << (ffs(idx) - 1));
+
+   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+   store_lds(ctx, elem_size_bytes, src, write_mask, lds_base, idx, align);
+}
+
+void visit_store_gs_output_gfx9(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   /* This wouldn't work if it wasn't in the same block as the
+    * emit_vertex_with_counter intrinsic but that doesn't happen because of
+    * nir_lower_io_to_temporaries(). */
+
+   unsigned write_mask = nir_intrinsic_write_mask(instr);
+   unsigned component = nir_intrinsic_component(instr);
+   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+   unsigned idx = nir_intrinsic_base(instr) + component;
+
+   nir_instr *off_instr = instr->src[1].ssa->parent_instr;
+   if (off_instr->type != nir_instr_type_load_const)
+      unreachable("Indirect GS output stores should have been lowered");
+   idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
+
+   if (instr->src[0].ssa->bit_size == 64)
+      write_mask = widen_mask(write_mask, 2);
+
+   for (unsigned i = 0; i < 8; ++i) {
+      if (write_mask & (1 << i)) {
+         ctx->vsgs_output.mask[idx / 4u] |= 1 << (idx % 4u);
+         ctx->vsgs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
+      }
+      idx++;
+   }
+}
+
 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
 {
    if (ctx->stage == vertex_vs) {
       visit_store_vs_output(ctx, instr);
    } else if (ctx->stage == fragment_fs) {
       visit_store_fs_output(ctx, instr);
+   } else if (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX) {
+      visit_store_vsgs_output(ctx, instr);
+   } else if (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
+      visit_store_gs_output_gfx9(ctx, instr);
    } else {
       unreachable("Shader stage not implemented");
    }
@@ -3420,6 +3486,64 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
    }
 }
 
+void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   assert(ctx->stage == vertex_geometry_gs);
+   assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
+
+   Builder bld(ctx->program, ctx->block);
+   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+   Temp offset = Temp();
+   if (instr->src[0].ssa->parent_instr->type != nir_instr_type_load_const) {
+      /* better code could be created, but this case probably doesn't happen
+       * much in practice */
+      Temp indirect_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+      for (unsigned i = 0; i < ctx->shader->info.gs.vertices_in; i++) {
+         Temp elem = get_arg(ctx, ctx->args->gs_vtx_offset[i / 2u * 2u]);
+         if (i % 2u)
+            elem = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), elem);
+         if (offset.id()) {
+            Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(s2)),
+                                 Operand(i), indirect_vertex);
+            offset = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), offset, elem, cond);
+         } else {
+            offset = elem;
+         }
+      }
+      offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), offset);
+   } else {
+      unsigned vertex = nir_src_as_uint(instr->src[0]);
+      offset = bld.vop3(
+         aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->gs_vtx_offset[vertex / 2u * 2u]),
+         Operand((vertex % 2u) * 16u), Operand(16u));
+   }
+
+   unsigned const_offset = nir_intrinsic_base(instr);
+   const_offset += nir_intrinsic_component(instr);
+
+   nir_instr *off_instr = instr->src[1].ssa->parent_instr;
+   if (off_instr->type != nir_instr_type_load_const) {
+      Temp indirect_offset = get_ssa_temp(ctx, instr->src[1].ssa);
+      offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u),
+                        bld.vadd32(bld.def(v1), indirect_offset, offset));
+   } else {
+      const_offset += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
+   }
+   const_offset *= 4u;
+
+   offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), offset);
+
+   unsigned itemsize = ctx->program->info->vs.es_info.esgs_itemsize;
+   unsigned align = 16; /* alignment of indirect offset */
+   align = std::min(align, 1u << (ffs(itemsize) - 1));
+   if (const_offset)
+      align = std::min(align, 1u << (ffs(const_offset) - 1));
+
+   unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
+   load_lds(ctx, elem_size_bytes, dst, offset, const_offset, align);
+}
+
 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
 {
    if (ctx->program->info->need_indirect_descriptor_sets) {
@@ -5281,7 +5405,7 @@ void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
    switch(instr->intrinsic) {
       case nir_intrinsic_group_memory_barrier:
       case nir_intrinsic_memory_barrier:
-         bld.barrier(aco_opcode::p_memory_barrier_all);
+         bld.barrier(aco_opcode::p_memory_barrier_common);
          break;
       case nir_intrinsic_memory_barrier_buffer:
          bld.barrier(aco_opcode::p_memory_barrier_buffer);
@@ -5613,6 +5737,103 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
    bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
 }
 
+void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) {
+   Builder bld(ctx->program, ctx->block);
+
+   unsigned stream = nir_intrinsic_stream_id(instr);
+   Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+   next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
+   nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]);
+
+   /* get GSVS ring */
+   Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u));
+
+   unsigned num_components =
+      ctx->program->info->gs.num_stream_output_components[stream];
+   assert(num_components);
+
+   unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
+   unsigned stream_offset = 0;
+   for (unsigned i = 0; i < stream; i++) {
+      unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out;
+      stream_offset += prev_stride * ctx->program->wave_size;
+   }
+
+   /* Limit on the stride field for <= GFX7. */
+   assert(stride < (1 << 14));
+
+   Temp gsvs_dwords[4];
+   for (unsigned i = 0; i < 4; i++)
+      gsvs_dwords[i] = bld.tmp(s1);
+   bld.pseudo(aco_opcode::p_split_vector,
+              Definition(gsvs_dwords[0]),
+              Definition(gsvs_dwords[1]),
+              Definition(gsvs_dwords[2]),
+              Definition(gsvs_dwords[3]),
+              gsvs_ring);
+
+   if (stream_offset) {
+      Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset));
+
+      Temp carry = bld.tmp(s1);
+      gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp);
+      gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry));
+   }
+
+   gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride)));
+   gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size));
+
+   gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
+                          gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]);
+
+   unsigned offset = 0;
+   for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
+      if (ctx->program->info->gs.output_streams[i] != stream)
+         continue;
+
+      for (unsigned j = 0; j < 4; j++) {
+         if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
+            continue;
+
+         if (ctx->vsgs_output.mask[i] & (1 << j)) {
+            Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
+            unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
+            if (const_offset >= 4096u) {
+               if (vaddr_offset.isUndefined())
+                  vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u));
+               else
+                  vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset);
+               const_offset %= 4096u;
+            }
+
+            aco_ptr mtbuf{create_instruction(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
+            mtbuf->operands[0] = vaddr_offset;
+            mtbuf->operands[1] = Operand(gsvs_ring);
+            mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset));
+            mtbuf->operands[3] = Operand(ctx->vsgs_output.outputs[i][j]);
+            mtbuf->offen = !vaddr_offset.isUndefined();
+            mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
+            mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
+            mtbuf->offset = const_offset;
+            mtbuf->glc = true;
+            mtbuf->slc = true;
+            mtbuf->barrier = barrier_gs_data;
+            mtbuf->can_reorder = true;
+            bld.insert(std::move(mtbuf));
+         }
+
+         offset += ctx->shader->info.gs.vertices_out;
+      }
+
+      /* outputs for the next vertex are undefined and keeping them around can
+       * create invalid IR with control flow */
+      ctx->vsgs_output.mask[i] = 0;
+   }
+
+   Temp gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand((8u << 16) | 16u));
+   bld.sopp(aco_opcode::s_sendmsg, bld.m0(gs_wave_id), -1, sendmsg_gs(false, true, stream));
+}
+
 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
 {
    Builder bld(ctx->program, ctx->block);
@@ -5970,6 +6191,9 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
    case nir_intrinsic_load_input:
       visit_load_input(ctx, instr);
       break;
+   case nir_intrinsic_load_per_vertex_input:
+      visit_load_per_vertex_input(ctx, instr);
+      break;
    case nir_intrinsic_load_ubo:
       visit_load_ubo(ctx, instr);
       break;
@@ -6605,6 +6829,35 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
      break;
   }
+  case nir_intrinsic_load_invocation_id: {
+     assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
+     Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+     if (ctx->options->chip_class >= GFX10)
+        bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id));
+     else
+        bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
+     break;
+  }
+  case nir_intrinsic_load_primitive_id: {
+     assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
+     Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+     bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
+     break;
+  }
+  case nir_intrinsic_emit_vertex_with_counter: {
+     visit_emit_vertex_with_counter(ctx, instr);
+     break;
+  }
+  case nir_intrinsic_end_primitive_with_counter: {
+     Temp gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand((8u << 16) | 16u));
+     unsigned stream = nir_intrinsic_stream_id(instr);
+     bld.sopp(aco_opcode::s_sendmsg, bld.m0(gs_wave_id), -1, sendmsg_gs(true, false, stream));
+     break;
+  }
+  case nir_intrinsic_set_vertex_count: {
+     /* unused, the HW keeps track of this for us */
+     break;
+  }
   default:
      fprintf(stderr, "Unimplemented intrinsic instr: ");
      nir_print_instr(&instr->instr, stderr);
@@ -8095,7 +8348,7 @@ static void visit_cf_list(isel_context *ctx,
 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
 {
    int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
-   uint64_t mask = ctx->vs_output.mask[slot];
+   uint64_t mask = ctx->vsgs_output.mask[slot];
    if (!is_pos && !mask)
       return;
    if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
@@ -8104,7 +8357,7 @@ static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *nex
    exp->enabled_mask = mask;
    for (unsigned i = 0; i < 4; ++i) {
       if (mask & (1 << i))
-         exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
+         exp->operands[i] = Operand(ctx->vsgs_output.outputs[slot][i]);
      else
         exp->operands[i] = Operand(v1);
   }
@@ -8127,23 +8380,23 @@ static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
    exp->enabled_mask = 0;
    for (unsigned i = 0; i < 4; ++i)
       exp->operands[i] = Operand(v1);
-   if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
-      exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
+   if (ctx->vsgs_output.mask[VARYING_SLOT_PSIZ]) {
+      exp->operands[0] = Operand(ctx->vsgs_output.outputs[VARYING_SLOT_PSIZ][0]);
      exp->enabled_mask |= 0x1;
   }
-   if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
-      exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
+   if (ctx->vsgs_output.mask[VARYING_SLOT_LAYER]) {
+      exp->operands[2] = Operand(ctx->vsgs_output.outputs[VARYING_SLOT_LAYER][0]);
      exp->enabled_mask |= 0x4;
   }
-   if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
+   if (ctx->vsgs_output.mask[VARYING_SLOT_VIEWPORT]) {
      if (ctx->options->chip_class < GFX9) {
-         exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
+         exp->operands[3] = Operand(ctx->vsgs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
         exp->enabled_mask |= 0x8;
      } else {
         Builder bld(ctx->program, ctx->block);
         Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
-                             Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
+                             Operand(ctx->vsgs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
        if (exp->operands[2].isTemp())
           out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
 
@@ -8163,13 +8416,13 @@ static void create_vs_exports(isel_context *ctx)
    radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
 
    if (outinfo->export_prim_id) {
-      ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
-      ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id);
+      ctx->vsgs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
+      ctx->vsgs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id);
   }
   if (ctx->options->key.has_multiview_view_index) {
-      ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
-      ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
+      ctx->vsgs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
+      ctx->vsgs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
   }
 
   /* the order these position exports are created is important */
@@ -8219,7 +8472,7 @@ static void emit_stream_output(isel_context *ctx,
    bool all_undef = true;
    assert(ctx->stage == vertex_vs);
    for (unsigned i = 0; i < num_comps; i++) {
-      out[i] = ctx->vs_output.outputs[loc][start + i];
+      out[i] = ctx->vsgs_output.outputs[loc][start + i];
      all_undef = all_undef && !out[i].id();
   }
   if (all_undef)
@@ -8239,7 +8492,7 @@ static void emit_stream_output(isel_context *ctx,
    Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, count)};
    aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
    for (int i = 0; i < count; ++i)
-      vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u);
+      vec->operands[i] = (ctx->vsgs_output.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u);
   vec->definitions[0] = Definition(write_data);
   ctx->block->instructions.emplace_back(std::move(vec));
 
@@ -8477,7 +8730,7 @@ void select_program(Program *program,
    if_context ic;
    if (shader_count >= 2) {
       Builder bld(ctx.program, ctx.block);
-      Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
+      Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | (i * 8u)));
      Temp thread_id = emit_mbcnt(&ctx, bld.def(v1));
      Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id);
 
@@ -8486,7 +8739,8 @@ void select_program(Program *program,
 
    if (i) {
       Builder bld(ctx.program, ctx.block);
-      bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
+      assert(ctx.stage == vertex_geometry_gs);
+      bld.barrier(aco_opcode::p_memory_barrier_shared);
      bld.sopp(aco_opcode::s_barrier);
   }
 
@@ -8496,11 +8750,17 @@ void select_program(Program *program,
    nir_function_impl *func = nir_shader_get_entrypoint(nir);
    visit_cf_list(&ctx, &func->body);
 
-   if (ctx.program->info->so.num_outputs/*&& !ctx->is_gs_copy_shader */)
+   if (ctx.program->info->so.num_outputs && ctx.stage == vertex_vs)
      emit_streamout(&ctx, 0);
 
-   if (ctx.stage == vertex_vs)
+   if (ctx.stage == vertex_vs) {
      create_vs_exports(&ctx);
+   } else if (nir->info.stage == MESA_SHADER_GEOMETRY && ctx.stage == vertex_geometry_gs) {
+      Builder bld(ctx.program, ctx.block);
+      Temp gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
+      bld.barrier(aco_opcode::p_memory_barrier_gs_data);
+      bld.sopp(aco_opcode::s_sendmsg, bld.m0(gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
+   }
 
   if (shader_count >= 2) {
      begin_divergent_if_else(&ctx, &ic);
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 8a461e7e929..64eb408263a 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -39,7 +39,7 @@ namespace aco {
 
-struct vs_output_state {
+struct ge_output_state {
    uint8_t mask[VARYING_SLOT_VAR31 + 1];
    Temp outputs[VARYING_SLOT_VAR31 + 1][4];
 };
 
@@ -74,19 +74,22 @@ struct isel_context {
 
    Temp arg_temps[AC_MAX_ARGS];
 
-   /* inputs common for merged stages */
-   Temp merged_wave_info = Temp(0, s1);
-
    /* FS inputs */
    Temp persp_centroid, linear_centroid;
 
    /* VS inputs */
    bool needs_instance_id;
 
+   /* gathered information */
+   uint64_t input_masks[MESA_SHADER_COMPUTE];
+   uint64_t output_masks[MESA_SHADER_COMPUTE];
+
    /* VS output information */
    unsigned num_clip_distances;
    unsigned num_cull_distances;
-   vs_output_state vs_output;
+
+   /* VS or GS output information */
+   ge_output_state vsgs_output;
 };
 
 Temp get_arg(isel_context *ctx, struct ac_arg arg)
@@ -298,6 +301,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
                case nir_intrinsic_load_sample_id:
                case nir_intrinsic_load_sample_mask_in:
                case nir_intrinsic_load_input:
+               case nir_intrinsic_load_per_vertex_input:
                case nir_intrinsic_load_vertex_id:
                case nir_intrinsic_load_vertex_id_zero_base:
                case nir_intrinsic_load_barycentric_sample:
@@ -357,6 +361,8 @@ void init_context(isel_context *ctx, nir_shader *shader)
                case nir_intrinsic_shared_atomic_exchange:
                case nir_intrinsic_shared_atomic_comp_swap:
                case nir_intrinsic_load_scratch:
+               case nir_intrinsic_load_invocation_id:
+               case nir_intrinsic_load_primitive_id:
                   type = RegType::vgpr;
                   break;
                case nir_intrinsic_shuffle:
@@ -664,63 +670,68 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir)
    }
 
    nir_foreach_variable(variable, &nir->outputs) {
-      variable->data.driver_location = variable->data.location * 4;
+      if (ctx->stage == vertex_geometry_gs)
+         variable->data.driver_location = util_bitcount64(ctx->output_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4;
+      else
+         variable->data.driver_location = variable->data.location * 4;
    }
 
-   radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
+   if (ctx->stage == vertex_vs) {
+      radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
 
-   memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
-          sizeof(outinfo->vs_output_param_offset));
+      memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
+             sizeof(outinfo->vs_output_param_offset));
 
-   ctx->needs_instance_id = ctx->program->info->vs.needs_instance_id;
+      ctx->needs_instance_id = ctx->program->info->vs.needs_instance_id;
 
-   bool export_clip_dists = ctx->options->key.vs_common_out.export_clip_dists;
+      bool export_clip_dists = ctx->options->key.vs_common_out.export_clip_dists;
 
-   outinfo->param_exports = 0;
-   int pos_written = 0x1;
-   if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
-      pos_written |= 1 << 1;
+      outinfo->param_exports = 0;
+      int pos_written = 0x1;
+      if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
+         pos_written |= 1 << 1;
 
-   nir_foreach_variable(variable, &nir->outputs)
-   {
-      int idx = variable->data.location;
-      unsigned slots = variable->type->count_attribute_slots(false);
-      if (variable->data.compact) {
-         unsigned component_count = variable->data.location_frac + variable->type->length;
-         slots = (component_count + 3) / 4;
-      }
-
-      if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
-          ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
-         for (unsigned i = 0; i < slots; i++) {
-            if (outinfo->vs_output_param_offset[idx + i] == AC_EXP_PARAM_UNDEFINED)
-               outinfo->vs_output_param_offset[idx + i] = outinfo->param_exports++;
+      uint64_t mask = ctx->output_masks[nir->info.stage];
+      while (mask) {
+         int idx = u_bit_scan64(&mask);
+         if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
+             ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
+            if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
+               outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
            }
        }
-   }
 
-   if (outinfo->writes_layer &&
-       outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
-      /* when ctx->options->key.has_multiview_view_index = true, the layer
-       * variable isn't declared in NIR and it's isel's job to get the layer */
-      outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
-   }
+      if (outinfo->writes_layer &&
+          outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
+         /* when ctx->options->key.has_multiview_view_index = true, the layer
+          * variable isn't declared in NIR and it's isel's job to get the layer */
+         outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
+      }
 
-   if (outinfo->export_prim_id) {
-      assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
-      outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
-   }
+      if (outinfo->export_prim_id) {
+         assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
+         outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
+      }
 
-   ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
-   ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
+      ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
+      ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
 
-   assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
+      assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
 
-   if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
-      pos_written |= 1 << 2;
-   if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
-      pos_written |= 1 << 3;
+      if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
+         pos_written |= 1 << 2;
+      if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
+         pos_written |= 1 << 3;
 
-   outinfo->pos_exports = util_bitcount(pos_written);
+      outinfo->pos_exports = util_bitcount(pos_written);
+   } else if (ctx->stage == vertex_geometry_gs) {
+      /* TODO: radv_nir_shader_info_pass() already sets this but it's larger
+       * than it needs to be in order to set it better, we have to improve
+       * radv_nir_shader_info_pass() because gfx9_get_gs_info() uses
+       * esgs_itemsize and has to be done before compilation
+       */
+      /* radv_es_output_info *outinfo = &ctx->program->info->vs.es_info;
+      outinfo->esgs_itemsize = util_bitcount64(ctx->output_masks[nir->info.stage]) * 16u; */
+   }
 }
 
 void
@@ -744,11 +755,66 @@ setup_variables(isel_context *ctx, nir_shader *nir)
       setup_vs_variables(ctx, nir);
       break;
    }
+   case MESA_SHADER_GEOMETRY: {
+      assert(ctx->stage == vertex_geometry_gs);
+      nir_foreach_variable(variable, &nir->inputs) {
+         variable->data.driver_location = util_bitcount64(ctx->input_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4;
+      }
+      nir_foreach_variable(variable, &nir->outputs) {
+         variable->data.driver_location = variable->data.location * 4;
+      }
+      ctx->program->info->gs.es_type = MESA_SHADER_VERTEX; /* tesselation shaders are not yet supported */
+      break;
+   }
    default:
       unreachable("Unhandled shader stage.");
    }
 }
 
+void
+get_io_masks(isel_context *ctx, unsigned shader_count, struct nir_shader *const *shaders)
+{
+   for (unsigned i = 0; i < shader_count; i++) {
+      nir_shader *nir = shaders[i];
+      if (nir->info.stage == MESA_SHADER_COMPUTE)
+         continue;
+
+      uint64_t output_mask = 0;
+      nir_foreach_variable(variable, &nir->outputs) {
+         const glsl_type *type = variable->type;
+         if (nir_is_per_vertex_io(variable, nir->info.stage))
+            type = type->fields.array;
+         unsigned slots = type->count_attribute_slots(false);
+         if (variable->data.compact) {
+            unsigned component_count = variable->data.location_frac + type->length;
+            slots = (component_count + 3) / 4;
+         }
+         output_mask |= ((1ull << slots) - 1) << variable->data.location;
+      }
+
+      uint64_t input_mask = 0;
+      nir_foreach_variable(variable, &nir->inputs) {
+         const glsl_type *type = variable->type;
+         if (nir_is_per_vertex_io(variable, nir->info.stage))
+            type = type->fields.array;
+         unsigned slots = type->count_attribute_slots(false);
+         if (variable->data.compact) {
+            unsigned component_count = variable->data.location_frac + type->length;
+            slots = (component_count + 3) / 4;
+         }
+         input_mask |= ((1ull << slots) - 1) << variable->data.location;
+      }
+
+      ctx->output_masks[nir->info.stage] |= output_mask;
+      if (i + 1 < shader_count)
+         ctx->input_masks[shaders[i + 1]->info.stage] |= output_mask;
+
+      ctx->input_masks[nir->info.stage] |= input_mask;
+      if (i)
+         ctx->output_masks[shaders[i - 1]->info.stage] |= input_mask;
+   }
+}
+
 isel_context
 setup_isel_context(Program* program,
                    unsigned shader_count,
@@ -781,12 +847,16 @@ setup_isel_context(Program* program,
          unreachable("Shader stage not implemented");
       }
    }
+   bool gfx9_plus = args->options->chip_class >= GFX9;
+   bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10;
    if (program->stage == sw_vs)
       program->stage |= hw_vs;
    else if (program->stage == sw_fs)
      program->stage |= hw_fs;
   else if (program->stage == sw_cs)
      program->stage |= hw_cs;
+   else if (program->stage == (sw_vs | sw_gs) && gfx9_plus && !ngg)
+      program->stage |= hw_gs;
   else
      unreachable("Shader stage not implemented");
 
@@ -833,6 +903,8 @@ setup_isel_context(Program* program,
    ctx.options = args->options;
    ctx.stage = program->stage;
 
+   get_io_masks(&ctx, shader_count, shaders);
+
    for (unsigned i = 0; i < shader_count; i++) {
       nir_shader *nir = shaders[i];
 
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 5fa9e1cb869..24d1acf2b79 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -108,7 +108,12 @@ enum barrier_interaction : uint8_t {
    barrier_image = 0x2,
    barrier_atomic = 0x4,
    barrier_shared = 0x8,
-   barrier_count = 4,
+   /* used for geometry shaders to ensure vertex data writes are before the
+    * GS_DONE s_sendmsg. */
+   barrier_gs_data = 0x10,
+   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
+   barrier_gs_sendmsg = 0x20,
+   barrier_count = 6,
 };
 
 enum fp_round {
@@ -975,25 +980,7 @@ static inline bool is_phi(aco_ptr& instr)
    return is_phi(instr.get());
 }
 
-constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
-{
-   switch (instr->format) {
-   case Format::SMEM:
-      return static_cast(instr)->barrier;
-   case Format::MUBUF:
-      return static_cast(instr)->barrier;
-   case Format::MIMG:
-      return static_cast(instr)->barrier;
-   case Format::FLAT:
-   case Format::GLOBAL:
-   case Format::SCRATCH:
-      return static_cast(instr)->barrier;
-   case Format::DS:
-      return barrier_shared;
-   default:
-      return barrier_none;
-   }
-}
+barrier_interaction get_barrier_interaction(Instruction* instr);
 
 bool is_dead(const std::vector& uses, Instruction *instr);
 
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index db4a349bcb9..d537133a6dc 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -222,11 +222,13 @@ opcode("p_cbranch", format=Format.PSEUDO_BRANCH)
 opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH)
 opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH)
 
-opcode("p_memory_barrier_all", format=Format.PSEUDO_BARRIER)
+opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared
 opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER)
 opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER)
 opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER)
 opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER)
+opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER)
+opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER)
 
 opcode("p_spill")
 opcode("p_reload")
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index c17845c082d..d3304a996f5 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -159,6 +159,10 @@ static void print_barrier_reorder(bool can_reorder, barrier_interaction barrier,
       fprintf(output, " atomic");
    if (barrier & barrier_shared)
       fprintf(output, " shared");
+   if (barrier & barrier_gs_data)
+      fprintf(output, " gs_data");
+   if (barrier & barrier_gs_sendmsg)
+      fprintf(output, " gs_sendmsg");
 }
 
 static void print_instr_format_specific(struct Instruction *instr, FILE *output)
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 0a8d5af8c78..d5f2d913a65 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "aco_ir.h"
+#include "aco_builder.h"
 #include
 #include
 
@@ -111,6 +112,74 @@ static bool is_spill_reload(aco_ptr& instr)
    return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
 }
 
+bool can_reorder(Instruction* candidate)
+{
+   switch (candidate->format) {
+   case Format::SMEM:
+      return static_cast(candidate)->can_reorder;
+   case Format::MUBUF:
+      return static_cast(candidate)->can_reorder;
+   case Format::MIMG:
+      return static_cast(candidate)->can_reorder;
+   case Format::MTBUF:
+      return static_cast(candidate)->can_reorder;
+   case Format::FLAT:
+   case Format::GLOBAL:
+   case Format::SCRATCH:
+      return static_cast(candidate)->can_reorder;
+   default:
+      return true;
+   }
+}
+
+bool is_gs_or_done_sendmsg(Instruction *instr)
+{
+   if (instr->opcode == aco_opcode::s_sendmsg) {
+      uint16_t imm = static_cast(instr)->imm;
+      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
+             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
+   }
+   return false;
+}
+
+bool is_done_sendmsg(Instruction *instr)
+{
+   if (instr->opcode == aco_opcode::s_sendmsg) {
+      uint16_t imm = static_cast(instr)->imm;
+      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
+   }
+   return false;
+}
+
+barrier_interaction get_barrier_interaction(Instruction* instr)
+{
+   switch (instr->format) {
+   case Format::SMEM:
+      return static_cast(instr)->barrier;
+   case Format::MUBUF:
+      return static_cast(instr)->barrier;
+   case Format::MIMG:
+      return static_cast(instr)->barrier;
+   case Format::MTBUF:
+      return static_cast(instr)->barrier;
+   case Format::FLAT:
+   case Format::GLOBAL:
+   case Format::SCRATCH:
+      return static_cast(instr)->barrier;
+   case Format::DS:
+      return barrier_shared;
+   case Format::SOPP:
+      if (is_done_sendmsg(instr))
+         return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg);
+      else if (is_gs_or_done_sendmsg(instr))
+         return barrier_gs_sendmsg;
+      else
+         return barrier_none;
+   default:
+      return barrier_none;
+   }
+}
+
 bool can_move_instr(aco_ptr& instr, Instruction* current, int moving_interaction)
 {
    /* don't move exports so that they stay closer together */
@@ -127,26 +196,11 @@ bool can_move_instr(aco_ptr& instr, Instruction* current, int movin
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
-         bool can_reorder = false;
-         switch (current->format) {
-         case Format::SMEM:
-            can_reorder = static_cast(current)->can_reorder;
-            break;
-         case Format::MUBUF:
-            can_reorder = static_cast(current)->can_reorder;
-            break;
-         case Format::MIMG:
-            can_reorder = static_cast(current)->can_reorder;
-            break;
-         case Format::FLAT:
-         case Format::GLOBAL:
-         case Format::SCRATCH:
-            can_reorder = static_cast(current)->can_reorder;
-            break;
-         default:
-            break;
-         }
-         return can_reorder && moving_interaction == barrier_none;
+         return can_reorder(current) && moving_interaction == barrier_none;
+      } else if (is_gs_or_done_sendmsg(instr.get())) {
+         int interaction = get_barrier_interaction(current);
+         interaction |= moving_interaction;
+         return !(interaction & get_barrier_interaction(instr.get()));
      } else {
         return true;
      }
@@ -170,33 +224,17 @@ bool can_move_instr(aco_ptr& instr, Instruction* current, int movin
       return !(interaction & (barrier_image | barrier_buffer));
    case aco_opcode::p_memory_barrier_shared:
       return !(interaction & barrier_shared);
-   case aco_opcode::p_memory_barrier_all:
-      return interaction == barrier_none;
+   case aco_opcode::p_memory_barrier_common:
+      return !(interaction & (barrier_image | barrier_buffer | barrier_shared | barrier_atomic));
+   case aco_opcode::p_memory_barrier_gs_data:
+      return !(interaction & barrier_gs_data);
+   case aco_opcode::p_memory_barrier_gs_sendmsg:
+      return !(interaction & barrier_gs_sendmsg);
   default:
      return false;
   }
 }
 
-bool can_reorder(Instruction* candidate)
-{
-   switch (candidate->format) {
-   case Format::SMEM:
-      return static_cast(candidate)->can_reorder;
-   case Format::MUBUF:
-      return static_cast(candidate)->can_reorder;
-   case Format::MIMG:
-      return static_cast(candidate)->can_reorder;
-   case Format::MTBUF:
-      return static_cast(candidate)->can_reorder;
-   case Format::FLAT:
-   case Format::GLOBAL:
-   case Format::SCRATCH:
-      return static_cast(candidate)->can_reorder;
-   default:
-      return true;
-   }
-}
-
 void schedule_SMEM(sched_ctx& ctx, Block* block, std::vector& register_demand,
                    Instruction* current, int idx)
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index e407c9194c0..4360b675c69 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -474,6 +474,9 @@ radv_shader_compile_to_nir(struct radv_device *device,
 
 	nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 
+	if (nir->info.stage == MESA_SHADER_GEOMETRY && use_aco)
+		nir_lower_gs_intrinsics(nir, true);
+
 	static const nir_lower_tex_options tex_options = {
 		.lower_txp = ~0,
 		.lower_tg4_offsets = true,
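
For reference, a minimal C++ sketch (not part of the patch) of the ES->GS addressing that visit_store_vsgs_output() and visit_load_per_vertex_input() implement above. The helper names are illustrative only; esgs_itemsize, merged_wave_info and gs_vtx_offset stand for the values the patch reads from the shader info and shader arguments, and "slot" is the vec4 varying slot (driver_location / 4).

#include <cstdint>

/* Byte offset at which the VS ("ES") half of a merged GFX9+ wave stores one
 * output component into the ESGS area in LDS. vertex_idx combines the wave
 * index extracted from merged_wave_info with the lane's mbcnt, as the patch
 * does with s_bfe_u32 + v_mbcnt + v_or_b32. */
static uint32_t esgs_store_offset(uint32_t wave_idx, uint32_t wave_size, uint32_t lane,
                                  uint32_t esgs_itemsize, uint32_t slot, uint32_t component)
{
   uint32_t vertex_idx = wave_idx * wave_size + lane; /* lane < wave_size, so OR == ADD */
   return vertex_idx * esgs_itemsize + slot * 16u + component * 4u;
}

/* Byte offset from which the GS half reads the same component: gs_vtx_offset
 * carries a per-vertex dword offset (16 bits per vertex) that the patch
 * extracts with v_bfe_u32 and shifts by 2 to convert to bytes. */
static uint32_t esgs_load_offset(uint32_t gs_vtx_offset_dwords, uint32_t slot, uint32_t component)
{
   return (gs_vtx_offset_dwords << 2) + slot * 16u + component * 4u;
}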