From 36e7a5f5b906eea6d1c222336492d16bcecc58f7 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 17 Jan 2020 16:11:55 +0100 Subject: [PATCH] aco: implement nir_intrinsic_global_atomic_* on GFX6 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit GFX6 doesn't have FLAT instructions, use MUBUF instructions instead. Signed-off-by: Samuel Pitoiset Reviewed-by: Daniel Schürmann Part-of: --- .../compiler/aco_instruction_selection.cpp | 192 ++++++++++++------ 1 file changed, 133 insertions(+), 59 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 235cac4e30e..d87f718eb3e 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5012,75 +5012,149 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) } Builder bld(ctx->program, ctx->block); - Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + if (ctx->options->chip_class >= GFX7) + addr = as_vgpr(ctx, addr); + if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap) data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2), get_ssa_temp(ctx, instr->src[2].ssa), data); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bool global = ctx->options->chip_class >= GFX9; aco_opcode op32, op64; - switch (instr->intrinsic) { - case nir_intrinsic_global_atomic_add: - op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add; - op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2; - break; - case nir_intrinsic_global_atomic_imin: - op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin; - op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2; - break; - case nir_intrinsic_global_atomic_umin: - op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin; - op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2; - break; - case nir_intrinsic_global_atomic_imax: - op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax; - op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2; - break; - case nir_intrinsic_global_atomic_umax: - op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax; - op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2; - break; - case nir_intrinsic_global_atomic_and: - op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and; - op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2; - break; - case nir_intrinsic_global_atomic_or: - op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or; - op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2; - break; - case nir_intrinsic_global_atomic_xor: - op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor; - op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2; - break; - case nir_intrinsic_global_atomic_exchange: - op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap; - op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2; - break; - case nir_intrinsic_global_atomic_comp_swap: - op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap; - op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2; - break; - default: - unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + + if (ctx->options->chip_class >= GFX7) { + bool global = ctx->options->chip_class >= GFX9; + switch (instr->intrinsic) { + case nir_intrinsic_global_atomic_add: + op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add; + op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2; + break; + case nir_intrinsic_global_atomic_imin: + op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin; + op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2; + break; + case nir_intrinsic_global_atomic_umin: + op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin; + op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2; + break; + case nir_intrinsic_global_atomic_imax: + op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax; + op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2; + break; + case nir_intrinsic_global_atomic_umax: + op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax; + op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2; + break; + case nir_intrinsic_global_atomic_and: + op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and; + op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2; + break; + case nir_intrinsic_global_atomic_or: + op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or; + op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2; + break; + case nir_intrinsic_global_atomic_xor: + op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor; + op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2; + break; + case nir_intrinsic_global_atomic_exchange: + op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap; + op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2; + break; + case nir_intrinsic_global_atomic_comp_swap: + op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap; + op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + } + + aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; + aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)}; + flat->operands[0] = Operand(addr); + flat->operands[1] = Operand(s1); + flat->operands[2] = Operand(data); + if (return_previous) + flat->definitions[0] = Definition(dst); + flat->glc = return_previous; + flat->dlc = false; /* Not needed for atomics */ + flat->offset = 0; + flat->disable_wqm = true; + flat->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(flat)); + } else { + assert(ctx->options->chip_class == GFX6); + + switch (instr->intrinsic) { + case nir_intrinsic_global_atomic_add: + op32 = aco_opcode::buffer_atomic_add; + op64 = aco_opcode::buffer_atomic_add_x2; + break; + case nir_intrinsic_global_atomic_imin: + op32 = aco_opcode::buffer_atomic_smin; + op64 = aco_opcode::buffer_atomic_smin_x2; + break; + case nir_intrinsic_global_atomic_umin: + op32 = aco_opcode::buffer_atomic_umin; + op64 = aco_opcode::buffer_atomic_umin_x2; + break; + case nir_intrinsic_global_atomic_imax: + op32 = aco_opcode::buffer_atomic_smax; + op64 = aco_opcode::buffer_atomic_smax_x2; + break; + case nir_intrinsic_global_atomic_umax: + op32 = aco_opcode::buffer_atomic_umax; + op64 = aco_opcode::buffer_atomic_umax_x2; + break; + case nir_intrinsic_global_atomic_and: + op32 = aco_opcode::buffer_atomic_and; + op64 = aco_opcode::buffer_atomic_and_x2; + break; + case nir_intrinsic_global_atomic_or: + op32 = aco_opcode::buffer_atomic_or; + op64 = aco_opcode::buffer_atomic_or_x2; + break; + case nir_intrinsic_global_atomic_xor: + op32 = aco_opcode::buffer_atomic_xor; + op64 = aco_opcode::buffer_atomic_xor_x2; + break; + case nir_intrinsic_global_atomic_exchange: + op32 = aco_opcode::buffer_atomic_swap; + op64 = aco_opcode::buffer_atomic_swap_x2; + break; + case nir_intrinsic_global_atomic_comp_swap: + op32 = aco_opcode::buffer_atomic_cmpswap; + op64 = aco_opcode::buffer_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + } + + Temp rsrc = get_gfx6_global_rsrc(bld, addr); + + aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; + + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; + mubuf->operands[0] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); + mubuf->operands[1] = Operand(rsrc); + mubuf->operands[2] = Operand(0u); + mubuf->operands[3] = Operand(data); + if (return_previous) + mubuf->definitions[0] = Definition(dst); + mubuf->glc = return_previous; + mubuf->dlc = false; + mubuf->offset = 0; + mubuf->addr64 = addr.type() == RegType::vgpr; + mubuf->disable_wqm = true; + mubuf->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(mubuf)); } - aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; - aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)}; - flat->operands[0] = Operand(addr); - flat->operands[1] = Operand(s1); - flat->operands[2] = Operand(data); - if (return_previous) - flat->definitions[0] = Definition(dst); - flat->glc = return_previous; - flat->dlc = false; /* Not needed for atomics */ - flat->offset = 0; - flat->disable_wqm = true; - flat->barrier = barrier_buffer; - ctx->program->needs_exact = true; - ctx->block->instructions.emplace_back(std::move(flat)); } void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { -- 2.30.2