From 22d8822683139a815603a969a54c382ec3378d9e Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 17 Jan 2020 15:14:41 +0100 Subject: [PATCH] aco: implement nir_intrinsic_load_global on GFX6 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit GFX6 doesn't have FLAT instructions, use MUBUF instructions instead. Signed-off-by: Samuel Pitoiset Reviewed-by: Daniel Schürmann Part-of: --- .../compiler/aco_instruction_selection.cpp | 125 +++++++++++++----- 1 file changed, 94 insertions(+), 31 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 9d29fae316c..235cac4e30e 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -4740,40 +4740,103 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode op; if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) { bool global = ctx->options->chip_class >= GFX9; - aco_opcode op; - switch (num_bytes) { - case 4: - op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword; - break; - case 8: - op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2; - break; - case 12: - op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3; - break; - case 16: - op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4; - break; - default: - unreachable("load_global not implemented for this size."); - } - aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)}; - flat->operands[0] = Operand(addr); - flat->operands[1] = Operand(s1); - flat->glc = glc; - flat->dlc = dlc; - flat->barrier = barrier_buffer; - if (dst.type() == RegType::sgpr) { - Temp vec = bld.tmp(RegType::vgpr, dst.size()); - flat->definitions[0] = Definition(vec); - ctx->block->instructions.emplace_back(std::move(flat)); - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); + if (ctx->options->chip_class >= GFX7) { + aco_opcode op; + switch (num_bytes) { + case 4: + op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword; + break; + case 8: + op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2; + break; + case 12: + op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3; + break; + case 16: + op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4; + break; + default: + unreachable("load_global not implemented for this size."); + } + + aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)}; + flat->operands[0] = Operand(addr); + flat->operands[1] = Operand(s1); + flat->glc = glc; + flat->dlc = dlc; + flat->barrier = barrier_buffer; + + if (dst.type() == RegType::sgpr) { + Temp vec = bld.tmp(RegType::vgpr, dst.size()); + flat->definitions[0] = Definition(vec); + ctx->block->instructions.emplace_back(std::move(flat)); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); + } else { + flat->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(flat)); + } + emit_split_vector(ctx, dst, num_components); } else { - flat->definitions[0] = Definition(dst); - ctx->block->instructions.emplace_back(std::move(flat)); + assert(ctx->options->chip_class == GFX6); + + /* GFX6 doesn't support loading vec3, expand to vec4. */ + num_bytes = num_bytes == 12 ? 16 : num_bytes; + + aco_opcode op; + switch (num_bytes) { + case 4: + op = aco_opcode::buffer_load_dword; + break; + case 8: + op = aco_opcode::buffer_load_dwordx2; + break; + case 16: + op = aco_opcode::buffer_load_dwordx4; + break; + default: + unreachable("load_global not implemented for this size."); + } + + Temp rsrc = get_gfx6_global_rsrc(bld, addr); + + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; + mubuf->operands[0] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); + mubuf->operands[1] = Operand(rsrc); + mubuf->operands[2] = Operand(0u); + mubuf->glc = glc; + mubuf->dlc = false; + mubuf->offset = 0; + mubuf->addr64 = addr.type() == RegType::vgpr; + mubuf->disable_wqm = false; + mubuf->barrier = barrier_buffer; + aco_ptr instr = std::move(mubuf); + + /* expand vector */ + if (dst.size() == 3) { + Temp vec = bld.tmp(v4); + instr->definitions[0] = Definition(vec); + bld.insert(std::move(instr)); + emit_split_vector(ctx, vec, 4); + + instr.reset(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, 3, 1)); + instr->operands[0] = Operand(emit_extract_vector(ctx, vec, 0, v1)); + instr->operands[1] = Operand(emit_extract_vector(ctx, vec, 1, v1)); + instr->operands[2] = Operand(emit_extract_vector(ctx, vec, 2, v1)); + } + + if (dst.type() == RegType::sgpr) { + Temp vec = bld.tmp(RegType::vgpr, dst.size()); + instr->definitions[0] = Definition(vec); + bld.insert(std::move(instr)); + expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); + } else { + instr->definitions[0] = Definition(dst); + bld.insert(std::move(instr)); + emit_split_vector(ctx, dst, num_components); + } } - emit_split_vector(ctx, dst, num_components); } else { switch (num_bytes) { case 4: -- 2.30.2