From e22567089c829765d0b78a87d96f7dc5af9e10cd Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Thu, 28 May 2020 09:09:49 +0200 Subject: [PATCH] aco: sign-extend input/identity for 32-bit reduce ops on GFX10 Because some 16-bit instructions are already VOP3 on GFX10, we use the 32-bit variants to remove the temporary VGPR and to use DPP with the arithmetic instructions. Signed-off-by: Samuel Pitoiset Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 11f7401213f..1788f90b4c6 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -529,6 +529,20 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig sdwa->sel[0] = sdwa_ubyte; sdwa->dst_sel = sdwa_udword; bld.insert(std::move(sdwa)); + } else if (src.regClass() == v2b) { + if (ctx->program->chip_class >= GFX10 && + (reduce_op == iadd16 || reduce_op == imax16 || + reduce_op == imin16 || reduce_op == umin16 || reduce_op == umax16)) { + aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; + sdwa->operands[0] = Operand(PhysReg{tmp}, v1); + sdwa->definitions[0] = Definition(PhysReg{tmp}, v1); + if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16) + sdwa->sel[0] = sdwa_sword; + else + sdwa->sel[0] = sdwa_uword; + sdwa->dst_sel = sdwa_udword; + bld.insert(std::move(sdwa)); + } } bool reduction_needs_last_op = false; -- 2.30.2