From 9254fb4fc72ed289ffded28ef067b4582973e90c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Daniel=20Sch=C3=BCrmann?= Date: Wed, 20 Nov 2019 18:57:23 +0100 Subject: [PATCH] aco: don't use a scalar temporary for reductions on GFX10 This patch also adds the scalar temporary for scans on SI/CI Reviewed-by: Rhys Perry --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 4 ++-- src/amd/compiler/aco_reduce_assign.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 38e6030687b..405961b1993 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -481,8 +481,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig if (cluster_size == 64) { for (unsigned i = 0; i < src.size(); i++) - bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u)); - emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size()); + bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u)); + emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size()); } } else if (cluster_size == 32) { for (unsigned i = 0; i < src.size(); i++) diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp index 68a0dc15761..f1015b13316 100644 --- a/src/amd/compiler/aco_reduce_assign.cpp +++ b/src/amd/compiler/aco_reduce_assign.cpp @@ -153,7 +153,7 @@ void setup_reduce_temp(Program* program) instr->definitions[1] = bld.def(s2); /* scalar identity temporary */ - bool need_sitmp = program->chip_class >= GFX10 && cluster_size == 64; + bool need_sitmp = (program->chip_class <= GFX7 || program->chip_class >= GFX10) && instr->opcode != aco_opcode::p_reduce; if (instr->opcode == aco_opcode::p_exclusive_scan) { need_sitmp |= (op == imin32 || op == imin64 || op == imax32 || op == imax64 || -- 2.30.2