if (cluster_size == 64) {
for (unsigned i = 0; i < src.size(); i++)
- bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
- emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
+ bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+ emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
}
} else if (cluster_size == 32) {
for (unsigned i = 0; i < src.size(); i++)
instr->definitions[1] = bld.def(s2);
/* scalar identity temporary */
- bool need_sitmp = program->chip_class >= GFX10 && cluster_size == 64;
+ bool need_sitmp = (program->chip_class <= GFX7 || program->chip_class >= GFX10) && instr->opcode != aco_opcode::p_reduce;
if (instr->opcode == aco_opcode::p_exclusive_scan) {
need_sitmp |=
(op == imin32 || op == imin64 || op == imax32 || op == imax64 ||