X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_reduce_assign.cpp;h=7bf7a6c3b685b9d8ddcf756782100f2200a08c0e;hb=e6366f9094326a2841058678174289827f504905;hp=66a3ec64c044e7583ac32e68dbb98e593bb37400;hpb=3865448012b16d0e98e706e1b462242a754436c7;p=mesa.git diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp index 66a3ec64c04..7bf7a6c3b68 100644 --- a/src/amd/compiler/aco_reduce_assign.cpp +++ b/src/amd/compiler/aco_reduce_assign.cpp @@ -117,11 +117,22 @@ void setup_reduce_temp(Program* program) /* same as before, except for the vector temporary instead of the reduce temporary */ unsigned cluster_size = static_cast(instr)->cluster_size; bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || - op == fmin64 || op == fmax64; + op == fmin64 || op == fmax64 || op == umin64 || + op == umax64 || op == imin64 || op == imax64 || + op == imul64; + bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 || + op == imul16 || op == imax16 || op == imin16 || op == umin16 || + op == iadd64; + if (program->chip_class >= GFX10 && cluster_size == 64) need_vtmp = true; + if (program->chip_class >= GFX10 && gfx10_need_vtmp) + need_vtmp = true; + if (program->chip_class <= GFX7) + need_vtmp = true; need_vtmp |= cluster_size == 32; + vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0; if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) { vtmp = {program->allocateId(), vtmp.regClass()}; @@ -147,20 +158,28 @@ void setup_reduce_temp(Program* program) instr->definitions[1] = bld.def(s2); /* scalar identity temporary */ - bool need_sitmp = program->chip_class >= GFX10 && cluster_size == 64; + bool need_sitmp = (program->chip_class <= GFX7 || program->chip_class >= GFX10) && instr->opcode != aco_opcode::p_reduce; if (instr->opcode == aco_opcode::p_exclusive_scan) { need_sitmp |= - (op == imin32 || op == imin64 || op == imax32 || op == imax64 || - op == fmin32 || op == fmin64 || op == fmax32 || op == fmax64 || - op == fmul64); + (op == imin8 || op == imin16 || op == imin32 || op == imin64 || + op == imax8 || op == imax16 || op == imax32 || op == imax64 || + op == fmin16 || op == fmin32 || op == fmin64 || + op == fmax16 || op == fmax32 || op == fmax64 || + op == fmul16 || op == fmul64); } if (need_sitmp) { instr->definitions[2] = bld.def(RegClass(RegType::sgpr, instr->operands[0].size())); } /* vcc clobber */ - if (op == iadd32 && program->chip_class < GFX9) - instr->definitions[4] = Definition(vcc, s2); + bool clobber_vcc = false; + if ((op == iadd32 || op == imul64) && program->chip_class < GFX9) + clobber_vcc = true; + if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64) + clobber_vcc = true; + + if (clobber_vcc) + instr->definitions[4] = Definition(vcc, bld.lm); } } }