From 637c5a1dd9bd56da04d48b8c92c1c40b12ae76ab Mon Sep 17 00:00:00 2001
From: =?utf8?q?Timur=20Krist=C3=B3f?=
Date: Wed, 27 Nov 2019 16:59:11 +0100
Subject: [PATCH] aco/wave32: Fix reductions.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Timur Kristóf
Reviewed-by: Daniel Schürmann
---
 .../compiler/aco_instruction_selection.cpp    |  8 +--
 .../aco_instruction_selection_setup.cpp       |  3 +-
 src/amd/compiler/aco_lower_to_hw_instr.cpp    | 64 ++++++++++++-------
 3 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 84c88e4eaa5..0f89cb1aee5 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -5232,15 +5232,15 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te
       //subgroupClusteredOr(val, 4) -> wqm(val & exec)
       return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
                       bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
-   } else if (op == nir_op_iand && cluster_size == 64) {
+   } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
       //subgroupAnd(val) -> (exec & ~val) == 0
       Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
       return bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp));
-   } else if (op == nir_op_ior && cluster_size == 64) {
+   } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
       //subgroupOr(val) -> (val & exec) != 0
       Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
       return bool_to_vector_condition(ctx, tmp);
-   } else if (op == nir_op_ixor && cluster_size == 64) {
+   } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
       //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
       Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
       tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
@@ -5839,7 +5839,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
       unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
                               nir_intrinsic_cluster_size(instr) : 0;
-      cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64));
+      cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));

       if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
          emit_uniform_subgroup(ctx, instr, src);
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 47f5778822f..469aebbb8d9 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -390,8 +390,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
                if (intrinsic->dest.ssa.bit_size == 1) {
                   size = lane_mask_size;
                   type = RegType::sgpr;
-               } else if (nir_intrinsic_cluster_size(intrinsic) == 0 ||
-                          !ctx->divergent_vals[intrinsic->dest.ssa.index]) {
+               } else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) {
                   type = RegType::sgpr;
                } else {
                   type = RegType::vgpr;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index e9c2d66d823..19e0f598074 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -412,7 +412,8 @@ uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
 void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, PhysReg tmp,
                     PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
 {
-   assert(cluster_size == 64 || op == aco_opcode::p_reduce);
+   assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
+   assert(cluster_size <= ctx->program->wave_size);

    Builder bld(ctx->program, &ctx->instructions);

@@ -462,23 +463,34 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
                   dpp_row_mirror, 0xf, 0xf, false);
       if (cluster_size == 16) break;
-      if (cluster_size == 32) {
+
+      if (ctx->program->chip_class >= GFX10) {
+         /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
+
+         for (unsigned i = 0; i < src.size(); i++)
+            bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
+
+         if (cluster_size == 32 && dst.regClass().type() == RegType::vgpr) {
+            bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
+            exec_restored = true;
+            emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
+            dst_written = true;
+         } else {
+            emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
+         }
+
+         if (cluster_size == 64) {
+            for (unsigned i = 0; i < src.size(); i++)
+               bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+            emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
+         }
+      } else if (cluster_size == 32) {
          for (unsigned i = 0; i < src.size(); i++)
            bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, s1), ds_pattern_bitmode(0x1f, 0, 0x10));
         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
         exec_restored = true;
         emit_op(ctx, dst.physReg(), vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         dst_written = true;
-      } else if (ctx->program->chip_class >= GFX10) {
-         assert(cluster_size == 64);
-         /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
-         for (unsigned i = 0; i < src.size(); i++)
-            bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
-         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
-
-         for (unsigned i = 0; i < src.size(); i++)
-            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
-         emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
       } else {
          assert(cluster_size == 64);
          emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
@@ -504,10 +516,12 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
          }
          bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX));

-         /* fill in the gap in row 2 */
-         for (unsigned i = 0; i < src.size(); i++) {
-            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
-            bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
+         if (ctx->program->wave_size == 64) {
+            /* fill in the gap in row 2 */
+            for (unsigned i = 0; i < src.size(); i++) {
+               bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+               bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
+            }
          }
          std::swap(tmp, vtmp);
       } else {
@@ -523,7 +537,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
       }
       /* fall through */
    case aco_opcode::p_inclusive_scan:
-      assert(cluster_size == 64);
+      assert(cluster_size == ctx->program->wave_size);
       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
                   dpp_row_sr(1), 0xf, 0xf, false, identity);
       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
@@ -544,11 +558,13 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
          }
          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

-         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
-         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
-         for (unsigned i = 0; i < src.size(); i++)
-            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
-         emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
+         if (ctx->program->wave_size == 64) {
+            bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
+            bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
+            for (unsigned i = 0; i < src.size(); i++)
+               bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+            emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
+         }
       } else {
          emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
                      dpp_row_bcast15, 0xa, 0xf, false, identity);
@@ -563,10 +579,10 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
    if (!exec_restored)
       bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));

-   if (op == aco_opcode::p_reduce && cluster_size == 64) {
+   if (op == aco_opcode::p_reduce && dst.regClass().type() == RegType::sgpr) {
       for (unsigned k = 0; k < src.size(); k++) {
          bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{dst.physReg() + k}, s1),
-                  Operand(PhysReg{tmp + k}, v1), Operand(63u));
+                  Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
       }
    } else if (!(dst.physReg() == tmp) && !dst_written) {
       for (unsigned k = 0; k < src.size(); k++) {
-- 
2.30.2
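
An illustration of the central change above, outside the patch itself: the
cluster size is now clamped to the running wave size instead of a hardcoded
64, so a wave32 program never selects a 64-wide reduction path. The sketch
below is a standalone model, not ACO code; it assumes std::bit_ceil (C++20)
as an equivalent of Mesa's util_next_power_of_two(), expands MIN2 inline,
and uses a plain wave_size parameter in place of ctx->program->wave_size.

// clamp_cluster_size.cpp - standalone sketch of the wave32 cluster clamp.
#include <bit>
#include <cstdio>

static unsigned clamp_cluster_size(unsigned cluster_size, unsigned wave_size)
{
   // cluster_size == 0 means "reduce over the whole subgroup"; a nonzero
   // request can never exceed the wave size, and the hardware lowering
   // expects a power of two.
   unsigned cs = cluster_size ? cluster_size : wave_size;
   if (cs > wave_size)
      cs = wave_size;
   return std::bit_ceil(cs); // stands in for util_next_power_of_two()
}

int main()
{
   // With the old hardcoded 64, a wave32 shader would have requested a
   // 64-wide reduction; clamping to wave_size yields 32 instead.
   for (unsigned wave_size : {32u, 64u})
      for (unsigned cs : {0u, 3u, 16u, 64u})
         printf("wave%u cluster_size=%u -> %u\n",
                wave_size, cs, clamp_cluster_size(cs, wave_size));
   return 0;
}

For example, a full-subgroup reduce (cluster_size == 0) or an explicit
64-wide cluster now maps to 32 under wave32, which is what lets the
emit_reduction() asserts in aco_lower_to_hw_instr.cpp hold on GFX10 wave32.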