X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_optimizer.cpp;h=7b66aa1eeb3de933522ddede65e20b702a2059fa;hb=7cd548d3523b16a0e8f35c078316388016514d31;hp=4a287336d0d5a8872d3cecc1c1e21fdc151ceff2;hpb=a400928f4a64c657c214672a39801130cceaf79b;p=mesa.git diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 4a287336d0d..7b66aa1eeb3 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -82,10 +82,11 @@ enum Label { label_bitwise = 1 << 18, label_minmax = 1 << 19, label_fcmp = 1 << 20, + label_uniform_bool = 1 << 21, }; static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success | label_add_sub | label_bitwise | label_minmax | label_fcmp; -static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f; +static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool; static constexpr uint32_t val_labels = label_constant | label_literal | label_mad; struct ssa_info { @@ -353,6 +354,17 @@ struct ssa_info { return label & label_fcmp; } + void set_uniform_bool(Temp uniform_bool) + { + add_label(label_uniform_bool); + temp = uniform_bool; + } + + bool is_uniform_bool() + { + return label & label_uniform_bool; + } + }; struct opt_ctx { @@ -469,11 +481,24 @@ bool can_accept_constant(aco_ptr& instr, unsigned operand) bool valu_can_accept_literal(opt_ctx& ctx, aco_ptr& instr, unsigned operand) { + /* instructions like v_cndmask_b32 can't take a literal because they always + * read SGPRs */ + if (instr->operands.size() >= 3 && + instr->operands[2].isTemp() && instr->operands[2].regClass().type() == RegType::sgpr) + return false; + // TODO: VOP3 can take a literal on GFX10 return !instr->isSDWA() && !instr->isDPP() && !instr->isVOP3() && operand == 0 && can_accept_constant(instr, operand); } +bool valu_can_accept_vgpr(aco_ptr& instr, unsigned operand) +{ + if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) + return operand != 1; + return true; +} + bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset) { Operand op = instr->operands[op_index]; @@ -496,6 +521,9 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp return false; } + if (add_instr->usesModifiers()) + return false; + for (unsigned i = 0; i < 2; i++) { if (add_instr->operands[i].isConstant()) { *offset = add_instr->operands[i].constantValue(); @@ -520,7 +548,7 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp return false; } -void label_instruction(opt_ctx &ctx, aco_ptr& instr) +void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) { if (instr->isSALU() || instr->isVALU() || instr->format == Format::PSEUDO) { ASSERTED bool all_const = false; @@ -576,7 +604,7 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) /* VALU: propagate neg, abs & inline constants */ else if (instr->isVALU()) { - if (info.is_temp() && info.temp.type() == RegType::vgpr) { + if (info.is_temp() && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) { instr->operands[i].setTemp(info.temp); info = ctx.info[info.temp.id()]; } @@ -657,7 +685,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) Temp base; uint32_t offset; if (i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == instr->operands[i].regClass()) { - if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32) { + if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 || + instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) { if (offset % 4 == 0 && ds->offset0 + (offset >> 2) <= 255 && ds->offset1 + (offset >> 2) <= 255) { @@ -702,6 +731,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) new_instr->operands.back() = Operand(base); if (!smem->definitions.empty()) new_instr->definitions[0] = smem->definitions[0]; + new_instr->can_reorder = smem->can_reorder; + new_instr->barrier = smem->barrier; instr.reset(new_instr); smem = static_cast(instr.get()); } @@ -752,7 +783,7 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) if (vec_op.isConstant()) { if (vec_op.isLiteral()) ctx.info[instr->definitions[i].tempId()].set_literal(vec_op.constantValue()); - else + else if (vec_op.size() == 1) ctx.info[instr->definitions[i].tempId()].set_constant(vec_op.constantValue()); } else { assert(vec_op.isTemp()); @@ -781,7 +812,7 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) if (vec_op.isConstant()) { if (vec_op.isLiteral()) ctx.info[instr->definitions[0].tempId()].set_literal(vec_op.constantValue()); - else + else if (vec_op.size() == 1) ctx.info[instr->definitions[0].tempId()].set_constant(vec_op.constantValue()); } else { assert(vec_op.isTemp()); @@ -796,13 +827,13 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) case aco_opcode::p_as_uniform: if (instr->definitions[0].isFixed()) { /* don't copy-propagate copies into fixed registers */ + } else if (instr->usesModifiers()) { + // TODO } else if (instr->operands[0].isConstant()) { if (instr->operands[0].isLiteral()) ctx.info[instr->definitions[0].tempId()].set_literal(instr->operands[0].constantValue()); - else + else if (instr->operands[0].size() == 1) ctx.info[instr->definitions[0].tempId()].set_constant(instr->operands[0].constantValue()); - } else if (instr->isDPP()) { - // TODO } else if (instr->operands[0].isTemp()) { ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); } else { @@ -846,11 +877,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) } case aco_opcode::v_mul_f32: { /* omod */ /* TODO: try to move the negate/abs modifier to the consumer instead */ - if (instr->isVOP3()) { - VOP3A_instruction *vop3 = static_cast(instr.get()); - if (vop3->abs[0] || vop3->abs[1] || vop3->neg[0] || vop3->neg[1] || vop3->omod || vop3->clamp) - break; - } + if (instr->usesModifiers()) + break; for (unsigned i = 0; i < 2; i++) { if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) { @@ -860,7 +888,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) ctx.info[instr->operands[i].tempId()].set_omod4(); } else if (instr->operands[!i].constantValue() == 0x3f000000) { /* 0.5 */ ctx.info[instr->operands[i].tempId()].set_omod5(); - } else if (instr->operands[!i].constantValue() == 0x3f800000) { /* 1.0 */ + } else if (instr->operands[!i].constantValue() == 0x3f800000 && + !block.fp_mode.must_flush_denorms32) { /* 1.0 */ ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp()); } else { continue; @@ -964,10 +993,15 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) case aco_opcode::s_add_u32: ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get()); break; + case aco_opcode::s_and_b64: + if (instr->operands[1].isFixed() && instr->operands[1].physReg() == exec && + instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_uniform_bool()) { + ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].temp); + } + /* fallthrough */ + case aco_opcode::s_and_b32: case aco_opcode::s_not_b32: case aco_opcode::s_not_b64: - case aco_opcode::s_and_b32: - case aco_opcode::s_and_b64: case aco_opcode::s_or_b32: case aco_opcode::s_or_b64: case aco_opcode::s_xor_b32: @@ -1007,6 +1041,13 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) case aco_opcode::v_cmp_nlt_f32: ctx.info[instr->definitions[0].tempId()].set_fcmp(instr.get()); break; + case aco_opcode::s_cselect_b64: + if (instr->operands[0].constantEquals((unsigned) -1) && + instr->operands[1].constantEquals(0)) { + /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */ + ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp()); + } + break; default: break; } @@ -1852,7 +1893,7 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) } } -bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) +bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr& instr) { /* check if we could apply omod on predecessor */ if (instr->opcode == aco_opcode::v_mul_f32) { @@ -1919,18 +1960,21 @@ bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) } } + /* omod has no effect if denormals are enabled */ + bool can_use_omod = block.fp_mode.denorm32 == 0; + /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */ if (!instr->definitions.empty() && ctx.uses[instr->definitions[0].tempId()] == 1 && can_use_VOP3(instr) && instr_info.can_use_output_modifiers[(int)instr->opcode]) { - if(ctx.info[instr->definitions[0].tempId()].is_omod2()) { + if (can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod2()) { to_VOP3(ctx, instr); static_cast(instr.get())->omod = 1; ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); - } else if (ctx.info[instr->definitions[0].tempId()].is_omod4()) { + } else if (can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod4()) { to_VOP3(ctx, instr); static_cast(instr.get())->omod = 2; ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); - } else if (ctx.info[instr->definitions[0].tempId()].is_omod5()) { + } else if (can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod5()) { to_VOP3(ctx, instr); static_cast(instr.get())->omod = 3; ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); @@ -1947,7 +1991,7 @@ bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) // TODO: we could possibly move the whole label_instruction pass to combine_instruction: // this would mean that we'd have to fix the instruction uses while value propagation -void combine_instruction(opt_ctx &ctx, aco_ptr& instr) +void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) { if (instr->definitions.empty() || !ctx.uses[instr->definitions[0].tempId()]) return; @@ -1955,7 +1999,7 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) if (instr->isVALU()) { if (can_apply_sgprs(instr)) apply_sgprs(ctx, instr); - if (apply_omod_clamp(ctx, instr)) + if (apply_omod_clamp(ctx, block, instr)) return; } @@ -2008,9 +2052,11 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) return; } /* combine mul+add -> mad */ - else if (instr->opcode == aco_opcode::v_add_f32 || - instr->opcode == aco_opcode::v_sub_f32 || - instr->opcode == aco_opcode::v_subrev_f32) { + else if ((instr->opcode == aco_opcode::v_add_f32 || + instr->opcode == aco_opcode::v_sub_f32 || + instr->opcode == aco_opcode::v_subrev_f32) && + block.fp_mode.denorm32 == 0 && !block.fp_mode.preserve_signed_zero_inf_nan32) { + //TODO: we could use fma instead when denormals are enabled if the NIR isn't marked as precise uint32_t uses_src0 = UINT32_MAX; uint32_t uses_src1 = UINT32_MAX; @@ -2354,7 +2400,7 @@ void optimize(Program* program) /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */ for (Block& block : program->blocks) { for (aco_ptr& instr : block.instructions) - label_instruction(ctx, instr); + label_instruction(ctx, block, instr); } ctx.uses = std::move(dead_code_analysis(program)); @@ -2362,7 +2408,7 @@ void optimize(Program* program) /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */ for (Block& block : program->blocks) { for (aco_ptr& instr : block.instructions) - combine_instruction(ctx, instr); + combine_instruction(ctx, block, instr); } /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */