label_bitwise = 1 << 18,
label_minmax = 1 << 19,
label_fcmp = 1 << 20,
+ label_uniform_bool = 1 << 21,
};
static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success | label_add_sub | label_bitwise | label_minmax | label_fcmp;
-static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f;
+static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool;
static constexpr uint32_t val_labels = label_constant | label_literal | label_mad;
struct ssa_info {
return label & label_fcmp;
}
+ /* Marks this SSA value as a "uniform boolean": a scalar mask produced by
+  * s_cselect_b64(-1, 0, cond) from a uniform condition. The condition temp is
+  * stored in `temp` so consumers (e.g. s_and_b64 with exec) can substitute it
+  * for the full mask. */
+ void set_uniform_bool(Temp uniform_bool)
+ {
+ add_label(label_uniform_bool);
+ temp = uniform_bool;
+ }
+
+ /* Whether this value carries the label_uniform_bool label set by set_uniform_bool(). */
+ bool is_uniform_bool()
+ {
+ return label & label_uniform_bool;
+ }
+
};
struct opt_ctx {
bool valu_can_accept_literal(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned operand)
{
+ /* instructions like v_cndmask_b32 can't take a literal because they always
+ * read SGPRs */
+ if (instr->operands.size() >= 3 &&
+ instr->operands[2].isTemp() && instr->operands[2].regClass().type() == RegType::sgpr)
+ return false;
+
+ /* otherwise only src0 of a plain (non-SDWA/DPP/VOP3) encoding may hold a literal */
// TODO: VOP3 can take a literal on GFX10
return !instr->isSDWA() && !instr->isDPP() && !instr->isVOP3() &&
operand == 0 && can_accept_constant(instr, operand);
}
+/* Returns false when the given source operand must not be replaced by a VGPR.
+ * For v_readlane_b32/v_writelane_b32, operand 1 (presumably the lane select,
+ * which the ISA requires to be scalar — confirm against the ISA reference) is
+ * excluded from VGPR copy-propagation; everything else is allowed. */
+bool valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
+{
+ if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32)
+ return operand != 1;
+ return true;
+}
+
bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset)
{
Operand op = instr->operands[op_index];
return false;
}
+ if (add_instr->usesModifiers())
+ return false;
+
for (unsigned i = 0; i < 2; i++) {
if (add_instr->operands[i].isConstant()) {
*offset = add_instr->operands[i].constantValue();
return false;
}
-void label_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
+void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
if (instr->isSALU() || instr->isVALU() || instr->format == Format::PSEUDO) {
ASSERTED bool all_const = false;
/* VALU: propagate neg, abs & inline constants */
else if (instr->isVALU()) {
- if (info.is_temp() && info.temp.type() == RegType::vgpr) {
+ if (info.is_temp() && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) {
instr->operands[i].setTemp(info.temp);
info = ctx.info[info.temp.id()];
}
Temp base;
uint32_t offset;
if (i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == instr->operands[i].regClass()) {
- if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32) {
+ if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 ||
+ instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) {
if (offset % 4 == 0 &&
ds->offset0 + (offset >> 2) <= 255 &&
ds->offset1 + (offset >> 2) <= 255) {
new_instr->operands.back() = Operand(base);
if (!smem->definitions.empty())
new_instr->definitions[0] = smem->definitions[0];
+ new_instr->can_reorder = smem->can_reorder;
+ new_instr->barrier = smem->barrier;
instr.reset(new_instr);
smem = static_cast<SMEM_instruction *>(instr.get());
}
if (vec_op.isConstant()) {
if (vec_op.isLiteral())
ctx.info[instr->definitions[i].tempId()].set_literal(vec_op.constantValue());
- else
+ else if (vec_op.size() == 1)
ctx.info[instr->definitions[i].tempId()].set_constant(vec_op.constantValue());
} else {
assert(vec_op.isTemp());
if (vec_op.isConstant()) {
if (vec_op.isLiteral())
ctx.info[instr->definitions[0].tempId()].set_literal(vec_op.constantValue());
- else
+ else if (vec_op.size() == 1)
ctx.info[instr->definitions[0].tempId()].set_constant(vec_op.constantValue());
} else {
assert(vec_op.isTemp());
case aco_opcode::p_as_uniform:
if (instr->definitions[0].isFixed()) {
/* don't copy-propagate copies into fixed registers */
+ } else if (instr->usesModifiers()) {
+ // TODO
} else if (instr->operands[0].isConstant()) {
if (instr->operands[0].isLiteral())
ctx.info[instr->definitions[0].tempId()].set_literal(instr->operands[0].constantValue());
- else
+ else if (instr->operands[0].size() == 1)
ctx.info[instr->definitions[0].tempId()].set_constant(instr->operands[0].constantValue());
- } else if (instr->isDPP()) {
- // TODO
} else if (instr->operands[0].isTemp()) {
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
} else {
}
case aco_opcode::v_mul_f32: { /* omod */
/* TODO: try to move the negate/abs modifier to the consumer instead */
- if (instr->isVOP3()) {
- VOP3A_instruction *vop3 = static_cast<VOP3A_instruction*>(instr.get());
- if (vop3->abs[0] || vop3->abs[1] || vop3->neg[0] || vop3->neg[1] || vop3->omod || vop3->clamp)
- break;
- }
+ if (instr->usesModifiers())
+ break;
for (unsigned i = 0; i < 2; i++) {
if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
ctx.info[instr->operands[i].tempId()].set_omod4();
} else if (instr->operands[!i].constantValue() == 0x3f000000) { /* 0.5 */
ctx.info[instr->operands[i].tempId()].set_omod5();
- } else if (instr->operands[!i].constantValue() == 0x3f800000) { /* 1.0 */
+ } else if (instr->operands[!i].constantValue() == 0x3f800000 &&
+ !block.fp_mode.must_flush_denorms32) { /* 1.0 */
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp());
} else {
continue;
case aco_opcode::s_add_u32:
ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
break;
+ case aco_opcode::s_and_b64:
+ if (instr->operands[1].isFixed() && instr->operands[1].physReg() == exec &&
+ instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
+ ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].temp);
+ }
+ /* fallthrough */
+ case aco_opcode::s_and_b32:
case aco_opcode::s_not_b32:
case aco_opcode::s_not_b64:
- case aco_opcode::s_and_b32:
- case aco_opcode::s_and_b64:
case aco_opcode::s_or_b32:
case aco_opcode::s_or_b64:
case aco_opcode::s_xor_b32:
case aco_opcode::v_cmp_nlt_f32:
ctx.info[instr->definitions[0].tempId()].set_fcmp(instr.get());
break;
+ case aco_opcode::s_cselect_b64:
+ if (instr->operands[0].constantEquals((unsigned) -1) &&
+ instr->operands[1].constantEquals(0)) {
+ /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
+ ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
+ }
+ break;
default:
break;
}
}
}
-bool apply_omod_clamp(opt_ctx &ctx, aco_ptr<Instruction>& instr)
+bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
/* check if we could apply omod on predecessor */
if (instr->opcode == aco_opcode::v_mul_f32) {
}
}
+ /* omod has no effect if denormals are enabled */
+ bool can_use_omod = block.fp_mode.denorm32 == 0;
+
/* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
if (!instr->definitions.empty() && ctx.uses[instr->definitions[0].tempId()] == 1 &&
can_use_VOP3(instr) && instr_info.can_use_output_modifiers[(int)instr->opcode]) {
- if(ctx.info[instr->definitions[0].tempId()].is_omod2()) {
+ if (can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod2()) {
to_VOP3(ctx, instr);
static_cast<VOP3A_instruction*>(instr.get())->omod = 1;
ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
- } else if (ctx.info[instr->definitions[0].tempId()].is_omod4()) {
+ } else if (can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod4()) {
to_VOP3(ctx, instr);
static_cast<VOP3A_instruction*>(instr.get())->omod = 2;
ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
- } else if (ctx.info[instr->definitions[0].tempId()].is_omod5()) {
+ } else if (can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod5()) {
to_VOP3(ctx, instr);
static_cast<VOP3A_instruction*>(instr.get())->omod = 3;
ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get());
// TODO: we could possibly move the whole label_instruction pass to combine_instruction:
// this would mean that we'd have to fix the instruction uses while value propagation
-void combine_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
+void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
{
if (instr->definitions.empty() || !ctx.uses[instr->definitions[0].tempId()])
return;
if (instr->isVALU()) {
if (can_apply_sgprs(instr))
apply_sgprs(ctx, instr);
- if (apply_omod_clamp(ctx, instr))
+ if (apply_omod_clamp(ctx, block, instr))
return;
}
return;
}
/* combine mul+add -> mad */
- else if (instr->opcode == aco_opcode::v_add_f32 ||
- instr->opcode == aco_opcode::v_sub_f32 ||
- instr->opcode == aco_opcode::v_subrev_f32) {
+ else if ((instr->opcode == aco_opcode::v_add_f32 ||
+ instr->opcode == aco_opcode::v_sub_f32 ||
+ instr->opcode == aco_opcode::v_subrev_f32) &&
+ block.fp_mode.denorm32 == 0 && !block.fp_mode.preserve_signed_zero_inf_nan32) {
+ //TODO: we could use fma instead when denormals are enabled if the NIR isn't marked as precise
uint32_t uses_src0 = UINT32_MAX;
uint32_t uses_src1 = UINT32_MAX;
/* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
for (Block& block : program->blocks) {
for (aco_ptr<Instruction>& instr : block.instructions)
- label_instruction(ctx, instr);
+ label_instruction(ctx, block, instr);
}
ctx.uses = std::move(dead_code_analysis(program));
/* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
for (Block& block : program->blocks) {
for (aco_ptr<Instruction>& instr : block.instructions)
- combine_instruction(ctx, instr);
+ combine_instruction(ctx, block, instr);
}
/* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */