if (instr->dest.dest.ssa.bit_size == 1) {
assert(src.regClass() == bld.lm);
assert(dst.regClass() == bld.lm);
- bld.sop2(Builder::s_andn2, Definition(dst), bld.def(s1, scc), Operand(exec, bld.lm), src);
+ /* Don't use s_andn2 here: emitting s_not and s_and separately lets the optimizer make a better decision. */
+ Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
+ bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
} else if (dst.type() == RegType::sgpr) {
} else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
//subgroupAnd(val) -> (exec & ~val) == 0
Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
- Temp all = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), bld.scc(tmp), Operand(0u));
- return bool_to_vector_condition(ctx, all);
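+ /* The scc def of s_andn2 is set iff (exec & ~val) != 0, i.e. some active lane is false; the final s_not inverts it to produce the subgroupAnd result. */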
+ Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
+ return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
} else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
//subgroupOr(val) -> (val & exec) != 0
Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
assert(dst.regClass() == bld.lm);
Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
- Temp all = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), bld.scc(tmp), Operand(0u));
- bool_to_vector_condition(ctx, emit_wqm(ctx, all), dst);
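+ /* As above, the scc def of s_andn2 is set iff any active lane is false; inverting it yields vote_all. */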
+ Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
+ bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
break;
}
case nir_intrinsic_vote_any: {
label_uniform_bool = 1 << 21,
label_constant_64bit = 1 << 22,
label_uniform_bitwise = 1 << 23,
+ label_scc_invert = 1 << 24,
};
static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success |
label_add_sub | label_bitwise | label_uniform_bitwise | label_minmax | label_fcmp;
-static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool | label_omod2 | label_omod4 | label_omod5 | label_clamp;
+static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool |
+ label_omod2 | label_omod4 | label_omod5 | label_clamp | label_scc_invert;
static constexpr uint32_t val_labels = label_constant | label_constant_64bit | label_literal | label_mad;
struct ssa_info {
return label & label_fcmp;
}
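+ /* label_scc_invert: this def holds the logical inverse of the boolean in 'temp'. */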
+ void set_scc_invert(Temp scc_inv)
+ {
+ add_label(label_scc_invert);
+ temp = scc_inv;
+ }
+
+ bool is_scc_invert()
+ {
+ return label & label_scc_invert;
+ }
+
void set_uniform_bool(Temp uniform_bool)
{
add_label(label_uniform_bool);
continue;
}
}
+
+ else if (instr->format == Format::PSEUDO_BRANCH) {
+ if (instr->operands.size() && instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
+ /* Flip the branch so it consumes the non-inverted bool, letting the inverting s_not become dead code */
+ instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z;
+ instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);
+ }
+ }
}
/* if this instruction doesn't define anything, return */
case aco_opcode::s_add_u32:
ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
break;
+ case aco_opcode::s_not_b32:
+ case aco_opcode::s_not_b64:
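+ /* Uniform bools are masks of 0 or all-ones, so the scc def of s_not is the logical inverse of the operand's boolean value; remember the original bool so a later s_cselect or branch can be flipped to consume it directly. */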
+ if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
+ ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
+ ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].temp);
+ } else if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
+ ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
+ ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
+ }
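+ /* s_not no longer shares the fallthrough below, so give it the generic bitwise label here. */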
+ ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
+ break;
case aco_opcode::s_and_b32:
case aco_opcode::s_and_b64:
if (instr->operands[1].isFixed() && instr->operands[1].physReg() == exec && instr->operands[0].isTemp()) {
}
}
/* fallthrough */
- case aco_opcode::s_not_b32:
- case aco_opcode::s_not_b64:
case aco_opcode::s_or_b32:
case aco_opcode::s_or_b64:
case aco_opcode::s_xor_b32:
/* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
}
+ if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {
+ /* Swap the cselect's data operands so the non-inverted bool can be used, making the inverting s_not dead */
+ std::swap(instr->operands[0], instr->operands[1]);
+ instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
+ }
+ break;
+ case aco_opcode::p_wqm:
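+ /* WQM is a no-op for a uniform (scc-based) bool, so the result can simply be copy-propagated from the operand. */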
+ if (instr->operands[0].isTemp() &&
+ ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
+ ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
+ }
break;
default:
break;