if (!dst.id())
dst = bld.tmp(src.regClass());
+ assert(src.size() == dst.size());
+
if (ctx->stage != fragment_fs) {
if (!dst.id())
return src;
ctx->allocated_vec.emplace(dst.id(), elems);
}
-Temp as_divergent_bool(isel_context *ctx, Temp val, bool vcc_hint)
+Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
{
- if (val.regClass() == s2) {
- return val;
- } else {
- assert(val.regClass() == s1);
- Builder bld(ctx->program, ctx->block);
- Definition& def = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2),
- Operand((uint32_t) -1), Operand(0u), bld.scc(val)).def(0);
- if (vcc_hint)
- def.setHint(vcc);
- return def.getTemp();
- }
+ Builder bld(ctx->program, ctx->block);
+ if (!dst.id())
+ dst = bld.tmp(s2);
+
+ assert(val.regClass() == s1);
+ assert(dst.regClass() == s2);
+
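+ /* expand the SCC bit into a full lane mask: all-ones when val is set, zero otherwise */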
+ return bld.sop2(aco_opcode::s_cselect_b64, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
}
-Temp as_uniform_bool(isel_context *ctx, Temp val)
+Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
{
- if (val.regClass() == s1) {
- return val;
- } else {
- assert(val.regClass() == s2);
- Builder bld(ctx->program, ctx->block);
- /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
- Temp tmp = bld.tmp(s1);
- bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(Definition(tmp)), val, Operand(exec, s2)).def(1).getTemp();
- return emit_wqm(ctx, tmp);
- }
+ Builder bld(ctx->program, ctx->block);
+ if (!dst.id())
+ dst = bld.tmp(s1);
+
+ assert(val.regClass() == s2);
+ assert(dst.regClass() == s1);
+
+ /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
+ Temp tmp = bld.tmp(s1);
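+ /* the SCC result of the s_and with exec is set iff any active lane of val is set */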
+ bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(Definition(tmp)), val, Operand(exec, s2));
+ return emit_wqm(ctx, tmp, dst);
}
Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
src1 = as_vgpr(ctx, src1);
}
}
+
Builder bld(ctx->program, ctx->block);
- bld.vopc(op, Definition(dst), src0, src1).def(0).setHint(vcc);
+ bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
}
-void emit_comparison(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
+void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
- if (dst.regClass() == s2) {
- emit_vopc_instruction(ctx, instr, op, dst);
- if (!ctx->divergent_vals[instr->dest.dest.ssa.index])
- emit_split_vector(ctx, dst, 2);
- } else if (dst.regClass() == s1) {
- Temp src0 = get_alu_src(ctx, instr->src[0]);
- Temp src1 = get_alu_src(ctx, instr->src[1]);
- assert(src0.type() == RegType::sgpr && src1.type() == RegType::sgpr);
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = get_alu_src(ctx, instr->src[1]);
- Builder bld(ctx->program, ctx->block);
- bld.sopc(op, bld.scc(Definition(dst)), src0, src1);
+ assert(dst.regClass() == s2);
+ assert(src0.type() == RegType::sgpr);
+ assert(src1.type() == RegType::sgpr);
- } else {
- assert(false);
- }
+ Builder bld(ctx->program, ctx->block);
+ /* Emit the SALU comparison instruction */
+ Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
+ /* Turn the result into a per-lane bool */
+ bool_to_vector_condition(ctx, cmp, dst);
+}
+
+void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
+ aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::last_opcode, aco_opcode s64_op = aco_opcode::last_opcode)
+{
+ aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op;
+ aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op;
+ bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
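+ /* use the per-lane VALU comparison when there is no SALU opcode, the result is divergent, or a source is in a VGPR */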
+ bool use_valu = s_op == aco_opcode::last_opcode ||
+ divergent_vals ||
+ ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
+ ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
+ aco_opcode op = use_valu ? v_op : s_op;
+ assert(op != aco_opcode::last_opcode);
+
+ if (use_valu)
+ emit_vopc_instruction(ctx, instr, op, dst);
+ else
+ emit_sopc_instruction(ctx, instr, op, dst);
}
void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op32, aco_opcode op64, Temp dst)
Builder bld(ctx->program, ctx->block);
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = get_alu_src(ctx, instr->src[1]);
- if (dst.regClass() == s2) {
- bld.sop2(op64, Definition(dst), bld.def(s1, scc),
- as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
- } else {
- assert(dst.regClass() == s1);
- bld.sop2(op32, bld.def(s1), bld.scc(Definition(dst)),
- as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
- }
-}
+ assert(dst.regClass() == s2);
+ assert(src0.regClass() == s2);
+ assert(src1.regClass() == s2);
+
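+ /* booleans are stored as 64-bit lane masks, so only the 64-bit opcode is used here */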
+ bld.sop2(op64, Definition(dst), bld.def(s1, scc), src0, src1);
+}
void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
{
Temp then = get_alu_src(ctx, instr->src[1]);
Temp els = get_alu_src(ctx, instr->src[2]);
- if (dst.type() == RegType::vgpr) {
- cond = as_divergent_bool(ctx, cond, true);
+ assert(cond.regClass() == s2);
+ if (dst.type() == RegType::vgpr) {
aco_ptr<Instruction> bcsel;
if (dst.size() == 1) {
then = as_vgpr(ctx, then);
return;
}
- if (instr->dest.dest.ssa.bit_size != 1) { /* uniform condition and values in sgpr */
+ if (instr->dest.dest.ssa.bit_size == 1) {
+ assert(dst.regClass() == s2);
+ assert(then.regClass() == s2);
+ assert(els.regClass() == s2);
+ }
+
+ if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
if (dst.regClass() == s1 || dst.regClass() == s2) {
assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
- bld.sop2(op, Definition(dst), then, els, bld.scc(as_uniform_bool(ctx, cond)));
+ bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
} else {
fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
nir_print_instr(&instr->instr, stderr);
return;
}
- /* boolean bcsel */
- assert(instr->dest.dest.ssa.bit_size == 1);
-
- if (dst.regClass() == s1)
- cond = as_uniform_bool(ctx, cond);
-
- if (cond.regClass() == s1) { /* uniform selection */
- aco_opcode op;
- if (dst.regClass() == s2) {
- op = aco_opcode::s_cselect_b64;
- then = as_divergent_bool(ctx, then, false);
- els = as_divergent_bool(ctx, els, false);
- } else {
- assert(dst.regClass() == s1);
- op = aco_opcode::s_cselect_b32;
- then = as_uniform_bool(ctx, then);
- els = as_uniform_bool(ctx, els);
- }
- bld.sop2(op, Definition(dst), then, els, bld.scc(cond));
- return;
- }
-
 /* divergent boolean bcsel
 * this implements bcsel on bools: dst = s0 ? s1 : s2
 * which is computed as: dst = (s0 & s1) | (~s0 & s2) */
- assert (dst.regClass() == s2);
- then = as_divergent_bool(ctx, then, false);
- els = as_divergent_bool(ctx, els, false);
+ assert(instr->dest.dest.ssa.bit_size == 1);
if (cond.id() != then.id())
then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
}
case nir_op_inot: {
Temp src = get_alu_src(ctx, instr->src[0]);
- /* uniform booleans */
- if (instr->dest.dest.ssa.bit_size == 1 && dst.regClass() == s1) {
- if (src.regClass() == s1) {
- /* in this case, src is either 1 or 0 */
- bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.scc(Definition(dst)), Operand(1u), src);
- } else {
- /* src is either exec_mask or 0 */
- assert(src.regClass() == s2);
- bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(Definition(dst)), Operand(0u), src);
- }
+ if (instr->dest.dest.ssa.bit_size == 1) {
+ assert(src.regClass() == s2);
+ assert(dst.regClass() == s2);
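+ /* invert only the active lanes: dst = exec & ~src */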
+ bld.sop2(aco_opcode::s_andn2_b64, Definition(dst), bld.def(s1, scc), Operand(exec, s2), src);
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
} else if (dst.type() == RegType::sgpr) {
}
case nir_op_b2f32: {
Temp src = get_alu_src(ctx, instr->src[0]);
+ assert(src.regClass() == s2);
+
if (dst.regClass() == s1) {
- src = as_uniform_bool(ctx, src);
+ src = bool_to_scalar_condition(ctx, src);
bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
} else if (dst.regClass() == v1) {
- bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u),
- as_divergent_bool(ctx, src, true));
+ bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
} else {
unreachable("Wrong destination register class for nir_op_b2f32.");
}
}
case nir_op_b2f64: {
Temp src = get_alu_src(ctx, instr->src[0]);
+ assert(src.regClass() == s2);
+
if (dst.regClass() == s2) {
- src = as_uniform_bool(ctx, src);
+ src = bool_to_scalar_condition(ctx, src);
bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
} else if (dst.regClass() == v2) {
Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v2), Operand(0x3FF00000u));
- Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one,
- as_divergent_bool(ctx, src, true));
+ Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
} else {
unreachable("Wrong destination register class for nir_op_b2f64.");
}
case nir_op_b2i32: {
Temp src = get_alu_src(ctx, instr->src[0]);
+ assert(src.regClass() == s2);
+
if (dst.regClass() == s1) {
- if (src.regClass() == s1) {
- bld.copy(Definition(dst), src);
- } else {
- // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
- assert(src.regClass() == s2);
- bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(Definition(dst)), Operand(0u), src);
- }
- } else {
- assert(dst.regClass() == v1 && src.regClass() == s2);
+ // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
+ bool_to_scalar_condition(ctx, src, dst);
+ } else if (dst.regClass() == v1) {
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
+ } else {
+ unreachable("Invalid register class for b2i32");
}
break;
}
case nir_op_i2b1: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (dst.regClass() == s2) {
+ assert(dst.regClass() == s2);
+
+ if (src.type() == RegType::vgpr) {
assert(src.regClass() == v1 || src.regClass() == v2);
bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
Definition(dst), Operand(0u), src).def(0).setHint(vcc);
} else {
- assert(src.regClass() == s1 && dst.regClass() == s1);
- bld.sopc(aco_opcode::s_cmp_lg_u32, bld.scc(Definition(dst)), Operand(0u), src);
+ assert(src.regClass() == s1 || src.regClass() == s2);
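+ /* compare the scalar source with zero, then expand SCC into a per-lane mask */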
+ Temp tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
+ bld.scc(bld.def(s1)), Operand(0u), src);
+ bool_to_vector_condition(ctx, tmp, dst);
}
break;
}
break;
}
case nir_op_flt: {
- if (instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f32, dst);
- else if (instr->src[0].src.ssa->bit_size == 64)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f64, dst);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
break;
}
case nir_op_fge: {
- if (instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f32, dst);
- else if (instr->src[0].src.ssa->bit_size == 64)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f64, dst);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
break;
}
case nir_op_feq: {
- if (instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f32, dst);
- else if (instr->src[0].src.ssa->bit_size == 64)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f64, dst);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
break;
}
case nir_op_fne: {
- if (instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f32, dst);
- else if (instr->src[0].src.ssa->bit_size == 64)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f64, dst);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
break;
}
case nir_op_ilt: {
- if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i32, dst);
- else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_i32, dst);
- else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i64, dst);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
break;
}
case nir_op_ige: {
- if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i32, dst);
- else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_i32, dst);
- else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i64, dst);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
break;
}
case nir_op_ieq: {
- if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
- emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i32, dst);
- } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
- emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_i32, dst);
- } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
- emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i64, dst);
- } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
- emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_u64, dst);
- } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
- Temp src0 = get_alu_src(ctx, instr->src[0]);
- Temp src1 = get_alu_src(ctx, instr->src[1]);
- bld.sopc(aco_opcode::s_cmp_eq_i32, bld.scc(Definition(dst)),
- as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
- } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
- Temp src0 = get_alu_src(ctx, instr->src[0]);
- Temp src1 = get_alu_src(ctx, instr->src[1]);
- bld.sop2(aco_opcode::s_xnor_b64, Definition(dst), bld.def(s1, scc),
- as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
- } else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
- }
+ if (instr->src[0].src.ssa->bit_size == 1)
+ emit_boolean_logic(ctx, instr, aco_opcode::s_xnor_b32, aco_opcode::s_xnor_b64, dst);
+ else
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, aco_opcode::s_cmp_eq_u64);
break;
}
case nir_op_ine: {
- if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
- emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i32, dst);
- } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
- emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i64, dst);
- } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
- emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_i32, dst);
- } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
- emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_u64, dst);
- } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
- Temp src0 = get_alu_src(ctx, instr->src[0]);
- Temp src1 = get_alu_src(ctx, instr->src[1]);
- bld.sopc(aco_opcode::s_cmp_lg_i32, bld.scc(Definition(dst)),
- as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
- } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
- Temp src0 = get_alu_src(ctx, instr->src[0]);
- Temp src1 = get_alu_src(ctx, instr->src[1]);
- bld.sop2(aco_opcode::s_xor_b64, Definition(dst), bld.def(s1, scc),
- as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
- } else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
- }
+ if (instr->src[0].src.ssa->bit_size == 1)
+ emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::s_xor_b64, dst);
+ else
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, aco_opcode::s_cmp_lg_u64);
break;
}
case nir_op_ult: {
- if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u32, dst);
- else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_u32, dst);
- else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u64, dst);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
break;
}
case nir_op_uge: {
- if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u32, dst);
- else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
- emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_u32, dst);
- else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
- emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u64, dst);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
break;
}
case nir_op_fddx:
assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
assert(dst.type() == RegType::sgpr);
- if (dst.size() == 1)
- {
- Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(instr->value[0].u32));
+ Builder bld(ctx->program, ctx->block);
+
+ if (instr->def.bit_size == 1) {
+ assert(dst.regClass() == s2);
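+ /* boolean constants are materialized as full lane masks: -1 for true, 0 for false */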
+ bld.sop1(aco_opcode::s_mov_b64, Definition(dst), Operand((uint64_t)(instr->value[0].b ? -1 : 0)));
+ } else if (dst.size() == 1) {
+ bld.copy(Definition(dst), Operand(instr->value[0].u32));
} else {
assert(dst.size() != 1);
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
// TODO: optimize uniform conditions
Builder bld(ctx->program, ctx->block);
- Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+ assert(src.regClass() == s2);
src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
bld.pseudo(aco_opcode::p_discard_if, src);
ctx->block->kind |= block_kind_uses_discard_if;
} else if (op == nir_op_iand && cluster_size == 64) {
//subgroupAnd(val) -> (exec & ~val) == 0
Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
- return bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), tmp, Operand(0u));
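+ //SCC of the s_andn2 is set iff some active lane was false: select 0 then, the full mask otherwise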
+ return bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp));
} else if (op == nir_op_ior && cluster_size == 64) {
//subgroupOr(val) -> (val & exec) != 0
- return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
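+ //the SCC def of the s_and is set iff any active lane is set; expand it back into a lane mask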
+ Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
+ return bool_to_vector_condition(ctx, tmp);
} else if (op == nir_op_ixor && cluster_size == 64) {
//subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s2), bld.def(s1, scc), tmp);
- return bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
+ tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
+ return bool_to_vector_condition(ctx, tmp);
} else {
//subgroupClustered{And,Or,Xor}(val, n) ->
//lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0))
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
if (src.regClass().type() == RegType::vgpr) {
bld.pseudo(aco_opcode::p_as_uniform, dst, src);
- } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
- bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(dst), Operand(0u), Operand(src));
} else if (src.regClass() == s1) {
bld.sop1(aco_opcode::s_mov_b32, dst, src);
} else if (src.regClass() == s2) {
case nir_intrinsic_ballot: {
Definition tmp = bld.def(s2);
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s2) {
+ if (instr->src[0].ssa->bit_size == 1) {
+ assert(src.regClass() == s2);
bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
- } else if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s1) {
- bld.sop2(aco_opcode::s_cselect_b64, tmp, Operand(exec, s2), Operand(0u), bld.scc(src));
} else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
} else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
- } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2 && tid.regClass() == s1) {
- emit_wqm(ctx, bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, tid), dst);
- } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
+ } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
+ assert(src.regClass() == s2);
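+ /* s_bitcmp1_b64 sets SCC to the bit of src selected by the uniform lane id */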
+ Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, tid);
+ bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
+ } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
+ assert(src.regClass() == s2);
Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
tmp = emit_extract_vector(ctx, tmp, 0, v1);
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
- } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
- emit_wqm(ctx,
- bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
- bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2))),
- dst);
+ } else if (instr->dest.ssa.bit_size == 1) {
+ assert(src.regClass() == s2);
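+ /* test the bit belonging to the first active lane (found with s_ff1_i32_b64 on exec) */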
+ Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
+ bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)));
+ bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
} else if (src.regClass() == s1) {
bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
} else if (src.regClass() == s2) {
break;
}
case nir_intrinsic_vote_all: {
- Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
assert(src.regClass() == s2);
- assert(dst.regClass() == s1);
+ assert(dst.regClass() == s2);
- Definition tmp = bld.def(s1);
- bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(tmp),
- bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)),
- Operand(exec, s2));
- emit_wqm(ctx, tmp.getTemp(), dst);
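+ /* all active lanes are set iff (exec & ~src) == 0, i.e. the SCC of the s_andn2 is zero */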
+ Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
+ Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp));
+ emit_wqm(ctx, val, dst);
break;
}
case nir_intrinsic_vote_any: {
- Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
assert(src.regClass() == s2);
- assert(dst.regClass() == s1);
+ assert(dst.regClass() == s2);
- Definition tmp = bld.def(s1);
- bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(tmp), src, Operand(exec, s2));
- emit_wqm(ctx, tmp.getTemp(), dst);
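+ /* any active lane is set iff (exec & src) != 0, i.e. the SCC of the s_and is set */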
+ Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
+ Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(-1u), Operand(0u), bld.scc(tmp));
+ emit_wqm(ctx, val, dst);
break;
}
case nir_intrinsic_reduce:
} else {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
- if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
+ if (instr->dest.ssa.bit_size == 1) {
+ assert(src.regClass() == s2);
uint32_t half_mask = 0x11111111u << lane;
Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
Temp tmp = bld.tmp(s2);
}
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
+ if (instr->dest.ssa.bit_size == 1) {
+ assert(src.regClass() == s2);
src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src);
ctx->program->needs_exact = true;
break;
case nir_intrinsic_demote_if: {
- Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
- as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false),
- Operand(exec, s2));
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+ assert(src.regClass() == s2);
+ Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
bld.pseudo(aco_opcode::p_demote_to_helper, cond);
ctx->block->kind |= block_kind_uses_demote;
ctx->program->needs_exact = true;
Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
bld.scc(compare_cube_wa));
}
- tg4_compare_cube_wa64 = as_divergent_bool(ctx, compare_cube_wa, true);
+ tg4_compare_cube_wa64 = bld.tmp(s2);
+ bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
+
nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
aco_ptr<Pseudo_instruction> phi;
unsigned num_src = exec_list_length(&instr->srcs);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == s2);
aco_opcode opcode = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index] ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
}
/* try to scalarize vector phis */
- if (dst.size() > 1) {
+ if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
// TODO: scalarize linear phis on divergent ifs
bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
std::array<Temp, 4> new_vec;
ctx->block->kind |= block_kind_uniform;
/* emit branch */
- if (cond.regClass() == s2) {
- // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
- cond = as_uniform_bool(ctx, cond);
- }
+ assert(cond.regClass() == s2);
+ // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
+ cond = bool_to_scalar_condition(ctx, cond);
+
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
branch->operands[0] = Operand(cond);
branch->operands[0].setFixed(scc);