{
aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
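/* for bit sizes other than 32/64 (e.g. 16-bit) there is no SALU opcode, so s_op stays num_opcodes and use_valu below forces the VALU form */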
- bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
bool use_valu = s_op == aco_opcode::num_opcodes ||
- divergent_vals ||
+ nir_dest_is_divergent(instr->dest.dest) ||
ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
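/* nir_dest_is_divergent/nir_src_is_divergent query NIR's divergence analysis; a divergent result or VGPR operands force the VALU opcode */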
aco_opcode op = use_valu ? v_op : s_op;
if (dst.type() == RegType::vgpr) {
aco_ptr<Instruction> bcsel;
- if (dst.regClass() == v2b) {
- then = as_vgpr(ctx, then);
- els = as_vgpr(ctx, els);
-
- Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), els, then, cond);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
- } else if (dst.regClass() == v1) {
+ if (dst.size() == 1) {
then = as_vgpr(ctx, then);
els = as_vgpr(ctx, els);
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
- } else if (dst.regClass() == v2) {
+ } else if (dst.size() == 2) {
Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
assert(els.regClass() == bld.lm);
}
- if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
+ if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
if (dst.regClass() == s1 || dst.regClass() == s2) {
assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
assert(dst.size() == then.size());
bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
/* Extract the exponent and compute the unbiased value. */
- Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f64, bld.def(v1), val);
+ Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
+ exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
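/* the exponent of an IEEE double sits in bits 20..30 of the high dword and is biased by 1023, hence the 11-bit BFE and the subtraction */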
/* Extract the fractional part. */
Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
/* Get the sign bit. */
- Temp sign = bld.vop2(aco_opcode::v_ashr_i32, bld.def(v1), Operand(31u), val_hi);
+ Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);
/* Decide the operation to apply depending on the unbiased exponent. */
Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
if (dst.regClass() == v2b) {
- Temp tmp = bld.tmp(v1);
- emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, tmp, true);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
} else if (dst.regClass() == v2) {
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
if (dst.regClass() == v2b) {
- Temp tmp = bld.tmp(v1);
- emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, tmp, true);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
} else if (dst.regClass() == v2) {
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = get_alu_src(ctx, instr->src[1]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.tmp(v1);
if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
- emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, tmp, false);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
else
- emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, tmp, true);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
} else if (dst.regClass() == v1) {
if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
if (dst.regClass() == v2b) {
// TODO: check fp_mode.must_flush_denorms16_64
- Temp tmp = bld.tmp(v1);
- emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, tmp, true);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
} else if (dst.regClass() == v2) {
Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
if (dst.regClass() == v2b) {
// TODO: check fp_mode.must_flush_denorms16_64
- Temp tmp = bld.tmp(v1);
- emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, tmp, true);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
} else if (dst.regClass() == v2) {
}
case nir_op_fmax3: {
if (dst.regClass() == v2b) {
- Temp tmp = bld.tmp(v1);
- emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, tmp, false);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false);
} else if (dst.regClass() == v1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
}
case nir_op_fmin3: {
if (dst.regClass() == v2b) {
- Temp tmp = bld.tmp(v1);
- emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, tmp, false);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false);
} else if (dst.regClass() == v1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
}
case nir_op_fmed3: {
if (dst.regClass() == v2b) {
- Temp tmp = bld.tmp(v1);
- emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, tmp, false);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false);
} else if (dst.regClass() == v1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
case nir_op_frsq: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop1(aco_opcode::v_rsq_f16, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
} else if (dst.regClass() == v1) {
emit_rsq(ctx, bld, Definition(dst), src);
} else if (dst.regClass() == v2) {
case nir_op_fneg: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x8000u), as_vgpr(ctx, src));
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
} else if (dst.regClass() == v1) {
if (ctx->block->fp_mode.must_flush_denorms32)
src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
case nir_op_fabs: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFu), as_vgpr(ctx, src));
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
} else if (dst.regClass() == v1) {
if (ctx->block->fp_mode.must_flush_denorms32)
src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
case nir_op_fsat: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop3(aco_opcode::v_med3_f16, bld.def(v1), Operand(0u), Operand(0x3f800000u), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand(0u), Operand(0x3c00u), src);
} else if (dst.regClass() == v1) {
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
/* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
case nir_op_flog2: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop1(aco_opcode::v_log_f16, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
} else if (dst.regClass() == v1) {
emit_log2(ctx, bld, Definition(dst), src);
} else {
case nir_op_frcp: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop1(aco_opcode::v_rcp_f16, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
} else if (dst.regClass() == v1) {
emit_rcp(ctx, bld, Definition(dst), src);
} else if (dst.regClass() == v2) {
}
case nir_op_fexp2: {
if (dst.regClass() == v2b) {
- Temp src = get_alu_src(ctx, instr->src[0]);
- Temp tmp = bld.vop1(aco_opcode::v_exp_f16, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
} else {
case nir_op_fsqrt: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop1(aco_opcode::v_sqrt_f16, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
} else if (dst.regClass() == v1) {
emit_sqrt(ctx, bld, Definition(dst), src);
} else if (dst.regClass() == v2) {
}
case nir_op_ffract: {
if (dst.regClass() == v2b) {
- Temp src = get_alu_src(ctx, instr->src[0]);
- Temp tmp = bld.vop1(aco_opcode::v_fract_f16, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
} else if (dst.regClass() == v2) {
case nir_op_ffloor: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop1(aco_opcode::v_floor_f16, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
} else if (dst.regClass() == v2) {
case nir_op_fceil: {
Temp src0 = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop1(aco_opcode::v_ceil_f16, bld.def(v1), src0);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
} else if (dst.regClass() == v2) {
case nir_op_ftrunc: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop1(aco_opcode::v_trunc_f16, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
} else if (dst.regClass() == v2) {
case nir_op_fround_even: {
Temp src0 = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop1(aco_opcode::v_rndne_f16, bld.def(v1), src0);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
} else if (dst.regClass() == v2) {
if (dst.regClass() == v2b) {
Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
- tmp = bld.vop1(opcode, bld.def(v1), tmp);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ bld.vop1(opcode, Definition(dst), tmp);
} else if (dst.regClass() == v1) {
Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = get_alu_src(ctx, instr->src[1]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.tmp(v1);
- emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, tmp, false);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
} else if (dst.regClass() == v1) {
bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
} else if (dst.regClass() == v2) {
case nir_op_frexp_sig: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- Temp tmp = bld.vop1(aco_opcode::v_frexp_mant_f16, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
} else if (dst.regClass() == v1) {
bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
} else if (dst.regClass() == v2) {
Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
- Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), minus_one, src, cond);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
} else if (dst.regClass() == v1) {
Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 64)
src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
- src = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
+ bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
break;
}
case nir_op_f2f16_rtz: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 64)
src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
- src = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), src, Operand(0u));
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
+ bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
break;
}
case nir_op_f2f32: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 8)
src = convert_int(bld, src, 8, 16, true);
- Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_i16, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
break;
}
case nir_op_i2f32: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 8)
src = convert_int(bld, src, 8, 16, false);
- Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v1), src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
break;
}
case nir_op_u2f32: {
bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src);
} else if (dst.regClass() == v2b) {
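/* 0x3c00 is 1.0 in IEEE half precision */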
Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
- Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
- bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src);
} else {
unreachable("Wrong destination register class for nir_op_b2f16.");
}
bool allow_combining = true, bool reorder = true, bool slc = false)
{
Builder bld(ctx->program, ctx->block);
- assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+ assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
assert(write_mask);
write_mask = widen_mask(write_mask, elem_size_bytes);
unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
{
- assert(elem_size_bytes == 4 || elem_size_bytes == 8);
- assert((num_components * elem_size_bytes / 4) == dst.size());
+ assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
+ assert((num_components * elem_size_bytes) == dst.bytes());
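/* sizes are compared in bytes so that 2-byte (16-bit) elements, which do not fill a whole dword, are accepted */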
assert(!!stride != allow_combining);
Builder bld(ctx->program, ctx->block);
Builder bld(ctx->program, ctx->block);
uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
- uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
- uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
- uint32_t output_vertex_size = num_tcs_outputs * 16;
+ uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
- uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+ uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
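/* ctx->tcs_num_outputs and ctx->tcs_num_patch_outputs are assumed to be precomputed when the isel context is set up */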
std::pair<Temp, unsigned> offs = instr
? get_intrinsic_io_basic_offset(ctx, instr, 4u)
{
Builder bld(ctx->program, ctx->block);
- unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL
- ? util_last_bit64(ctx->args->shader_info->tcs.outputs_written)
- : ctx->args->options->key.tes.tcs_num_outputs;
-
- unsigned output_vertex_size = num_tcs_outputs * 16;
+ unsigned output_vertex_size = ctx->tcs_num_outputs * 16;
unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
unsigned attr_stride = ctx->tcs_num_patches;
bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect)
{
+ assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+
if (mask == 0)
return false;
- unsigned off = nir_intrinsic_base(instr) * 4u;
+ unsigned drv_loc = nir_intrinsic_base(instr);
nir_src *off_src = nir_get_io_offset_src(instr);
if (!nir_src_is_const(*off_src)) {
}
*indirect = false;
- off += nir_src_as_uint(*off_src) * 16u;
-
- while (mask) {
- unsigned slot = u_bit_scan64(&mask) + (per_vertex ? 0 : VARYING_SLOT_PATCH0);
- if (off == shader_io_get_unique_index((gl_varying_slot) slot) * 16u)
- return true;
- }
-
- return false;
+ uint64_t slot = per_vertex
+ ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4]
+ : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0);
+ return (((uint64_t) 1) << slot) & mask;
}
bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
if (instr->src[0].ssa->bit_size == 64)
write_mask = widen_mask(write_mask, 2);
+ RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
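/* 16-bit outputs are kept as v2b temporaries rather than being widened to a full dword */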
+
for (unsigned i = 0; i < 8; ++i) {
if (write_mask & (1 << i)) {
ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
- ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1);
+ ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
}
idx++;
}
/* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS.
* GFX9+: LS is merged into HS, but still uses the same LDS layout.
*/
- unsigned num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written);
Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
- lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, num_tcs_inputs * 16u);
+ lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u);
} else {
unreachable("Invalid LS or ES stage");
}
{
Builder bld(ctx->program, ctx->block);
Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
- if (!ctx->divergent_vals[instr->dest.ssa.index])
+ if (!nir_dest_is_divergent(instr->dest))
index = bld.as_uniform(index);
unsigned desc_set = nir_intrinsic_desc_set(instr);
unsigned binding = nir_intrinsic_binding(instr);
ctx->block->instructions.emplace_back(std::move(load));
Operand sample_index4;
- if (sample_index.isConstant() && sample_index.constantValue() < 16) {
- sample_index4 = Operand(sample_index.constantValue() << 2);
+ if (sample_index.isConstant()) {
+ if (sample_index.constantValue() < 16) {
+ sample_index4 = Operand(sample_index.constantValue() << 2);
+ } else {
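/* FMASK packs 4 bits per sample, so a constant index of 16 or more is out of range; fall back to sample 0 */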
+ sample_index4 = Operand(0u);
+ }
} else if (sample_index.regClass() == s1) {
sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
} else {
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
- bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
+ bool smem = !nir_src_is_divergent(instr->src[2]) &&
ctx->options->chip_class >= GFX8 &&
elem_size_bytes >= 4;
if (smem)
case nir_intrinsic_shuffle:
case nir_intrinsic_read_invocation: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- if (!ctx->divergent_vals[instr->src[0].ssa->index]) {
+ if (!nir_src_is_divergent(instr->src[0])) {
emit_uniform_subgroup(ctx, instr, src);
} else {
Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
- if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index])
+ if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1]))
tid = bld.as_uniform(tid);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
if (src.regClass() == v1) {
nir_intrinsic_cluster_size(instr) : 0;
cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
- if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
+ if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) {
emit_uniform_subgroup(ctx, instr, src);
} else if (instr->dest.ssa.bit_size == 1) {
if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
}
case nir_intrinsic_quad_broadcast: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+ if (!nir_dest_is_divergent(instr->dest)) {
emit_uniform_subgroup(ctx, instr, src);
} else {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
case nir_intrinsic_quad_swap_diagonal:
case nir_intrinsic_quad_swizzle_amd: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+ if (!nir_dest_is_divergent(instr->dest)) {
emit_uniform_subgroup(ctx, instr, src);
break;
}
}
case nir_intrinsic_masked_swizzle_amd: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+ if (!nir_dest_is_divergent(instr->dest)) {
emit_uniform_subgroup(ctx, instr, src);
break;
}
{
Builder bld(ctx->program, ctx->block);
bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
- has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
+ has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false,
+ has_clamped_lod = false;
Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
- lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp();
+ lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(),
+ clamped_lod = Temp();
std::vector<Temp> coords;
std::vector<Temp> derivs;
nir_const_value *sample_index_cv = NULL;
break;
}
case nir_tex_src_bias:
- if (instr->op == nir_texop_txb) {
- bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
- has_bias = true;
- }
+ bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
+ has_bias = true;
break;
case nir_tex_src_lod: {
nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
}
break;
}
+ case nir_tex_src_min_lod:
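/* min_lod selects one of the *_cl (LOD-clamped) image_sample opcodes chosen further below */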
+ clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
+ has_clamped_lod = true;
+ break;
case nir_tex_src_comparator:
if (instr->is_shadow) {
compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
- Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
+ Operand default_sample = Operand(1u);
+ if (ctx->options->robust_buffer_access) {
+ /* Extract the second dword of the descriptor; if it's
+ * all zero, then it's a null descriptor.
+ */
+ Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
+ Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u));
+ default_sample = Operand(is_non_null_descriptor);
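/* a null descriptor thus reports 0 samples instead of 1 */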
+ }
+
+ Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
- samples, Operand(1u), bld.scc(is_msaa));
+ samples, default_sample, bld.scc(is_msaa));
return;
}
args.emplace_back(sample_index);
if (has_lod)
args.emplace_back(lod);
+ if (has_clamped_lod)
+ args.emplace_back(clamped_lod);
Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size()));
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
// TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
aco_opcode opcode = aco_opcode::image_sample;
if (has_offset) { /* image_sample_*_o */
- if (has_compare) {
+ if (has_clamped_lod) {
+ if (has_compare) {
+ opcode = aco_opcode::image_sample_c_cl_o;
+ if (has_derivs)
+ opcode = aco_opcode::image_sample_c_d_cl_o;
+ if (has_bias)
+ opcode = aco_opcode::image_sample_c_b_cl_o;
+ } else {
+ opcode = aco_opcode::image_sample_cl_o;
+ if (has_derivs)
+ opcode = aco_opcode::image_sample_d_cl_o;
+ if (has_bias)
+ opcode = aco_opcode::image_sample_b_cl_o;
+ }
+ } else if (has_compare) {
opcode = aco_opcode::image_sample_c_o;
if (has_derivs)
opcode = aco_opcode::image_sample_c_d_o;
if (has_lod)
opcode = aco_opcode::image_sample_l_o;
}
+ } else if (has_clamped_lod) { /* image_sample_*_cl */
+ if (has_compare) {
+ opcode = aco_opcode::image_sample_c_cl;
+ if (has_derivs)
+ opcode = aco_opcode::image_sample_c_d_cl;
+ if (has_bias)
+ opcode = aco_opcode::image_sample_c_b_cl;
+ } else {
+ opcode = aco_opcode::image_sample_cl;
+ if (has_derivs)
+ opcode = aco_opcode::image_sample_d_cl;
+ if (has_bias)
+ opcode = aco_opcode::image_sample_b_cl;
+ }
} else { /* no offset */
if (has_compare) {
opcode = aco_opcode::image_sample_c;
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
- bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index];
+ bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
logical |= ctx->block->kind & block_kind_merge;
aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
aco_ptr<Pseudo_branch_instruction> branch;
if_context ic;
- if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
+ if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
/**
* Uniform conditionals are represented in the following way*) :
*
visit_cf_list(ctx, &if_stmt->else_list);
end_uniform_if(ctx, &ic);
-
- return !ctx->cf_info.has_branch;
} else { /* non-uniform condition */
/**
* To maintain a logical and linear CFG without critical edges,
visit_cf_list(ctx, &if_stmt->else_list);
end_divergent_if(ctx, &ic);
-
- return true;
}
+
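/* keep emitting code after the if only when the merge block is reachable: no unconditional branch was taken and the block has logical predecessors */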
+ return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
}
static bool visit_cf_list(isel_context *ctx,
bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
+ bool is_16bit = values[0].regClass() == v2b;
switch (col_format)
{
case V_028714_SPI_SHADER_FP16_ABGR:
enabled_channels = 0x5;
compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
+ if (is_16bit) {
+ if (ctx->options->chip_class >= GFX9) {
+ /* Pack the FP16 values together instead of converting them to
+ * FP32 and back to FP16.
+ * TODO: use p_create_vector and let the compiler optimize it.
+ */
+ compr_op = aco_opcode::v_pack_b32_f16;
+ } else {
+ for (unsigned i = 0; i < 4; i++) {
+ if ((write_mask >> i) & 1)
+ values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]);
+ }
+ }
+ }
break;
case V_028714_SPI_SHADER_UNORM16_ABGR:
enabled_channels = 0x5;
- compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
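/* v_cvt_pknorm_u16_f16 (and the snorm variant) only exist on GFX9+ */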
+ if (is_16bit && ctx->options->chip_class >= GFX9) {
+ compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
+ } else {
+ compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
+ }
break;
case V_028714_SPI_SHADER_SNORM16_ABGR:
enabled_channels = 0x5;
- compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
+ if (is_16bit && ctx->options->chip_class >= GFX9) {
+ compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
+ } else {
+ compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
+ }
break;
case V_028714_SPI_SHADER_UINT16_ABGR: {
values[i]);
}
}
+ } else if (is_16bit) {
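/* zero-extend 16-bit integers to 32 bits before they are packed for export */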
+ for (unsigned i = 0; i < 4; i++) {
+ if ((write_mask >> i) & 1) {
+ Temp tmp = convert_int(bld, values[i].getTemp(), 16, 32, false);
+ values[i] = Operand(tmp);
+ }
+ }
}
break;
}
values[i]);
}
}
+ } else if (is_16bit) {
+ for (unsigned i = 0; i < 4; i++) {
+ if ((write_mask >> i) & 1) {
+ Temp tmp = convert_int(bld, values[i].getTemp(), 16, 32, true);
+ values[i] = Operand(tmp);
+ }
+ }
}
break;
if (ngg_no_gs && !ngg_early_prim_export(&ctx))
ngg_emit_nogs_output(&ctx);
- ralloc_free(ctx.divergent_vals);
-
if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
/* Outputs of the previous stage are inputs to the next stage */
ctx.inputs = ctx.outputs;