bool commutative, bool swap_srcs=false, bool flush_denorms = false)
{
Builder bld(ctx->program, ctx->block);
+ bld.is_precise = instr->exact;
+
Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
if (src1.type() == RegType::sgpr) {
src2 = as_vgpr(ctx, src2);
Builder bld(ctx->program, ctx->block);
+ bld.is_precise = instr->exact;
if (flush_denorms && ctx->program->chip_class < GFX9) {
assert(dst.size() == 1);
Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
Builder bld(ctx->program, ctx->block);
- bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
+ bld.is_precise = instr->exact;
+ if (dst.type() == RegType::sgpr)
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+ bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
+ else
+ bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}
void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
abort();
}
Builder bld(ctx->program, ctx->block);
+ bld.is_precise = instr->exact;
Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
switch(instr->op) {
case nir_op_vec2:
case nir_op_fsat: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
+ bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
} else if (dst.regClass() == v1) {
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
/* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
case nir_op_fcos: {
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
aco_ptr<Instruction> norm;
- Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
if (dst.regClass() == v2b) {
+ Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
bld.vop1(opcode, Definition(dst), tmp);
} else if (dst.regClass() == v1) {
+ Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
/* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 8)
src = convert_int(ctx, bld, src, 8, 16, true);
+ else if (instr->src[0].src.ssa->bit_size == 64)
+ src = convert_int(ctx, bld, src, 64, 32, false);
bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
break;
}
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 8)
src = convert_int(ctx, bld, src, 8, 16, false);
+ else if (instr->src[0].src.ssa->bit_size == 64)
+ src = convert_int(ctx, bld, src, 64, 32, false);
bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
break;
}
assert(dst.size() == 1);
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 8) {
- //TODO: we should use v_cvt_f32_ubyte1/v_cvt_f32_ubyte2/etc depending on the register assignment
bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
} else {
if (instr->src[0].src.ssa->bit_size == 16)
}
case nir_op_f2i8:
case nir_op_f2i16: {
- Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 16)
- src = bld.vop1(aco_opcode::v_cvt_i16_f16, bld.def(v1), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
else if (instr->src[0].src.ssa->bit_size == 32)
- src = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src);
- else
- src = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src);
-
- if (dst.type() == RegType::vgpr)
- bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
break;
}
case nir_op_f2u8:
case nir_op_f2u16: {
- Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 16)
- src = bld.vop1(aco_opcode::v_cvt_u16_f16, bld.def(v1), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
else if (instr->src[0].src.ssa->bit_size == 32)
- src = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
else
- src = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src);
-
- if (dst.type() == RegType::vgpr)
- bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
break;
}
case nir_op_f2i32: {
bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
}
} else if (instr->src[0].src.ssa->bit_size == 32) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
} else if (instr->src[0].src.ssa->bit_size == 64) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
}
} else if (instr->src[0].src.ssa->bit_size == 32) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
} else if (instr->src[0].src.ssa->bit_size == 64) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_unpack_half_2x16_split_x: {
if (dst.regClass() == v1) {
- Builder bld(ctx->program, ctx->block);
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
}
case nir_op_unpack_half_2x16_split_y: {
if (dst.regClass() == v1) {
- Builder bld(ctx->program, ctx->block);
/* TODO: use SDWA here */
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
}
/* shift result right if needed */
- if (info->component_size < 4) {
+ if (info->component_size < 4 && byte_align_loads) {
Operand align((uint32_t)byte_align);
if (byte_align == -1) {
if (offset.isConstant())
if (target == V_008DFC_SQ_EXP_NULL)
return false;
+ /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
+ if (ctx->options->enable_mrt_output_nan_fixup &&
+ !is_16bit &&
+ (col_format == V_028714_SPI_SHADER_32_R ||
+ col_format == V_028714_SPI_SHADER_32_GR ||
+ col_format == V_028714_SPI_SHADER_32_AR ||
+ col_format == V_028714_SPI_SHADER_32_ABGR ||
+ col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
+ for (int i = 0; i < 4; i++) {
+ if (!(write_mask & (1 << i)))
+ continue;
+
+ Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32,
+ bld.hint_vcc(bld.def(bld.lm)), values[i],
+ bld.copy(bld.def(v1), Operand(3u)));
+ values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
+ bld.copy(bld.def(v1), Operand(0u)), isnan);
+ }
+ }
+
if ((bool) compr_op) {
for (int i = 0; i < 2; i++) {
/* check if at least one of the values to be compressed is enabled */
float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
- /* default to preserving fp16 and fp64 denorms, since it's free */
+ /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
+ * the precision seems needed for Wolfenstein: Youngblood to render correctly */
if (program->next_fp_mode.must_flush_denorms16_64)
program->next_fp_mode.denorm16_64 = 0;
else