bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
if (select != Temp())
- hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), select);
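+ /* s_cselect_b32 reads its condition from SCC. */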
+ hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
Temp mid = bld.tmp(s1);
lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
bool commutative, bool swap_srcs=false, bool flush_denorms = false)
{
Builder bld(ctx->program, ctx->block);
+ bld.is_precise = instr->exact;
+
Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
if (src1.type() == RegType::sgpr) {
}
}
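+/* Emit a 64-bit bitwise VOP2 (v_and/v_or/v_xor_b32) by splitting both sources
+ * into 32-bit halves and applying the 32-bit opcode to each half. */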
+void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
+ aco_opcode op, Temp dst)
+{
+ Builder bld(ctx->program, ctx->block);
+ bld.is_precise = instr->exact;
+
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = get_alu_src(ctx, instr->src[1]);
+
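+ /* VOP2 can only take an SGPR in src0, so move it there. */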
+ if (src1.type() == RegType::sgpr) {
+ assert(src0.type() == RegType::vgpr);
+ std::swap(src0, src1);
+ }
+
+ Temp src00 = bld.tmp(src0.type(), 1);
+ Temp src01 = bld.tmp(src0.type(), 1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
+ Temp src10 = bld.tmp(v1);
+ Temp src11 = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
+ Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
+ Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
+}
+
void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
bool flush_denorms = false)
{
src2 = as_vgpr(ctx, src2);
Builder bld(ctx->program, ctx->block);
+ bld.is_precise = instr->exact;
if (flush_denorms && ctx->program->chip_class < GFX9) {
assert(dst.size() == 1);
Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
Builder bld(ctx->program, ctx->block);
- bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
+ bld.is_precise = instr->exact;
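+ /* VOP1 writes a VGPR, so an SGPR dst goes through a temporary VGPR and p_as_uniform. */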
+ if (dst.type() == RegType::sgpr)
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+ bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
+ else
+ bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}
void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
abort();
}
Builder bld(ctx->program, ctx->block);
+ bld.is_precise = instr->exact;
Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
switch(instr->op) {
case nir_op_vec2:
bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
+ } else if (dst.regClass() == v2) {
+ Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
+ lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
+ hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
} else if (dst.type() == RegType::sgpr) {
aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
emit_boolean_logic(ctx, instr, Builder::s_or, dst);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
+ } else if (dst.regClass() == v2) {
+ emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
} else if (dst.regClass() == s2) {
emit_boolean_logic(ctx, instr, Builder::s_and, dst);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
+ } else if (dst.regClass() == v2) {
+ emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
} else if (dst.regClass() == s2) {
emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
+ } else if (dst.regClass() == v2) {
+ emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
} else if (dst.regClass() == s2) {
case nir_op_fsat: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
+ bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
} else if (dst.regClass() == v1) {
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
/* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
case nir_op_fcos: {
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
aco_ptr<Instruction> norm;
- Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
if (dst.regClass() == v2b) {
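+ /* 0x3118 is 1/(2*PI) in fp16. */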
+ Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
bld.vop1(opcode, Definition(dst), tmp);
} else if (dst.regClass() == v1) {
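+ /* 0x3e22f983 is 1/(2*PI) in fp32. */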
+ Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
/* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 8)
src = convert_int(ctx, bld, src, 8, 16, true);
+ else if (instr->src[0].src.ssa->bit_size == 64)
+ src = convert_int(ctx, bld, src, 64, 32, false);
bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
break;
}
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 8)
src = convert_int(ctx, bld, src, 8, 16, false);
+ else if (instr->src[0].src.ssa->bit_size == 64)
+ src = convert_int(ctx, bld, src, 64, 32, false);
bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
break;
}
assert(dst.size() == 1);
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 8) {
- //TODO: we should use v_cvt_f32_ubyte1/v_cvt_f32_ubyte2/etc depending on the register assignment
bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
} else {
if (instr->src[0].src.ssa->bit_size == 16)
}
case nir_op_f2i8:
case nir_op_f2i16: {
- Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 16)
- src = bld.vop1(aco_opcode::v_cvt_i16_f16, bld.def(v1), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
else if (instr->src[0].src.ssa->bit_size == 32)
- src = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
else
- src = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src);
-
- if (dst.type() == RegType::vgpr)
- bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
break;
}
case nir_op_f2u8:
case nir_op_f2u16: {
- Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 16)
- src = bld.vop1(aco_opcode::v_cvt_u16_f16, bld.def(v1), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
else if (instr->src[0].src.ssa->bit_size == 32)
- src = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
else
- src = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src);
-
- if (dst.type() == RegType::vgpr)
- bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
break;
}
case nir_op_f2i32: {
bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
}
} else if (instr->src[0].src.ssa->bit_size == 32) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
} else if (instr->src[0].src.ssa->bit_size == 64) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
}
} else if (instr->src[0].src.ssa->bit_size == 32) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
} else if (instr->src[0].src.ssa->bit_size == 64) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_unpack_half_2x16_split_x: {
if (dst.regClass() == v1) {
- Builder bld(ctx->program, ctx->block);
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
}
case nir_op_unpack_half_2x16_split_y: {
if (dst.regClass() == v1) {
- Builder bld(ctx->program, ctx->block);
/* TODO: use SDWA here */
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
if (byte_align) {
- if ((bytes_needed > 2 || !supports_8bit_16bit_loads) && byte_align_loads) {
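+ /* A 2-byte load can only skip the byte-align path if it is known to be 2-byte aligned. */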
+ if ((bytes_needed > 2 ||
+ (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
+ !supports_8bit_16bit_loads) && byte_align_loads) {
if (info->component_stride) {
assert(supports_8bit_16bit_loads && "unimplemented");
bytes_needed = 2;
}
/* shift result right if needed */
- if (info->component_size < 4) {
+ if (info->component_size < 4 && byte_align_loads) {
Operand align((uint32_t)byte_align);
if (byte_align == -1) {
if (offset.isConstant())
/* dword or larger stores have to be dword-aligned */
unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
- unsigned align_offset = instr ? nir_intrinsic_align_mul(instr) : 0;
- bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0;
- if (bytes >= 4 && !dword_aligned)
- bytes = MIN2(bytes, 2);
+ unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
+ bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
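+ /* Split misaligned stores into halfword pieces when 2-byte aligned, otherwise single bytes. */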
+ if (!dword_aligned)
+ bytes = MIN2(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
advance_write_mask(&todo, offset, bytes);
write_count_with_skips++;
void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
- bool glc=false, bool readonly=true)
+ bool glc=false, bool readonly=true, bool allow_smem=true)
{
Builder bld(ctx->program, ctx->block);
- bool use_smem = dst.type() != RegType::vgpr && ((ctx->options->chip_class >= GFX8 && component_size >= 4) || readonly);
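+ /* GLC (coherent) SMEM loads are only supported on GFX8+; older chips take the VMEM path. */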
+ bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
if (use_smem)
offset = bld.as_uniform(offset);
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
- bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
+ unsigned access = nir_intrinsic_access(instr);
+ bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
unsigned size = instr->dest.ssa.bit_size / 8;
+
+ uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access);
+ /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
+ * TODO: this optimization is disabled for now because we still need to ensure correct ordering
+ */
+ bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store));
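+ /* Read-only (restrict + non-writeable) or reorderable buffers can never observe an in-flight VMEM store, so SMEM is always safe for them. */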
+ allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
+
load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
- nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false);
+ nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
}
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
+ bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+ uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
+ /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
+ * TODO: this optimization is disabled for now because we still need to ensure correct ordering
+ */
+ bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
+
bool smem = !nir_src_is_divergent(instr->src[2]) &&
ctx->options->chip_class >= GFX8 &&
- elem_size_bytes >= 4;
+ (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
+ allow_smem;
if (smem)
offset = bld.as_uniform(offset);
bool smem_nonfs = smem && ctx->stage != fragment_fs;
if (op != aco_opcode::p_fs_buffer_store_smem)
store->operands[1].setFixed(m0);
store->operands[2] = Operand(write_datas[i]);
- store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+ store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->barrier = barrier_buffer;
store->operands[3] = Operand(write_datas[i]);
store->offset = offsets[i];
store->offen = (offset.type() == RegType::vgpr);
- store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+ store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->barrier = barrier_buffer;
}
-Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
+Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc)
{
Temp tmp = get_ssa_temp(ctx, ssa);
if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
- return Operand(tmp.regClass());
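+ /* Return an undef operand that matches the phi definition's regclass. */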
+ return Operand(rc);
else
return Operand(tmp);
}
if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
continue;
cur_pred_idx++;
- Operand op = get_phi_operand(ctx, src.second);
+ Operand op = get_phi_operand(ctx, src.second, dst.regClass());
operands[num_operands++] = op;
num_defined += !op.isUndefined();
}
if (target == V_008DFC_SQ_EXP_NULL)
return false;
+ /* Replace NaN with zero (only 32-bit) to work around game bugs if requested. */
+ if (ctx->options->enable_mrt_output_nan_fixup &&
+ !is_16bit &&
+ (col_format == V_028714_SPI_SHADER_32_R ||
+ col_format == V_028714_SPI_SHADER_32_GR ||
+ col_format == V_028714_SPI_SHADER_32_AR ||
+ col_format == V_028714_SPI_SHADER_32_ABGR ||
+ col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
+ for (int i = 0; i < 4; i++) {
+ if (!(write_mask & (1 << i)))
+ continue;
+
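+ /* v_cmp_class mask 3 = signaling NaN | quiet NaN. */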
+ Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32,
+ bld.hint_vcc(bld.def(bld.lm)), values[i],
+ bld.copy(bld.def(v1), Operand(3u)));
+ values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
+ bld.copy(bld.def(v1), Operand(0u)), isnan);
+ }
+ }
+
if ((bool) compr_op) {
for (int i = 0; i < 2; i++) {
/* check if at least one of the values to be compressed is enabled */
float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
- /* default to preserving fp16 and fp64 denorms, since it's free */
+ /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
+ * the precision seems needed for Wolfenstein: Youngblood to render correctly */
if (program->next_fp_mode.must_flush_denorms16_64)
program->next_fp_mode.denorm16_64 = 0;
else