if (ctx->program->wave_size == 32) {
return thread_id_lo;
+ } else if (ctx->program->chip_class <= GFX7) {
+ Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
+ return thread_id_hi;
} else {
- Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
+ Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
return thread_id_hi;
}
}
if (index.regClass() == s1)
return bld.readlane(bld.def(s1), data, index);
- Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
-
- /* Currently not implemented on GFX6-7 */
- assert(ctx->options->chip_class >= GFX8);
-
- if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) {
+ if (ctx->options->chip_class <= GFX7) {
+ /* GFX6-7: there is no bpermute instruction */
+ Operand index_op(index);
+ Operand input_data(data);
+ index_op.setLateKill(true);
+ input_data.setLateKill(true);
+
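+ /* This pseudo is lowered after register allocation; the extra lane-mask
+  * definitions presumably give that lowering scratch registers to work with. */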
+ return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
+ } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
+ /* GFX10 wave64 mode: emulate full-wave bpermute */
+ if (!ctx->has_gfx10_wave64_bpermute) {
+ ctx->has_gfx10_wave64_bpermute = true;
+ ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
+ ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
+ }
+
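+ /* Compute a lane mask that is set when a lane's shuffle index points into its
+  * own half-wave: the low half uses (index <= 31), the high half its inverse. */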
+ Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
+ Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
+ Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
+ Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
+ Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
+ Operand input_data(data);
+
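+ /* lateKill keeps these operands live past the definitions, so the register
+  * allocator won't overlap them with the results. */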
+ index_x4.setLateKill(true);
+ input_data.setLateKill(true);
+ same_half.setLateKill(true);
+
+ return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
+ } else {
+ /* GFX8-9 or GFX10 wave32: bpermute works normally */
+ Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
}
+}
- /* GFX10, wave64 mode:
- * The bpermute instruction is limited to half-wave operation, which means that it can't
- * properly support subgroup shuffle like older generations (or wave32 mode), so we
- * emulate it here.
- */
- if (!ctx->has_gfx10_wave64_bpermute) {
- ctx->has_gfx10_wave64_bpermute = true;
- ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
- ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
- }
+static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask)
+{
+ if (ctx->options->chip_class >= GFX8) {
+ unsigned and_mask = mask & 0x1f;
+ unsigned or_mask = (mask >> 5) & 0x1f;
+ unsigned xor_mask = (mask >> 10) & 0x1f;
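+ /* masked ds_swizzle mode: new_lane = ((lane & and_mask) | or_mask) ^ xor_mask */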
+
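+ /* 0xffff = no DPP equivalent found; fall back to ds_swizzle below */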
+ uint16_t dpp_ctrl = 0xffff;
- Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
- Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
- Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
- Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi);
+ // TODO: we could use DPP8 for some swizzles
+ if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
+ unsigned res[4] = {0, 1, 2, 3};
+ for (unsigned i = 0; i < 4; i++)
+ res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
+ dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
+ } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
+ dpp_ctrl = dpp_row_rr(8);
+ } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
+ dpp_ctrl = dpp_row_mirror;
+ } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
+ dpp_ctrl = dpp_row_half_mirror;
+ }
- return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
- bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
+ if (dpp_ctrl != 0xffff)
+ return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
+ }
+
+ return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
}
Temp as_vgpr(isel_context *ctx, Temp val)
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
if (select != Temp())
- hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), select);
+ hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
Temp mid = bld.tmp(s1);
lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
}
}
-/* this function trims subdword vectors:
- * if dst is vgpr - split the src and create a shrunk version according to the mask.
- * if dst is sgpr - split the src, but move the original to sgpr. */
-void trim_subdword_vector(isel_context *ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
+void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
{
- assert(vec_src.type() == RegType::vgpr);
- emit_split_vector(ctx, vec_src, num_components);
-
Builder bld(ctx->program, ctx->block);
- std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
- unsigned component_size = vec_src.bytes() / num_components;
- RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
+ if (offset.isTemp()) {
+ Temp tmp[4] = {vec, vec, vec, vec};
- unsigned k = 0;
- for (unsigned i = 0; i < num_components; i++) {
- if (mask & (1 << i))
- elems[k++] = emit_extract_vector(ctx, vec_src, i, rc);
+ if (vec.size() == 4) {
+ tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
+ } else if (vec.size() == 3) {
+ tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
+ } else if (vec.size() == 2) {
+ tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
+ bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
+ }
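+ /* v_alignbyte_b32 concatenates two adjacent dwords and byte-shifts them right
+  * by offset, extracting the unaligned dword window. */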
+ for (unsigned i = 0; i < dst.size(); i++)
+ tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
+
+ vec = tmp[0];
+ if (dst.size() == 2)
+ vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
+
+ offset = Operand(0u);
+ }
+
+ unsigned num_components = dst.bytes() / component_size;
+ if (vec.regClass() == dst.regClass()) {
+ assert(offset.constantValue() == 0);
+ bld.copy(Definition(dst), vec);
+ emit_split_vector(ctx, dst, num_components);
+ return;
}
+ emit_split_vector(ctx, vec, vec.bytes() / component_size);
+ std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
+ RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
+
+ assert(offset.constantValue() % component_size == 0);
+ unsigned skip = offset.constantValue() / component_size;
+ for (unsigned i = 0; i < num_components; i++)
+ elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);
+
+ /* if dst is vgpr - split the src and create a shrunk version according to the offset. */
if (dst.type() == RegType::vgpr) {
- assert(dst.bytes() == k * component_size);
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, k, 1)};
- for (unsigned i = 0; i < k; i++)
- vec->operands[i] = Operand(elems[i]);
- vec->definitions[0] = Definition(dst);
- bld.insert(std::move(vec));
+ aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
+ for (unsigned i = 0; i < num_components; i++)
+ create_vec->operands[i] = Operand(elems[i]);
+ create_vec->definitions[0] = Definition(dst);
+ bld.insert(std::move(create_vec));
+
+ /* if dst is sgpr - split the src, but move the original to sgpr. */
+ } else if (skip) {
+ vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
+ byte_align_scalar(ctx, vec, offset, dst);
} else {
- // TODO: alignbyte if mask doesn't start with 1?
- assert(mask & 1);
- assert(dst.size() == vec_src.size());
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
+ assert(dst.size() == vec.size());
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
}
+
ctx->allocated_vec.emplace(dst.id(), elems);
}
bool commutative, bool swap_srcs=false, bool flush_denorms = false)
{
Builder bld(ctx->program, ctx->block);
+ bld.is_precise = instr->exact;
+
Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
if (src1.type() == RegType::sgpr) {
}
}
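+/* Emits a 64-bit bitwise logic op as two 32-bit VOP2 ops on the split halves. */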
+void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
+ aco_opcode op, Temp dst)
+{
+ Builder bld(ctx->program, ctx->block);
+ bld.is_precise = instr->exact;
+
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = get_alu_src(ctx, instr->src[1]);
+
+ if (src1.type() == RegType::sgpr) {
+ assert(src0.type() == RegType::vgpr);
+ std::swap(src0, src1);
+ }
+
+ Temp src00 = bld.tmp(src0.type(), 1);
+ Temp src01 = bld.tmp(src0.type(), 1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
+ Temp src10 = bld.tmp(v1);
+ Temp src11 = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
+ Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
+ Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
+}
+
void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
bool flush_denorms = false)
{
src2 = as_vgpr(ctx, src2);
Builder bld(ctx->program, ctx->block);
+ bld.is_precise = instr->exact;
if (flush_denorms && ctx->program->chip_class < GFX9) {
assert(dst.size() == 1);
Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
Builder bld(ctx->program, ctx->block);
- bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
+ bld.is_precise = instr->exact;
+ if (dst.type() == RegType::sgpr)
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+ bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
+ else
+ bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}
void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
{
aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
- bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
bool use_valu = s_op == aco_opcode::num_opcodes ||
- divergent_vals ||
+ nir_dest_is_divergent(instr->dest.dest) ||
ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
aco_opcode op = use_valu ? v_op : s_op;
assert(els.regClass() == bld.lm);
}
- if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
+ if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
if (dst.regClass() == s1 || dst.regClass() == s2) {
assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
assert(dst.size() == then.size());
bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
/* Extract the exponent and compute the unbiased value. */
- Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f64, bld.def(v1), val);
+ Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
+ exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
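+ /* (the 11-bit exponent sits in bits 20..30 of the high dword, biased by 1023) */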
/* Extract the fractional part. */
Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
/* Get the sign bit. */
- Temp sign = bld.vop2(aco_opcode::v_ashr_i32, bld.def(v1), Operand(31u), val_hi);
+ Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);
/* Decide the operation to apply depending on the unbiased exponent. */
Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
if (ctx->options->chip_class >= GFX7)
return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
- /* GFX6 doesn't support V_FLOOR_F64, lower it. */
+ /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
+ * lowered at NIR level for precision reasons). */
Temp src0 = as_vgpr(ctx, val);
Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
return add->definitions[0].getTemp();
}
-Temp convert_int(Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
+Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
if (!dst.id()) {
if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
else
bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
- } else {
+ } else if (ctx->options->chip_class >= GFX8) {
assert(src_bits != 8 || src.regClass() == v1b);
assert(src_bits != 16 || src.regClass() == v2b);
aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
bld.insert(std::move(sdwa));
+ } else {
+ assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
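+ /* GFX6-7 have no SDWA; zero-/sign-extend from bit 0 with v_bfe instead */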
+ aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
+ bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
}
if (dst_bits == 64) {
abort();
}
Builder bld(ctx->program, ctx->block);
+ bld.is_precise = instr->exact;
Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
switch(instr->op) {
case nir_op_vec2:
bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
else
unreachable("wrong src register class for nir_op_imov");
- } else if (dst.regClass() == v1) {
- bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
- } else if (dst.regClass() == v2) {
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
} else {
- nir_print_instr(&instr->instr, stderr);
- unreachable("Should have been lowered to scalar.");
+ if (dst.regClass() == v1)
+ bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
+ else if (dst.regClass() == v1b ||
+ dst.regClass() == v2b ||
+ dst.regClass() == v2)
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
+ else
+ unreachable("wrong src register class for nir_op_imov");
}
break;
}
bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
+ } else if (dst.regClass() == v2) {
+ Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
+ lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
+ hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
} else if (dst.type() == RegType::sgpr) {
aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
emit_boolean_logic(ctx, instr, Builder::s_or, dst);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
+ } else if (dst.regClass() == v2) {
+ emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
} else if (dst.regClass() == s2) {
emit_boolean_logic(ctx, instr, Builder::s_and, dst);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
+ } else if (dst.regClass() == v2) {
+ emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
} else if (dst.regClass() == s2) {
emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
+ } else if (dst.regClass() == v2) {
+ emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
} else if (dst.regClass() == s2) {
} else if (dst.regClass() == v1) {
emit_rsq(ctx, bld, Definition(dst), src);
} else if (dst.regClass() == v2) {
+ /* Lowered at NIR level for precision reasons. */
emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
case nir_op_fsat: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
- bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
+ bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
} else if (dst.regClass() == v1) {
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
/* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
} else if (dst.regClass() == v1) {
emit_rcp(ctx, bld, Definition(dst), src);
} else if (dst.regClass() == v2) {
+ /* Lowered at NIR level for precision reasons. */
emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
} else if (dst.regClass() == v1) {
emit_sqrt(ctx, bld, Definition(dst), src);
} else if (dst.regClass() == v2) {
+ /* Lowered at NIR level for precision reasons. */
emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
case nir_op_fcos: {
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
aco_ptr<Instruction> norm;
- Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
if (dst.regClass() == v2b) {
+ Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
bld.vop1(opcode, Definition(dst), tmp);
} else if (dst.regClass() == v1) {
+ Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
/* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
if (instr->src[0].src.ssa->bit_size == 16) {
Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
- convert_int(bld, tmp, 8, 32, true, dst);
+ convert_int(ctx, bld, tmp, 8, 32, true, dst);
} else if (instr->src[0].src.ssa->bit_size == 32) {
bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
} else if (instr->src[0].src.ssa->bit_size == 64) {
assert(dst.regClass() == v2b);
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 8)
- src = convert_int(bld, src, 8, 16, true);
+ src = convert_int(ctx, bld, src, 8, 16, true);
+ else if (instr->src[0].src.ssa->bit_size == 64)
+ src = convert_int(ctx, bld, src, 64, 32, false);
bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
break;
}
assert(dst.size() == 1);
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size <= 16)
- src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true);
+ src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
break;
}
if (instr->src[0].src.ssa->bit_size <= 32) {
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size <= 16)
- src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true);
+ src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
} else if (instr->src[0].src.ssa->bit_size == 64) {
Temp src = get_alu_src(ctx, instr->src[0]);
assert(dst.regClass() == v2b);
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 8)
- src = convert_int(bld, src, 8, 16, false);
+ src = convert_int(ctx, bld, src, 8, 16, false);
+ else if (instr->src[0].src.ssa->bit_size == 64)
+ src = convert_int(ctx, bld, src, 64, 32, false);
bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
break;
}
assert(dst.size() == 1);
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 8) {
- //TODO: we should use v_cvt_f32_ubyte1/v_cvt_f32_ubyte2/etc depending on the register assignment
bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
} else {
if (instr->src[0].src.ssa->bit_size == 16)
- src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true);
+ src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
}
break;
if (instr->src[0].src.ssa->bit_size <= 32) {
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size <= 16)
- src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, false);
+ src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
} else if (instr->src[0].src.ssa->bit_size == 64) {
Temp src = get_alu_src(ctx, instr->src[0]);
}
case nir_op_f2i8:
case nir_op_f2i16: {
- Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 16)
- src = bld.vop1(aco_opcode::v_cvt_i16_f16, bld.def(v1), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
else if (instr->src[0].src.ssa->bit_size == 32)
- src = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src);
- else
- src = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src);
-
- if (dst.type() == RegType::vgpr)
- bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
break;
}
case nir_op_f2u8:
case nir_op_f2u16: {
- Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 16)
- src = bld.vop1(aco_opcode::v_cvt_u16_f16, bld.def(v1), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
else if (instr->src[0].src.ssa->bit_size == 32)
- src = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src);
- else
- src = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src);
-
- if (dst.type() == RegType::vgpr)
- bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
break;
}
case nir_op_f2i32: {
bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
}
} else if (instr->src[0].src.ssa->bit_size == 32) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
} else if (instr->src[0].src.ssa->bit_size == 64) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
}
} else if (instr->src[0].src.ssa->bit_size == 32) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
} else if (instr->src[0].src.ssa->bit_size == 64) {
- if (dst.type() == RegType::vgpr)
- bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
- else
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
- bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
-
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
case nir_op_i2i16:
case nir_op_i2i32:
case nir_op_i2i64: {
- convert_int(bld, get_alu_src(ctx, instr->src[0]),
+ convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst);
break;
}
case nir_op_u2u16:
case nir_op_u2u32:
case nir_op_u2u64: {
- convert_int(bld, get_alu_src(ctx, instr->src[0]),
+ convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst);
break;
}
}
case nir_op_unpack_half_2x16_split_x: {
if (dst.regClass() == v1) {
- Builder bld(ctx->program, ctx->block);
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
}
case nir_op_unpack_half_2x16_split_y: {
if (dst.regClass() == v1) {
- Builder bld(ctx->program, ctx->block);
/* TODO: use SDWA here */
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
return new_mask;
}
-void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst)
-{
- Builder bld(ctx->program, ctx->block);
- if (offset.isTemp()) {
- Temp tmp[3] = {vec, vec, vec};
-
- if (vec.size() == 3) {
- tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
- bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
- } else if (vec.size() == 2) {
- tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
- bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
- }
- for (unsigned i = 0; i < dst.size(); i++)
- tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
-
- vec = tmp[0];
- if (dst.size() == 2)
- vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
-
- offset = Operand(0u);
- }
-
- if (vec.bytes() == dst.bytes() && offset.constantValue() == 0)
- bld.copy(Definition(dst), vec);
- else
- trim_subdword_vector(ctx, vec, dst, vec.bytes(), ((1 << dst.bytes()) - 1) << offset.constantValue());
-}
-
struct LoadEmitInfo {
Operand offset;
Temp dst;
int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
if (byte_align) {
- if ((bytes_needed > 2 || !supports_8bit_16bit_loads) && byte_align_loads) {
+ if ((bytes_needed > 2 ||
+ (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
+ !supports_8bit_16bit_loads) && byte_align_loads) {
if (info->component_stride) {
assert(supports_8bit_16bit_loads && "unimplemented");
bytes_needed = 2;
Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
reduced_const_offset, byte_align ? Temp() : info->dst);
+ /* the callback wrote directly to dst */
+ if (val == info->dst) {
+ assert(num_vals == 0);
+ emit_split_vector(ctx, info->dst, info->num_components);
+ return;
+ }
+
/* shift result right if needed */
- if (byte_align) {
+ if (info->component_size < 4 && byte_align_loads) {
Operand align((uint32_t)byte_align);
if (byte_align == -1) {
if (offset.isConstant())
align = offset;
}
- if (align.isTemp() || align.constantValue()) {
- assert(val.bytes() >= load_size && "unimplemented");
- Temp new_val = bld.tmp(RegClass::get(val.type(), load_size));
- if (val.type() == RegType::sgpr)
- byte_align_scalar(ctx, val, align, new_val);
- else
- byte_align_vector(ctx, val, align, new_val);
- val = new_val;
- }
+ assert(val.bytes() >= load_size && "unimplemented");
+ if (val.type() == RegType::sgpr)
+ byte_align_scalar(ctx, val, align, info->dst);
+ else
+ byte_align_vector(ctx, val, align, info->dst, component_size);
+ return;
}
/* add result to list and advance */
vals[num_vals++] = val;
}
- /* the callback wrote directly to dst */
- if (vals[0] == info->dst) {
- assert(num_vals == 1);
- emit_split_vector(ctx, info->dst, info->num_components);
- return;
- }
-
/* create array of components */
unsigned components_split = 0;
std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
mubuf->definitions[0] = Definition(val);
bld.insert(std::move(mubuf));
- if (bytes_size < 4)
- val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u));
-
return val;
}
bld.insert(std::move(flat));
}
- if (bytes_size < 4)
- val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u));
-
return val;
}
/* dword or larger stores have to be dword-aligned */
unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
- unsigned align_offset = instr ? nir_intrinsic_align_mul(instr) : 0;
- bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0;
- if (bytes >= 4 && !dword_aligned)
- bytes = MIN2(bytes, 2);
+ unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
+ bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
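+ /* unaligned stores are split into 2-byte or 1-byte pieces, depending on what the alignment allows */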
+ if (!dword_aligned)
+ bytes = MIN2(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
advance_write_mask(&todo, offset, bytes);
write_count_with_skips++;
bool allow_combining = true, bool reorder = true, bool slc = false)
{
Builder bld(ctx->program, ctx->block);
- assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+ assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
assert(write_mask);
write_mask = widen_mask(write_mask, elem_size_bytes);
unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
{
- assert(elem_size_bytes == 4 || elem_size_bytes == 8);
- assert((num_components * elem_size_bytes / 4) == dst.size());
+ assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
+ assert((num_components * elem_size_bytes) == dst.bytes());
assert(!!stride != allow_combining);
Builder bld(ctx->program, ctx->block);
if (instr->src[0].ssa->bit_size == 64)
write_mask = widen_mask(write_mask, 2);
+ RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
+
for (unsigned i = 0; i < 8; ++i) {
if (write_mask & (1 << i)) {
ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
- ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1);
+ ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
}
idx++;
}
Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
Builder bld(ctx->program, ctx->block);
- Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
- if (ctx->program->has_16bank_lds)
- interp_p1.instr->operands[0].setLateKill(true);
- bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, component);
+
+ if (dst.regClass() == v2b) {
+ if (ctx->program->has_16bank_lds) {
+ assert(ctx->options->chip_class <= GFX8);
+ Builder::Result interp_p1 =
+ bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1),
+ Operand(2u) /* P0 */, bld.m0(prim_mask), idx, component);
+ interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b),
+ coord1, bld.m0(prim_mask), interp_p1, idx, component);
+ bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2,
+ bld.m0(prim_mask), interp_p1, idx, component);
+ } else {
+ aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
+
+ if (ctx->options->chip_class == GFX8)
+ interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
+
+ Builder::Result interp_p1 =
+ bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1),
+ coord1, bld.m0(prim_mask), idx, component);
+ bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask),
+ interp_p1, idx, component);
+ }
+ } else {
+ Builder::Result interp_p1 =
+ bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
+ bld.m0(prim_mask), idx, component);
+
+ if (ctx->program->has_16bank_lds)
+ interp_p1.instr->operands[0].setLateKill(true);
+
+ bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2,
+ bld.m0(prim_mask), interp_p1, idx, component);
+ }
}
void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
unsigned component = nir_intrinsic_component(instr);
+ unsigned bitsize = instr->dest.ssa.bit_size;
unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
/* load channels */
while (channel_start < num_channels) {
- unsigned fetch_size = num_channels - channel_start;
+ unsigned fetch_component = num_channels - channel_start;
unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
bool expanded = false;
vtx_info->chan_byte_size == 4;
unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
if (!use_mubuf) {
- fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size);
+ fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component);
} else {
- if (fetch_size == 3 && ctx->options->chip_class == GFX6) {
+ if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
/* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
- fetch_size = 4;
+ fetch_component = 4;
expanded = true;
}
}
+ unsigned fetch_bytes = fetch_component * bitsize / 8;
+
Temp fetch_index = index;
if (attrib_stride != 0 && fetch_offset > attrib_stride) {
fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
}
aco_opcode opcode;
- switch (fetch_size) {
- case 1:
- opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
- break;
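+ /* select the opcode by fetch size in bytes so the 16-bit d16 formats can be used */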
+ switch (fetch_bytes) {
case 2:
- opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
+ assert(!use_mubuf && bitsize == 16);
+ opcode = aco_opcode::tbuffer_load_format_d16_x;
+ break;
+ case 4:
+ if (bitsize == 16) {
+ assert(!use_mubuf);
+ opcode = aco_opcode::tbuffer_load_format_d16_xy;
+ } else {
+ opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
+ }
+ break;
+ case 6:
+ assert(!use_mubuf && bitsize == 16);
+ opcode = aco_opcode::tbuffer_load_format_d16_xyz;
break;
- case 3:
+ case 8:
+ if (bitsize == 16) {
+ assert(!use_mubuf);
+ opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
+ } else {
+ opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
+ }
+ break;
+ case 12:
assert(ctx->options->chip_class >= GFX7 ||
(!use_mubuf && ctx->options->chip_class == GFX6));
opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
break;
- case 4:
+ case 16:
opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
break;
default:
}
Temp fetch_dst;
- if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle &&
+ if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle &&
!expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE ||
num_channels <= 3)) {
direct_fetch = true;
fetch_dst = dst;
} else {
- fetch_dst = bld.tmp(RegType::vgpr, fetch_size);
+ fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
}
if (use_mubuf) {
emit_split_vector(ctx, fetch_dst, fetch_dst.size());
- if (fetch_size == 1) {
+ if (fetch_component == 1) {
channels[channel_start] = fetch_dst;
} else {
- for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++)
- channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1);
+ for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
+ channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i,
+ bitsize == 16 ? v2b : v1);
}
- channel_start += fetch_size;
+ channel_start += fetch_component;
}
if (!direct_fetch) {
{
Builder bld(ctx->program, ctx->block);
Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
- if (!ctx->divergent_vals[instr->dest.ssa.index])
+ if (!nir_dest_is_divergent(instr->dest))
index = bld.as_uniform(index);
unsigned desc_set = nir_intrinsic_desc_set(instr);
unsigned binding = nir_intrinsic_binding(instr);
void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
- bool glc=false, bool readonly=true)
+ bool glc=false, bool readonly=true, bool allow_smem=true)
{
Builder bld(ctx->program, ctx->block);
- bool use_smem = dst.type() != RegType::vgpr && ((ctx->options->chip_class >= GFX8 && component_size >= 4) || readonly);
+ bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
if (use_smem)
offset = bld.as_uniform(offset);
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
- bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
+ unsigned access = nir_intrinsic_access(instr);
+ bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
unsigned size = instr->dest.ssa.bit_size / 8;
+
+ uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access);
+ /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
+ * TODO: this optimization is disabled for now because we still need to ensure correct ordering
+ */
+ bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store));
+ allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
+
load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
- nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false);
+ nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
}
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
- bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
+ bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+ uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
+ /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
+ * TODO: this optimization is disabled for now because we still need to ensure correct ordering
+ */
+ bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
+
+ bool smem = !nir_src_is_divergent(instr->src[2]) &&
ctx->options->chip_class >= GFX8 &&
- elem_size_bytes >= 4;
+ (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
+ allow_smem;
if (smem)
offset = bld.as_uniform(offset);
bool smem_nonfs = smem && ctx->stage != fragment_fs;
if (op != aco_opcode::p_fs_buffer_store_smem)
store->operands[1].setFixed(m0);
store->operands[2] = Operand(write_datas[i]);
- store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+ store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->barrier = barrier_buffer;
store->operands[3] = Operand(write_datas[i]);
store->offset = offsets[i];
store->offen = (offset.type() == RegType::vgpr);
- store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+ store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->barrier = barrier_buffer;
op32 = aco_opcode::ds_write_b32;
op64 = aco_opcode::ds_write_b64;
op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
- op64_rtn = aco_opcode::ds_wrxchg2_rtn_b64;
+ op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
break;
case nir_intrinsic_shared_atomic_comp_swap:
op32 = aco_opcode::ds_cmpst_b32;
case nir_intrinsic_shuffle:
case nir_intrinsic_read_invocation: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- if (!ctx->divergent_vals[instr->src[0].ssa->index]) {
+ if (!nir_src_is_divergent(instr->src[0])) {
emit_uniform_subgroup(ctx, instr, src);
} else {
Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
- if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index])
+ if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1]))
tid = bld.as_uniform(tid);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- if (src.regClass() == v1) {
+ if (src.regClass() == v1b || src.regClass() == v2b) {
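+ /* bpermute shuffles whole dwords, so do the shuffle in a full VGPR and keep the low byte/word */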
+ Temp tmp = bld.tmp(v1);
+ tmp = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), tmp);
+ if (dst.type() == RegType::vgpr)
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
+ else
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
+ } else if (src.regClass() == v1) {
emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
} else if (src.regClass() == v2) {
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
case nir_intrinsic_read_first_invocation: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- if (src.regClass() == v1) {
+ if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
emit_wqm(ctx,
bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
dst);
nir_intrinsic_cluster_size(instr) : 0;
cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
- if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
+ if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) {
emit_uniform_subgroup(ctx, instr, src);
} else if (instr->dest.ssa.bit_size == 1) {
if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
} else if (cluster_size == 1) {
bld.copy(Definition(dst), src);
} else {
- src = as_vgpr(ctx, src);
+ unsigned bit_size = instr->src[0].ssa->bit_size;
+
+ src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
ReduceOp reduce_op;
switch (op) {
- #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
- CASE(iadd)
- CASE(imul)
- CASE(fadd)
- CASE(fmul)
- CASE(imin)
- CASE(umin)
- CASE(fmin)
- CASE(imax)
- CASE(umax)
- CASE(fmax)
- CASE(iand)
- CASE(ior)
- CASE(ixor)
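+ /* integer reductions go down to 8 bits, float reductions only to 16 */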
+ #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break;
+ #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break;
+ CASEI(iadd)
+ CASEI(imul)
+ CASEI(imin)
+ CASEI(umin)
+ CASEI(imax)
+ CASEI(umax)
+ CASEI(iand)
+ CASEI(ior)
+ CASEI(ixor)
+ CASEF(fadd)
+ CASEF(fmul)
+ CASEF(fmin)
+ CASEF(fmax)
default:
unreachable("unknown reduction op");
- #undef CASE
+ #undef CASEI
+ #undef CASEF
}
aco_opcode aco_op;
}
case nir_intrinsic_quad_broadcast: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+ if (!nir_dest_is_divergent(instr->dest)) {
emit_uniform_subgroup(ctx, instr, src);
} else {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))));
emit_wqm(ctx, tmp, dst);
+ } else if (instr->dest.ssa.bit_size == 8) {
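+ /* broadcast the containing dword, then split off the low byte */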
+ Temp tmp = bld.tmp(v1);
+ if (ctx->program->chip_class >= GFX8)
+ emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
+ else
+ emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
+ } else if (instr->dest.ssa.bit_size == 16) {
+ Temp tmp = bld.tmp(v1);
+ if (ctx->program->chip_class >= GFX8)
+ emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
+ else
+ emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
} else if (instr->dest.ssa.bit_size == 32) {
if (ctx->program->chip_class >= GFX8)
emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);
case nir_intrinsic_quad_swap_diagonal:
case nir_intrinsic_quad_swizzle_amd: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+ if (!nir_dest_is_divergent(instr->dest)) {
emit_uniform_subgroup(ctx, instr, src);
break;
}
src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
emit_wqm(ctx, tmp, dst);
+ } else if (instr->dest.ssa.bit_size == 8) {
+ Temp tmp = bld.tmp(v1);
+ if (ctx->program->chip_class >= GFX8)
+ emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
+ else
+ emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
+ } else if (instr->dest.ssa.bit_size == 16) {
+ Temp tmp = bld.tmp(v1);
+ if (ctx->program->chip_class >= GFX8)
+ emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
+ else
+ emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
} else if (instr->dest.ssa.bit_size == 32) {
Temp tmp;
if (ctx->program->chip_class >= GFX8)
}
case nir_intrinsic_masked_swizzle_amd: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+ if (!nir_dest_is_divergent(instr->dest)) {
emit_uniform_subgroup(ctx, instr, src);
break;
}
uint32_t mask = nir_intrinsic_swizzle_mask(instr);
if (dst.regClass() == v1) {
emit_wqm(ctx,
- bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
+ emit_masked_swizzle(ctx, bld, src, mask),
dst);
} else if (dst.regClass() == v2) {
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
- lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
- hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
+ lo = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, lo, mask));
+ hi = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, hi, mask));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
} else {
get_ssa_temp(ctx, &instr->dest.ssa));
break;
}
- case nir_intrinsic_shader_clock:
- bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
+ case nir_intrinsic_shader_clock: {
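+ /* s_memrealtime counts a constant-rate clock (which device scope needs),
+ * s_memtime the variable shader core clock */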
+ aco_opcode opcode =
+ nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
+ aco_opcode::s_memrealtime : aco_opcode::s_memtime;
+ bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
break;
+ }
case nir_intrinsic_load_vertex_id_zero_base: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
{
Builder bld(ctx->program, ctx->block);
bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
- has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
+ has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false,
+ has_clamped_lod = false;
Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
- lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp();
+ lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(),
+ clamped_lod = Temp();
std::vector<Temp> coords;
std::vector<Temp> derivs;
nir_const_value *sample_index_cv = NULL;
break;
}
case nir_tex_src_bias:
- if (instr->op == nir_texop_txb) {
- bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
- has_bias = true;
- }
+ bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
+ has_bias = true;
break;
case nir_tex_src_lod: {
nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
}
break;
}
+ case nir_tex_src_min_lod:
+ clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
+ has_clamped_lod = true;
+ break;
case nir_tex_src_comparator:
if (instr->is_shadow) {
compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
args.emplace_back(sample_index);
if (has_lod)
args.emplace_back(lod);
+ if (has_clamped_lod)
+ args.emplace_back(clamped_lod);
Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size()));
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
// TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
aco_opcode opcode = aco_opcode::image_sample;
if (has_offset) { /* image_sample_*_o */
- if (has_compare) {
+ if (has_clamped_lod) {
+ if (has_compare) {
+ opcode = aco_opcode::image_sample_c_cl_o;
+ if (has_derivs)
+ opcode = aco_opcode::image_sample_c_d_cl_o;
+ if (has_bias)
+ opcode = aco_opcode::image_sample_c_b_cl_o;
+ } else {
+ opcode = aco_opcode::image_sample_cl_o;
+ if (has_derivs)
+ opcode = aco_opcode::image_sample_d_cl_o;
+ if (has_bias)
+ opcode = aco_opcode::image_sample_b_cl_o;
+ }
+ } else if (has_compare) {
opcode = aco_opcode::image_sample_c_o;
if (has_derivs)
opcode = aco_opcode::image_sample_c_d_o;
if (has_lod)
opcode = aco_opcode::image_sample_l_o;
}
+ } else if (has_clamped_lod) { /* image_sample_*_cl */
+ if (has_compare) {
+ opcode = aco_opcode::image_sample_c_cl;
+ if (has_derivs)
+ opcode = aco_opcode::image_sample_c_d_cl;
+ if (has_bias)
+ opcode = aco_opcode::image_sample_c_b_cl;
+ } else {
+ opcode = aco_opcode::image_sample_cl;
+ if (has_derivs)
+ opcode = aco_opcode::image_sample_d_cl;
+ if (has_bias)
+ opcode = aco_opcode::image_sample_b_cl;
+ }
} else { /* no offset */
if (has_compare) {
opcode = aco_opcode::image_sample_c;
}
if (instr->op == nir_texop_tg4) {
- if (has_offset) {
- opcode = aco_opcode::image_gather4_lz_o;
- if (has_compare)
+ if (has_offset) { /* image_gather4_*_o */
+ if (has_compare) {
opcode = aco_opcode::image_gather4_c_lz_o;
+ if (has_lod)
+ opcode = aco_opcode::image_gather4_c_l_o;
+ if (has_bias)
+ opcode = aco_opcode::image_gather4_c_b_o;
+ } else {
+ opcode = aco_opcode::image_gather4_lz_o;
+ if (has_lod)
+ opcode = aco_opcode::image_gather4_l_o;
+ if (has_bias)
+ opcode = aco_opcode::image_gather4_b_o;
+ }
} else {
- opcode = aco_opcode::image_gather4_lz;
- if (has_compare)
+ if (has_compare) {
opcode = aco_opcode::image_gather4_c_lz;
+ if (has_lod)
+ opcode = aco_opcode::image_gather4_c_l;
+ if (has_bias)
+ opcode = aco_opcode::image_gather4_c_b;
+ } else {
+ opcode = aco_opcode::image_gather4_lz;
+ if (has_lod)
+ opcode = aco_opcode::image_gather4_l;
+ if (has_bias)
+ opcode = aco_opcode::image_gather4_b;
+ }
}
} else if (instr->op == nir_texop_lod) {
opcode = aco_opcode::image_get_lod;
}
-Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
+Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical)
{
Temp tmp = get_ssa_temp(ctx, ssa);
- if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
- return Operand(tmp.regClass());
- else
+ if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
+ return Operand(rc);
+ } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
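+ /* logical booleans live in lane masks, so constants become all-ones or all-zeros masks */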
+ if (ctx->program->wave_size == 64)
+ return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u);
+ else
+ return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX : 0u);
+ } else {
return Operand(tmp);
+ }
}
void visit_phi(isel_context *ctx, nir_phi_instr *instr)
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
- bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index];
+ bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
logical |= ctx->block->kind & block_kind_merge;
aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
continue;
cur_pred_idx++;
- Operand op = get_phi_operand(ctx, src.second);
+ Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
operands[num_operands++] = op;
num_defined += !op.isUndefined();
}
aco_ptr<Pseudo_branch_instruction> branch;
if_context ic;
- if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
+ if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
/**
* Uniform conditionals are represented in the following way*) :
*
bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
+ bool is_16bit = values[0].regClass() == v2b;
switch (col_format)
{
case V_028714_SPI_SHADER_FP16_ABGR:
enabled_channels = 0x5;
compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
+ if (is_16bit) {
+ if (ctx->options->chip_class >= GFX9) {
+ /* Pack the FP16 values together instead of converting them to
+ * FP32 and back to FP16.
+ * TODO: use p_create_vector and let the compiler optimize.
+ */
+ compr_op = aco_opcode::v_pack_b32_f16;
+ } else {
+ for (unsigned i = 0; i < 4; i++) {
+ if ((write_mask >> i) & 1)
+ values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]);
+ }
+ }
+ }
break;
case V_028714_SPI_SHADER_UNORM16_ABGR:
enabled_channels = 0x5;
- compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
+ if (is_16bit && ctx->options->chip_class >= GFX9) {
+ compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
+ } else {
+ compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
+ }
break;
case V_028714_SPI_SHADER_SNORM16_ABGR:
enabled_channels = 0x5;
- compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
+ if (is_16bit && ctx->options->chip_class >= GFX9) {
+ compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
+ } else {
+ compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
+ }
break;
case V_028714_SPI_SHADER_UINT16_ABGR: {
values[i]);
}
}
+ } else if (is_16bit) {
+ for (unsigned i = 0; i < 4; i++) {
+ if ((write_mask >> i) & 1) {
+ Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
+ values[i] = Operand(tmp);
+ }
+ }
}
break;
}
values[i]);
}
}
+ } else if (is_16bit) {
+ for (unsigned i = 0; i < 4; i++) {
+ if ((write_mask >> i) & 1) {
+ Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
+ values[i] = Operand(tmp);
+ }
+ }
}
break;
if (target == V_008DFC_SQ_EXP_NULL)
return false;
+ /* If requested, replace 32-bit NaN outputs with zero to work around game bugs. */
+ if (ctx->options->enable_mrt_output_nan_fixup &&
+ !is_16bit &&
+ (col_format == V_028714_SPI_SHADER_32_R ||
+ col_format == V_028714_SPI_SHADER_32_GR ||
+ col_format == V_028714_SPI_SHADER_32_AR ||
+ col_format == V_028714_SPI_SHADER_32_ABGR ||
+ col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
+ for (int i = 0; i < 4; i++) {
+ if (!(write_mask & (1 << i)))
+ continue;
+
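+ /* v_cmp_class mask 3 (bits 0-1) selects signaling and quiet NaNs */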
+ Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32,
+ bld.hint_vcc(bld.def(bld.lm)), values[i],
+ bld.copy(bld.def(v1), Operand(3u)));
+ values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
+ bld.copy(bld.def(v1), Operand(0u)), isnan);
+ }
+ }
+
if ((bool) compr_op) {
for (int i = 0; i < 2; i++) {
/* check if at least one of the values to be compressed is enabled */
float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
- /* default to preserving fp16 and fp64 denorms, since it's free */
+ /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
+ * the precision seems needed for Wolfenstein: Youngblood to render correctly */
if (program->next_fp_mode.must_flush_denorms16_64)
program->next_fp_mode.denorm16_64 = 0;
else
if (ngg_no_gs && !ngg_early_prim_export(&ctx))
ngg_emit_nogs_output(&ctx);
- ralloc_free(ctx.divergent_vals);
-
if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
/* Outputs of the previous stage are inputs to the next stage */
ctx.inputs = ctx.outputs;