#include <algorithm>
#include <array>
+#include <stack>
#include <map>
#include "ac_shader_util.h"
ctx->cf_info.parent_if.is_divergent = divergent_if_old;
ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
- ctx->cf_info.exec_potentially_empty = false;
+ ctx->cf_info.exec_potentially_empty_discard = false;
}
};
Temp cond;
bool divergent_old;
- bool exec_potentially_empty_old;
+ bool exec_potentially_empty_discard_old;
+ bool exec_potentially_empty_break_old;
+ uint16_t exec_potentially_empty_break_depth_old;
unsigned BB_if_idx;
unsigned invert_idx;
+ bool uniform_has_then_branch;
bool then_branch_divergent;
Block BB_invert;
Block BB_endif;
};
-static void visit_cf_list(struct isel_context *ctx,
+static bool visit_cf_list(struct isel_context *ctx,
struct exec_list *list);
static void add_logical_edge(unsigned pred_idx, Block *succ)
assert(idx == 0);
return src;
}
- assert(src.size() > idx);
+
+ assert(src.bytes() > (idx * dst_rc.bytes()));
Builder bld(ctx->program, ctx->block);
auto it = ctx->allocated_vec.find(src.id());
- /* the size check needs to be early because elements other than 0 may be garbage */
- if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
+ if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
if (it->second[idx].regClass() == dst_rc) {
return it->second[idx];
} else {
- assert(dst_rc.size() == it->second[idx].regClass().size());
+ assert(!dst_rc.is_subdword());
assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
return bld.copy(bld.def(dst_rc), it->second[idx]);
}
}
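+ /* sub-dword register classes exist only for VGPRs, so move the source
+ * to a VGPR before attempting a sub-dword extract (an assumption spelled
+ * out here for clarity) */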
- if (src.size() == dst_rc.size()) {
+ if (dst_rc.is_subdword())
+ src = as_vgpr(ctx, src);
+
+ if (src.bytes() == dst_rc.bytes()) {
assert(idx == 0);
return bld.copy(bld.def(dst_rc), src);
} else {
return;
if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
return;
+ RegClass rc;
+ if (num_components > vec_src.size()) {
+ if (vec_src.type() == RegType::sgpr) {
+ /* should still help get_alu_src() */
+ emit_split_vector(ctx, vec_src, vec_src.size());
+ return;
+ }
+ /* sub-dword split */
+ rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
+ } else {
+ rc = RegClass(vec_src.type(), vec_src.size() / num_components);
+ }
aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
split->operands[0] = Operand(vec_src);
std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
for (unsigned i = 0; i < num_components; i++) {
- elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
+ elems[i] = {ctx->program->allocateId(), rc};
split->definitions[i] = Definition(elems[i]);
}
ctx->block->instructions.emplace_back(std::move(split));
ctx->allocated_vec.emplace(dst.id(), elems);
}
+/* adjust misaligned small bit size loads */
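+/* e.g. for a 16-bit value loaded at byte offset 2 within a dword, this
+ * shifts the loaded dword(s) right by 16 bits so the value begins at bit 0
+ * of dst */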
+void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
+{
+ Builder bld(ctx->program, ctx->block);
+ Operand shift;
+ Temp select = Temp();
+ if (offset.isConstant()) {
+ assert(offset.constantValue() && offset.constantValue() < 4);
+ shift = Operand(offset.constantValue() * 8);
+ } else {
+ /* bit_offset = 8 * (offset & 0x3) */
+ Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
+ select = bld.tmp(s1);
+ shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
+ }
+
+ if (vec.size() == 1) {
+ bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
+ } else if (vec.size() == 2) {
+ Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
+ bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
+ if (tmp == dst)
+ emit_split_vector(ctx, dst, 2);
+ else
+ emit_extract_vector(ctx, tmp, 0, dst);
+ } else if (vec.size() == 4) {
+ Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
+ hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
+ if (select != Temp())
+ hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
+ lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
+ Temp mid = bld.tmp(s1);
+ lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
+ hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
+ mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
+ emit_split_vector(ctx, dst, 2);
+ }
+}
+
+/* This function trims subdword vectors:
+ * if dst is a VGPR, split the src and create a shrunk version according to the mask;
+ * if dst is an SGPR, split the src, but move the original to the SGPR. */
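+/* e.g. with num_components=3 (16-bit components) and mask=0b011, components
+ * 0 and 1 are kept and a VGPR dst becomes the shrunk 4-byte vector */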
+void trim_subdword_vector(isel_context *ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
+{
+ assert(vec_src.type() == RegType::vgpr);
+ emit_split_vector(ctx, vec_src, num_components);
+
+ Builder bld(ctx->program, ctx->block);
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
+ unsigned component_size = vec_src.bytes() / num_components;
+ RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
+
+ unsigned k = 0;
+ for (unsigned i = 0; i < num_components; i++) {
+ if (mask & (1 << i))
+ elems[k++] = emit_extract_vector(ctx, vec_src, i, rc);
+ }
+
+ if (dst.type() == RegType::vgpr) {
+ assert(dst.bytes() == k * component_size);
+ aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, k, 1)};
+ for (unsigned i = 0; i < k; i++)
+ vec->operands[i] = Operand(elems[i]);
+ vec->definitions[0] = Definition(dst);
+ bld.insert(std::move(vec));
+ } else {
+ // TODO: alignbyte if mask doesn't start with 1?
+ assert(mask & 1);
+ assert(dst.size() == vec_src.size());
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
+ }
+ ctx->allocated_vec.emplace(dst.id(), elems);
+}
+
Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
{
Builder bld(ctx->program, ctx->block);
assert(val.regClass() == s1);
assert(dst.regClass() == bld.lm);
- return bld.sop2(Builder::s_cselect, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
+ return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
}
Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
}
Temp vec = get_ssa_temp(ctx, src.src.ssa);
- unsigned elem_size = vec.size() / src.src.ssa->num_components;
- assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
- assert(vec.size() % elem_size == 0);
+ unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
+ assert(elem_size > 0);
+ assert(vec.bytes() % elem_size == 0);
+
+ if (elem_size < 4 && vec.type() == RegType::sgpr) {
+ assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
+ assert(size == 1);
+ unsigned swizzle = src.swizzle[0];
+ if (vec.size() > 1) {
+ assert(src.src.ssa->bit_size == 16);
+ vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
+ swizzle = swizzle & 1;
+ }
+ if (swizzle == 0)
+ return vec;
+
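+ /* s_bfe_u32's second operand encodes {width[22:16], offset[5:0]}; e.g. an
+ * 8-bit source with swizzle 2 yields (8 << 16) | 16 and extracts bits [23:16] */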
+ Temp dst{ctx->program->allocateId(), s1};
+ aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
+ bfe->operands[0] = Operand(vec);
+ bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
+ bfe->definitions[0] = Definition(dst);
+ bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
+ ctx->block->instructions.emplace_back(std::move(bfe));
+ return dst;
+ }
- RegClass elem_rc = RegClass(vec.type(), elem_size);
+ RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
if (size == 1) {
return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
} else {
elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
vec_instr->operands[i] = Operand{elems[i]};
}
- Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
+ Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
vec_instr->definitions[0] = Definition(dst);
ctx->block->instructions.emplace_back(std::move(vec_instr));
ctx->allocated_vec.emplace(dst.id(), elems);
Temp t = src0;
src0 = src1;
src1 = t;
- } else if (src0.type() == RegType::vgpr &&
- op != aco_opcode::v_madmk_f32 &&
- op != aco_opcode::v_madak_f32 &&
- op != aco_opcode::v_madmk_f16 &&
- op != aco_opcode::v_madak_f16) {
- /* If the instruction is not commutative, we emit a VOP3A instruction */
- bld.vop2_e64(op, Definition(dst), src0, src1);
- return;
} else {
- src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
+ src1 = as_vgpr(ctx, src1);
}
}
if (src0.type() == RegType::vgpr) {
/* to swap the operands, we might also have to change the opcode */
switch (op) {
+ case aco_opcode::v_cmp_lt_f16:
+ op = aco_opcode::v_cmp_gt_f16;
+ break;
+ case aco_opcode::v_cmp_ge_f16:
+ op = aco_opcode::v_cmp_le_f16;
+ break;
+ case aco_opcode::v_cmp_lt_i16:
+ op = aco_opcode::v_cmp_gt_i16;
+ break;
+ case aco_opcode::v_cmp_ge_i16:
+ op = aco_opcode::v_cmp_le_i16;
+ break;
+ case aco_opcode::v_cmp_lt_u16:
+ op = aco_opcode::v_cmp_gt_u16;
+ break;
+ case aco_opcode::v_cmp_ge_u16:
+ op = aco_opcode::v_cmp_le_u16;
+ break;
case aco_opcode::v_cmp_lt_f32:
op = aco_opcode::v_cmp_gt_f32;
break;
}
void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
- aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
+ aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
{
- aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op;
- aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op;
+ aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
+ aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
bool use_valu = s_op == aco_opcode::num_opcodes ||
divergent_vals ||
emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
}
+Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
+{
+ if (ctx->options->chip_class >= GFX7)
+ return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
+
+ /* GFX6 doesn't support V_TRUNC_F64, lower it. */
+ /* TODO: create more efficient code! */
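+ /* The idea: clear all mantissa bits below the unbiased exponent, i.e. the
+ * bits that represent the fraction. E.g. val = 5.75 has exponent 2, so
+ * fract_mask >> 2 covers the fractional bits and val & ~mask gives 5.0.
+ * exponent < 0 yields +-0.0, and exponent > 51 means val is already
+ * integral and is passed through. */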
+ if (val.type() == RegType::sgpr)
+ val = as_vgpr(ctx, val);
+
+ /* Split the input value. */
+ Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
+
+ /* Extract the exponent and compute the unbiased value. */
+ Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
+ exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
+
+ /* Extract the fractional part. */
+ Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
+ fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
+
+ Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);
+
+ Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
+ Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
+ fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
+ tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
+ fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
+
+ /* Get the sign bit. */
+ Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);
+
+ /* Decide the operation to apply depending on the unbiased exponent. */
+ Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
+ Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
+ Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
+ Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
+ dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
+ dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
+
+ return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
+}
+
+Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
+{
+ if (ctx->options->chip_class >= GFX7)
+ return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
+
+ /* GFX6 doesn't support V_FLOOR_F64, lower it. */
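+ /* floor(x) = x - min(fract(x), 1.0 - ulp): clamping fract(x) to just below
+ * 1.0 avoids overshooting for inputs barely below an integer, and NaN
+ * inputs take the isnan path so the final subtraction yields NaN again
+ * (v_cmp_class mask 3 tests for signaling and quiet NaN) */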
+ Temp src0 = as_vgpr(ctx, val);
+
+ Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
+ Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));
+
+ Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
+ Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
+ Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
+
+ Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
+ Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
+
+ Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
+ Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
+
+ Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
+
+ Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
+ static_cast<VOP3A_instruction*>(add)->neg[1] = true;
+
+ return add->definitions[0].getTemp();
+}
+
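+/* Convert an integer between bit widths, sign- or zero-extending as needed;
+ * e.g. convert_int(bld, b, 8, 64, true) sign-extends an 8-bit value to 64
+ * bits. If dst is omitted, a temporary of a suitable class is allocated. */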
+Temp convert_int(Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
+ if (!dst.id()) {
+ if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
+ dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
+ else
+ dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
+ }
+
+ if (dst.bytes() == src.bytes() && dst_bits < src_bits)
+ return bld.copy(Definition(dst), src);
+ else if (dst.bytes() < src.bytes())
+ return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
+
+ Temp tmp = dst;
+ if (dst_bits == 64)
+ tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
+
+ if (tmp == src) {
+ } else if (src.regClass() == s1) {
+ if (is_signed)
+ bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
+ else
+ bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
+ } else {
+ assert(src_bits != 8 || src.regClass() == v1b);
+ assert(src_bits != 16 || src.regClass() == v2b);
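+ /* for sub-dword VGPR sources, a single SDWA v_mov_b32 selects the low
+ * byte/word and sign- or zero-extends it in one instruction */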
+ aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
+ sdwa->operands[0] = Operand(src);
+ sdwa->definitions[0] = Definition(tmp);
+ if (is_signed)
+ sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
+ else
+ sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
+ sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
+ bld.insert(std::move(sdwa));
+ }
+
+ if (dst_bits == 64) {
+ if (is_signed && dst.regClass() == s2) {
+ Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
+ } else if (is_signed && dst.regClass() == v2) {
+ Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
+ } else {
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
+ }
+ }
+
+ return dst;
+}
+
void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
{
if (!instr->dest.dest.is_ssa) {
case nir_op_vec3:
case nir_op_vec4: {
std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
- for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
+ unsigned num = instr->dest.dest.ssa.num_components;
+ for (unsigned i = 0; i < num; ++i)
elems[i] = get_alu_src(ctx, instr->src[i]);
- vec->operands[i] = Operand{elems[i]};
+
+ if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
+ aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
+ RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
+ for (unsigned i = 0; i < num; ++i) {
+ if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
+ vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
+ else
+ vec->operands[i] = Operand{elems[i]};
+ }
+ vec->definitions[0] = Definition(dst);
+ ctx->block->instructions.emplace_back(std::move(vec));
+ ctx->allocated_vec.emplace(dst.id(), elems);
+ } else {
+ // TODO: that is a bit suboptimal...
+ Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));
+ for (unsigned i = 0; i < num - 1; ++i)
+ if (((i+1) * instr->dest.dest.ssa.bit_size) % 32)
+ elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
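+ /* pack the masked elements at bit offsets i * bit_size into successive
+ * dwords, e.g. four 8-bit sources land at bits 0/8/16/24 of elems[0] */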
+ for (unsigned i = 0; i < num; ++i) {
+ unsigned bit = i * instr->dest.dest.ssa.bit_size;
+ if (bit % 32 == 0) {
+ elems[bit / 32] = elems[i];
+ } else {
+ elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
+ elems[i], Operand(bit % 32u));
+ elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]);
+ }
+ }
+ if (dst.size() == 1)
+ bld.copy(Definition(dst), elems[0]);
+ else
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]);
}
- vec->definitions[0] = Definition(dst);
- ctx->block->instructions.emplace_back(std::move(vec));
- ctx->allocated_vec.emplace(dst.id(), elems);
break;
}
case nir_op_mov: {
if (instr->dest.dest.ssa.bit_size == 1) {
assert(src.regClass() == bld.lm);
assert(dst.regClass() == bld.lm);
- bld.sop2(Builder::s_andn2, Definition(dst), bld.def(s1, scc), Operand(exec, bld.lm), src);
+ /* Don't use s_andn2 here: keeping the s_not visible allows the optimizer to make a better decision */
+ Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
+ bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
} else if (dst.type() == RegType::sgpr) {
case nir_op_isign: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == s1) {
- Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
- Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
- bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
+ Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1));
+ bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u));
} else if (dst.regClass() == s2) {
Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
Temp neqz;
/* SCC gets zero-extended to 64 bit */
bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
} else if (dst.regClass() == v1) {
- Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
- Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
- bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
+ bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u));
} else if (dst.regClass() == v2) {
Temp upper = emit_extract_vector(ctx, src, 1, v1);
Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
Temp msb = sub.def(0).getTemp();
Temp carry = sub.def(1).getTemp();
- bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry);
+ bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
} else if (src.regClass() == v1) {
aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
Temp msb_rev = bld.tmp(v1);
break;
}
case nir_op_fmul: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
+ if (dst.regClass() == v2b) {
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
+ } else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
- } else if (dst.size() == 2) {
- bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_fadd: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
+ if (dst.regClass() == v2b) {
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
+ } else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
- } else if (dst.size() == 2) {
- bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
case nir_op_fsub: {
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = get_alu_src(ctx, instr->src[1]);
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
+ else
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
+ } else if (dst.regClass() == v1) {
if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
else
emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ as_vgpr(ctx, src0), as_vgpr(ctx, src1));
VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
sub->neg[1] = true;
} else {
break;
}
case nir_op_fmax: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
+ if (dst.regClass() == v2b) {
+ // TODO: check fp_mode.must_flush_denorms16_64
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
+ } else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
- Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
} else {
- bld.vop3(aco_opcode::v_max_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
}
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_fmin: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
+ if (dst.regClass() == v2b) {
+ // TODO: check fp_mode.must_flush_denorms16_64
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
+ } else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
- Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
} else {
- bld.vop3(aco_opcode::v_min_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
}
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_fmax3: {
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false);
+ } else if (dst.regClass() == v1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_fmin3: {
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false);
+ } else if (dst.regClass() == v1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_fmed3: {
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false);
+ } else if (dst.regClass() == v1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_frsq: {
- if (dst.size() == 1) {
- emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
- } else if (dst.size() == 2) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
+ } else if (dst.regClass() == v1) {
+ emit_rsq(ctx, bld, Definition(dst), src);
+ } else if (dst.regClass() == v2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
}
case nir_op_fneg: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
+ } else if (dst.regClass() == v1) {
if (ctx->block->fp_mode.must_flush_denorms32)
src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
if (ctx->block->fp_mode.must_flush_denorms16_64)
src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
}
case nir_op_fabs: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
+ } else if (dst.regClass() == v1) {
if (ctx->block->fp_mode.must_flush_denorms32)
src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
if (ctx->block->fp_mode.must_flush_denorms16_64)
src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
}
case nir_op_fsat: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand(0u), Operand(0x3c00u), src); /* 0x3c00 = 1.0 in fp16 */
+ } else if (dst.regClass() == v1) {
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
/* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
// TODO: confirm that this holds under any circumstances
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
vop3->clamp = true;
break;
}
case nir_op_flog2: {
- if (dst.size() == 1) {
- emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
+ } else if (dst.regClass() == v1) {
+ emit_log2(ctx, bld, Definition(dst), src);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_frcp: {
- if (dst.size() == 1) {
- emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
- } else if (dst.size() == 2) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
+ } else if (dst.regClass() == v1) {
+ emit_rcp(ctx, bld, Definition(dst), src);
+ } else if (dst.regClass() == v2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_fexp2: {
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_fsqrt: {
- if (dst.size() == 1) {
- emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
- } else if (dst.size() == 2) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
+ } else if (dst.regClass() == v1) {
+ emit_sqrt(ctx, bld, Definition(dst), src);
+ } else if (dst.regClass() == v2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_ffract: {
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_ffloor: {
- if (dst.size() == 1) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
- } else if (dst.size() == 2) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
+ } else if (dst.regClass() == v2) {
+ emit_floor_f64(ctx, bld, Definition(dst), src);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_fceil: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
- } else if (dst.size() == 2) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
+ } else if (dst.regClass() == v2) {
+ if (ctx->options->chip_class >= GFX7) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
+ } else {
+ /* GFX6 doesn't support V_CEIL_F64, lower it. */
+ /* trunc = trunc(src0)
+ * if (src0 > 0.0 && src0 != trunc)
+ * trunc += 1.0
+ */
+ Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
+ Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
+ Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
+ Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
+ Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
+ add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
+ bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
+ }
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_ftrunc: {
- if (dst.size() == 1) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
- } else if (dst.size() == 2) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
+ } else if (dst.regClass() == v2) {
+ emit_trunc_f64(ctx, bld, Definition(dst), src);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_fround_even: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
- } else if (dst.size() == 2) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
+ } else if (dst.regClass() == v2) {
+ if (ctx->options->chip_class >= GFX7) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
+ } else {
+ /* GFX6 doesn't support V_RNDNE_F64, lower it. */
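+ /* Use the 2^52 trick: adding and then subtracting 0x1.0p+52 (with src0's
+ * sign copied in via v_bfi) makes the FPU round at integer granularity;
+ * values with |src0| >= 2^52 are already integral and are selected
+ * through unchanged. */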
+ Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
+
+ Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
+ Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
+ Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
+ Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
+ static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
+ tmp = sub->definitions[0].getTemp();
+
+ Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
+ Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
+ static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
+ Temp cond = vop3->definitions[0].getTemp();
+
+ Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
+ Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
+ Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);
+
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
+ }
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fsin:
case nir_op_fcos: {
- Temp src = get_alu_src(ctx, instr->src[0]);
+ Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
aco_ptr<Instruction> norm;
- if (dst.size() == 1) {
- Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
- Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, as_vgpr(ctx, src));
+ if (dst.regClass() == v2b) {
+ /* 0x3118 is 1/(2*PI) in fp16 */
+ Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
+ Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
+ aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
+ bld.vop1(opcode, Definition(dst), tmp);
+ } else if (dst.regClass() == v1) {
+ Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
+ Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
/* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
if (ctx->options->chip_class < GFX9)
break;
}
case nir_op_ldexp: {
- if (dst.size() == 1) {
- bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
- get_alu_src(ctx, instr->src[1]));
- } else if (dst.size() == 2) {
- bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
- get_alu_src(ctx, instr->src[1]));
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = get_alu_src(ctx, instr->src[1]);
+ if (dst.regClass() == v2b) {
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
+ } else if (dst.regClass() == v1) {
+ bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_frexp_sig: {
- if (dst.size() == 1) {
- bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
- get_alu_src(ctx, instr->src[0]));
- } else if (dst.size() == 2) {
- bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]));
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
+ } else if (dst.regClass() == v1) {
+ bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
+ } else if (dst.regClass() == v2) {
+ bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_frexp_exp: {
- if (instr->src[0].src.ssa->bit_size == 32) {
- bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
- get_alu_src(ctx, instr->src[0]));
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 16) {
+ Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
+ tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
+ convert_int(bld, tmp, 8, 32, true, dst);
+ } else if (instr->src[0].src.ssa->bit_size == 32) {
+ bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
} else if (instr->src[0].src.ssa->bit_size == 64) {
- bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]));
+ bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fsign: {
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
+ Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u));
+ Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
+ src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
+ cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
+ bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
+ } else if (dst.regClass() == v1) {
Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
}
break;
}
+ case nir_op_f2f16:
+ case nir_op_f2f16_rtne: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 64)
+ src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
+ bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
+ break;
+ }
+ case nir_op_f2f16_rtz: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 64)
+ src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
+ bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
+ break;
+ }
case nir_op_f2f32: {
- if (instr->src[0].src.ssa->bit_size == 64) {
+ if (instr->src[0].src.ssa->bit_size == 16) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
+ } else if (instr->src[0].src.ssa->bit_size == 64) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_f2f64: {
- if (instr->src[0].src.ssa->bit_size == 32) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
- } else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
- }
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 16)
+ src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
+ bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
+ break;
+ }
+ case nir_op_i2f16: {
+ assert(dst.regClass() == v2b);
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 8)
+ src = convert_int(bld, src, 8, 16, true);
+ bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
break;
}
case nir_op_i2f32: {
assert(dst.size() == 1);
- emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size <= 16)
+ src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true);
+ bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
break;
}
case nir_op_i2f64: {
- if (instr->src[0].src.ssa->bit_size == 32) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
+ if (instr->src[0].src.ssa->bit_size <= 32) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size <= 16)
+ src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true);
+ bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
} else if (instr->src[0].src.ssa->bit_size == 64) {
Temp src = get_alu_src(ctx, instr->src[0]);
RegClass rc = RegClass(src.type(), 1);
}
break;
}
+ case nir_op_u2f16: {
+ assert(dst.regClass() == v2b);
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 8)
+ src = convert_int(bld, src, 8, 16, false);
+ bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
+ break;
+ }
case nir_op_u2f32: {
assert(dst.size() == 1);
- emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 8) {
+ // TODO: we should use v_cvt_f32_ubyte1/v_cvt_f32_ubyte2/etc depending on the register assignment
+ bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
+ } else {
+ if (instr->src[0].src.ssa->bit_size == 16)
+ src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, false); /* zero-extend: the source is unsigned */
+ bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
+ }
break;
}
case nir_op_u2f64: {
- if (instr->src[0].src.ssa->bit_size == 32) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
+ if (instr->src[0].src.ssa->bit_size <= 32) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size <= 16)
+ src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, false);
+ bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
} else if (instr->src[0].src.ssa->bit_size == 64) {
Temp src = get_alu_src(ctx, instr->src[0]);
RegClass rc = RegClass(src.type(), 1);
}
break;
}
+ case nir_op_f2i8:
+ case nir_op_f2i16: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 16)
+ src = bld.vop1(aco_opcode::v_cvt_i16_f16, bld.def(v1), src);
+ else if (instr->src[0].src.ssa->bit_size == 32)
+ src = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src);
+ else
+ src = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src);
+
+ if (dst.type() == RegType::vgpr)
+ bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
+ else
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+ break;
+ }
+ case nir_op_f2u8:
+ case nir_op_f2u16: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 16)
+ src = bld.vop1(aco_opcode::v_cvt_u16_f16, bld.def(v1), src);
+ else if (instr->src[0].src.ssa->bit_size == 32)
+ src = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src);
+ else
+ src = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src);
+
+ if (dst.type() == RegType::vgpr)
+ bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
+ else
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+ break;
+ }
case nir_op_f2i32: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (instr->src[0].src.ssa->bit_size == 32) {
+ if (instr->src[0].src.ssa->bit_size == 16) {
+ Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
+ if (dst.type() == RegType::vgpr) {
+ bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
+ } else {
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+ bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
+ }
+ } else if (instr->src[0].src.ssa->bit_size == 32) {
if (dst.type() == RegType::vgpr)
bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
else
}
case nir_op_f2u32: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (instr->src[0].src.ssa->bit_size == 32) {
+ if (instr->src[0].src.ssa->bit_size == 16) {
+ Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
+ if (dst.type() == RegType::vgpr) {
+ bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
+ } else {
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+ bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
+ }
+ } else if (instr->src[0].src.ssa->bit_size == 32) {
if (dst.type() == RegType::vgpr)
bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
else
}
case nir_op_f2i64: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
+ if (instr->src[0].src.ssa->bit_size == 16)
+ src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
+
+ if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
- } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
+ } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
if (src.type() == RegType::vgpr)
src = bld.as_uniform(src);
Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
- exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
- exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
- exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
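+ /* the unbiased exponent can be negative, so the sub/clamp must be signed */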
+ exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
+ exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
+ exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
} else if (instr->src[0].src.ssa->bit_size == 64) {
Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
- Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
+ Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
- Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
+ Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
}
case nir_op_f2u64: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
+ if (instr->src[0].src.ssa->bit_size == 16)
+ src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
+
+ if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
- } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
+ } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
if (src.type() == RegType::vgpr)
src = bld.as_uniform(src);
Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
- exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
- exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
+ exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
+ exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
} else if (instr->src[0].src.ssa->bit_size == 64) {
Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
- Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
+ Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
- Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
+ Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
}
break;
}
- case nir_op_b2f32: {
+ case nir_op_b2f16: {
Temp src = get_alu_src(ctx, instr->src[0]);
assert(src.regClass() == bld.lm);
if (dst.regClass() == s1) {
src = bool_to_scalar_condition(ctx, src);
- bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
+ bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src);
+ } else if (dst.regClass() == v2b) {
+ Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
+ bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src);
+ } else {
+ unreachable("Wrong destination register class for nir_op_b2f16.");
+ }
+ break;
+ }
+ case nir_op_b2f32: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ assert(src.regClass() == bld.lm);
+
+ if (dst.regClass() == s1) {
+ src = bool_to_scalar_condition(ctx, src);
+ bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
} else if (dst.regClass() == v1) {
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
} else {
}
break;
}
- case nir_op_i2i32: {
- Temp src = get_alu_src(ctx, instr->src[0]);
- if (instr->src[0].src.ssa->bit_size == 64) {
- /* we can actually just say dst = src, as it would map the lower register */
- emit_extract_vector(ctx, src, 0, dst);
- } else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
- }
- break;
- }
- case nir_op_u2u32: {
- Temp src = get_alu_src(ctx, instr->src[0]);
- if (instr->src[0].src.ssa->bit_size == 16) {
- if (dst.regClass() == s1) {
- bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
- } else {
- // TODO: do better with SDWA
- bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
- }
- } else if (instr->src[0].src.ssa->bit_size == 64) {
- /* we can actually just say dst = src, as it would map the lower register */
- emit_extract_vector(ctx, src, 0, dst);
- } else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
- }
- break;
- }
+ case nir_op_i2i8:
+ case nir_op_i2i16:
+ case nir_op_i2i32:
case nir_op_i2i64: {
- Temp src = get_alu_src(ctx, instr->src[0]);
- if (src.regClass() == s1) {
- Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
- } else if (src.regClass() == v1) {
- Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
- } else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
- }
+ convert_int(bld, get_alu_src(ctx, instr->src[0]),
+ instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst);
break;
}
+ case nir_op_u2u8:
+ case nir_op_u2u16:
+ case nir_op_u2u32:
case nir_op_u2u64: {
- Temp src = get_alu_src(ctx, instr->src[0]);
- if (instr->src[0].src.ssa->bit_size == 32) {
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
- } else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
- }
+ convert_int(bld, get_alu_src(ctx, instr->src[0]),
+ instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst);
break;
}
+ case nir_op_b2b32:
case nir_op_b2i32: {
Temp src = get_alu_src(ctx, instr->src[0]);
assert(src.regClass() == bld.lm);
}
break;
}
+ case nir_op_b2b1:
case nir_op_i2b1: {
Temp src = get_alu_src(ctx, instr->src[0]);
assert(dst.regClass() == bld.lm);
case nir_op_unpack_64_2x32_split_y:
bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
break;
+ case nir_op_unpack_32_2x16_split_x:
+ if (dst.type() == RegType::vgpr) {
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
+ } else {
+ bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
+ }
+ break;
+ case nir_op_unpack_32_2x16_split_y:
+ if (dst.type() == RegType::vgpr) {
+ bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
+ } else {
+ bld.sop2(aco_opcode::s_bfe_u32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]), Operand(uint32_t(16 << 16 | 16)));
+ }
+ break;
+ case nir_op_pack_32_2x16_split: {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = get_alu_src(ctx, instr->src[1]);
+ if (dst.regClass() == v1) {
+ src0 = emit_extract_vector(ctx, src0, 0, v2b);
+ src1 = emit_extract_vector(ctx, src1, 0, v2b);
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
+ } else {
+ src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu));
+ src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, Operand(16u));
+ bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
+ }
+ break;
+ }
case nir_op_pack_half_2x16: {
Temp src = get_alu_src(ctx, instr->src[0], 2);
break;
}
case nir_op_flt: {
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
break;
}
case nir_op_fge: {
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
break;
}
case nir_op_feq: {
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
break;
}
case nir_op_fne: {
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
break;
}
case nir_op_ilt: {
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
break;
}
case nir_op_ige: {
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
break;
}
case nir_op_ieq: {
if (instr->src[0].src.ssa->bit_size == 1)
emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
else
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
break;
}
case nir_op_ine: {
if (instr->src[0].src.ssa->bit_size == 1)
emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
else
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
break;
}
case nir_op_ult: {
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
break;
}
case nir_op_uge: {
- emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
+ emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
break;
}
case nir_op_fddx:
int val = instr->value[0].b ? -1 : 0;
Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
bld.sop1(Builder::s_mov, Definition(dst), op);
+ } else if (instr->def.bit_size == 8) {
+ /* ensure that the value is correctly represented in the low byte of the register */
+ bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u8);
+ } else if (instr->def.bit_size == 16) {
+ /* ensure that the value is correctly represented in the low half of the register */
+ bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u16);
} else if (dst.size() == 1) {
bld.copy(Definition(dst), Operand(instr->value[0].u32));
} else {
return new_mask;
}
-void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
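+/* vector (VGPR) counterpart of byte_align_scalar: shift the loaded bytes right by a 0-3 byte offset */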
+void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst)
{
- /* This wouldn't work inside control flow or with indirect offsets but
- * that doesn't happen because of nir_lower_io_to_temporaries(). */
-
- unsigned write_mask = nir_intrinsic_write_mask(instr);
- unsigned component = nir_intrinsic_component(instr);
- Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- unsigned idx = nir_intrinsic_base(instr) + component;
+ Builder bld(ctx->program, ctx->block);
+ if (offset.isTemp()) {
+ Temp tmp[3] = {vec, vec, vec};
- nir_instr *off_instr = instr->src[1].ssa->parent_instr;
- if (off_instr->type != nir_instr_type_load_const) {
- fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
- nir_print_instr(off_instr, stderr);
- fprintf(stderr, "\n");
- }
- idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
+ if (vec.size() == 3) {
+ tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
+ } else if (vec.size() == 2) {
+ tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
+ bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
+ }
+ for (unsigned i = 0; i < dst.size(); i++)
+ tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
- if (instr->src[0].ssa->bit_size == 64)
- write_mask = widen_mask(write_mask, 2);
+ vec = tmp[0];
+ if (dst.size() == 2)
+ vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
- for (unsigned i = 0; i < 8; ++i) {
- if (write_mask & (1 << i)) {
- ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
- ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
- }
- idx++;
+ offset = Operand(0u);
}
-}
-void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
-{
- Builder bld(ctx->program, ctx->block);
- unsigned write_mask = nir_intrinsic_write_mask(instr);
- Operand values[4];
- Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- for (unsigned i = 0; i < 4; ++i) {
- if (write_mask & (1 << i)) {
- Temp tmp = emit_extract_vector(ctx, src, i, v1);
- values[i] = Operand(tmp);
- } else {
- values[i] = Operand(v1);
- }
- }
+ if (vec.bytes() == dst.bytes() && offset.constantValue() == 0)
+ bld.copy(Definition(dst), vec);
+ else
+ trim_subdword_vector(ctx, vec, dst, vec.bytes(), ((1 << dst.bytes()) - 1) << offset.constantValue());
+}
- unsigned index = nir_intrinsic_base(instr) / 4;
- unsigned target, col_format;
- unsigned enabled_channels = 0xF;
- aco_opcode compr_op = (aco_opcode)0;
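+/* Describes a load so the emit_load() template below can split it into
+ * correctly sized and aligned hardware operations. */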
+struct LoadEmitInfo {
+ Operand offset;
+ Temp dst;
+ unsigned num_components;
+ unsigned component_size;
+ Temp resource = Temp(0, s1);
+ unsigned component_stride = 0;
+ unsigned const_offset = 0;
+ unsigned align_mul = 0;
+ unsigned align_offset = 0;
+
+ bool glc = false;
+ unsigned swizzle_component_size = 0;
+ barrier_interaction barrier = barrier_none;
+ bool can_reorder = true;
+ Temp soffset = Temp(0, s1);
+};
- nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
- assert(offset && "Non-const offsets on exports not yet supported");
- index += offset->u32;
+using LoadCallback = Temp(*)(
+ Builder& bld, const LoadEmitInfo* info, Temp offset, unsigned bytes_needed,
+ unsigned align, unsigned const_offset, Temp dst_hint);
- assert(index != FRAG_RESULT_COLOR);
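+/* byte_align_loads: unaligned loads are rounded up and the result shifted right.
+ * supports_8bit_16bit_loads: the callback can emit 1- and 2-byte loads directly.
+ * max_const_offset_plus_one: constant offsets beyond this are folded into the address. */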
+template <LoadCallback callback, bool byte_align_loads, bool supports_8bit_16bit_loads, unsigned max_const_offset_plus_one>
+void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
+{
+ unsigned load_size = info->num_components * info->component_size;
+ unsigned component_size = info->component_size;
- /* Unlike vertex shader exports, it's fine to use multiple exports to
- * export separate channels of one target. So shaders which export both
- * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
- * TODO: combine the exports in those cases and create better code
- */
+ unsigned num_vals = 0;
+ Temp vals[info->dst.bytes()];
- if (index == FRAG_RESULT_SAMPLE_MASK) {
+ unsigned const_offset = info->const_offset;
- if (ctx->program->info->ps.writes_z) {
- target = V_008DFC_SQ_EXP_MRTZ;
- enabled_channels = 0x4;
- col_format = (unsigned) -1;
+ unsigned align_mul = info->align_mul ? info->align_mul : component_size;
+ unsigned align_offset = (info->align_offset + const_offset) % align_mul;
- values[2] = values[0];
- values[0] = Operand(v1);
- } else {
- bld.exp(aco_opcode::exp, Operand(v1), Operand(values[0]), Operand(v1), Operand(v1),
- 0xc, V_008DFC_SQ_EXP_MRTZ, true);
- return;
+ unsigned bytes_read = 0;
+ while (bytes_read < load_size) {
+ unsigned bytes_needed = load_size - bytes_read;
+
+ /* add buffer for unaligned loads */
+ int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
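+ /* a byte_align of -1 means the offset's dword alignment is only known at runtime */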
+
+ if (byte_align) {
+ if ((bytes_needed > 2 || !supports_8bit_16bit_loads) && byte_align_loads) {
+ if (info->component_stride) {
+ assert(supports_8bit_16bit_loads && "unimplemented");
+ bytes_needed = 2;
+ byte_align = 0;
+ } else {
+ bytes_needed += byte_align == -1 ? 4 - info->align_mul : byte_align;
+ bytes_needed = align(bytes_needed, 4);
+ }
+ } else {
+ byte_align = 0;
+ }
}
- } else if (index == FRAG_RESULT_DEPTH) {
+ if (info->swizzle_component_size)
+ bytes_needed = MIN2(bytes_needed, info->swizzle_component_size);
+ if (info->component_stride)
+ bytes_needed = MIN2(bytes_needed, info->component_size);
- target = V_008DFC_SQ_EXP_MRTZ;
- enabled_channels = 0x1;
- col_format = (unsigned) -1;
+ bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
- } else if (index == FRAG_RESULT_STENCIL) {
+ /* reduce constant offset */
+ Operand offset = info->offset;
+ unsigned reduced_const_offset = const_offset;
+ bool remove_const_offset_completely = need_to_align_offset;
+ if (const_offset && (remove_const_offset_completely || const_offset >= max_const_offset_plus_one)) {
+ unsigned to_add = const_offset;
+ if (remove_const_offset_completely) {
+ reduced_const_offset = 0;
+ } else {
+ to_add = const_offset / max_const_offset_plus_one * max_const_offset_plus_one;
+ reduced_const_offset %= max_const_offset_plus_one;
+ }
+ Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
+ if (offset.isConstant()) {
+ offset = Operand(offset.constantValue() + to_add);
+ } else if (offset_tmp.regClass() == s1) {
+ offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+ offset_tmp, Operand(to_add));
+ } else if (offset_tmp.regClass() == v1) {
+ offset = bld.vadd32(bld.def(v1), offset_tmp, Operand(to_add));
+ } else {
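+ /* 64-bit address: add to the low dword and carry into the high dword */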
+ Temp lo = bld.tmp(offset_tmp.type(), 1);
+ Temp hi = bld.tmp(offset_tmp.type(), 1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
+
+ if (offset_tmp.regClass() == s2) {
+ Temp carry = bld.tmp(s1);
+ lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, Operand(to_add));
+ hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
+ offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
+ } else {
+ Temp new_lo = bld.tmp(v1);
+ Temp carry = bld.vadd32(Definition(new_lo), lo, Operand(to_add), true).def(1).getTemp();
+ hi = bld.vadd32(bld.def(v1), hi, Operand(0u), false, carry);
+ offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
+ }
+ }
+ }
- if (ctx->program->info->ps.writes_z) {
- target = V_008DFC_SQ_EXP_MRTZ;
- enabled_channels = 0x2;
- col_format = (unsigned) -1;
+ /* align offset down if needed */
+ Operand aligned_offset = offset;
+ if (need_to_align_offset) {
+ Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
+ if (offset.isConstant()) {
+ aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
+ } else if (offset_tmp.regClass() == s1) {
+ aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfffffffcu), offset_tmp);
+ } else if (offset_tmp.regClass() == s2) {
+ aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand((uint64_t)0xfffffffffffffffcllu), offset_tmp);
+ } else if (offset_tmp.regClass() == v1) {
+ aligned_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), offset_tmp);
+ } else if (offset_tmp.regClass() == v2) {
+ Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
+ lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), lo);
+ aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
+ }
+ }
+ Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
+ bld.copy(bld.def(s1), aligned_offset);
+
+ unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
+ Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
+ reduced_const_offset, byte_align ? Temp() : info->dst);
+
+ /* shift result right if needed */
+ if (byte_align) {
+ Operand align((uint32_t)byte_align);
+ if (byte_align == -1) {
+ if (offset.isConstant())
+ align = Operand(offset.constantValue() % 4u);
+ else if (offset.size() == 2)
+ align = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, RegClass(offset.getTemp().type(), 1)));
+ else
+ align = offset;
+ }
- values[1] = values[0];
- values[0] = Operand(v1);
- } else {
- values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]);
- bld.exp(aco_opcode::exp, values[0], Operand(v1), Operand(v1), Operand(v1),
- 0x3, V_008DFC_SQ_EXP_MRTZ, true);
- return;
+ if (align.isTemp() || align.constantValue()) {
+ assert(val.bytes() >= load_size && "unimplemented");
+ Temp new_val = bld.tmp(RegClass::get(val.type(), load_size));
+ if (val.type() == RegType::sgpr)
+ byte_align_scalar(ctx, val, align, new_val);
+ else
+ byte_align_vector(ctx, val, align, new_val);
+ val = new_val;
+ }
}
- } else {
- index -= FRAG_RESULT_DATA0;
- target = V_008DFC_SQ_EXP_MRT + index;
- col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
+ /* add result to list and advance */
+ if (info->component_stride) {
+ assert(val.bytes() == info->component_size && "unimplemented");
+ const_offset += info->component_stride;
+ align_offset = (align_offset + info->component_stride) % align_mul;
+ } else {
+ const_offset += val.bytes();
+ align_offset = (align_offset + val.bytes()) % align_mul;
+ }
+ bytes_read += val.bytes();
+ vals[num_vals++] = val;
}
- bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
- bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
-
- switch (col_format)
- {
- case V_028714_SPI_SHADER_ZERO:
- enabled_channels = 0; /* writemask */
- target = V_008DFC_SQ_EXP_NULL;
- break;
-
- case V_028714_SPI_SHADER_32_R:
- enabled_channels = 1;
- break;
- case V_028714_SPI_SHADER_32_GR:
- enabled_channels = 0x3;
- break;
+ /* the callback wrote directly to dst */
+ if (vals[0] == info->dst) {
+ assert(num_vals == 1);
+ emit_split_vector(ctx, info->dst, info->num_components);
+ return;
+ }
- case V_028714_SPI_SHADER_32_AR:
- if (ctx->options->chip_class >= GFX10) {
- /* Special case: on GFX10, the outputs are different for 32_AR */
- enabled_channels = 0x3;
- values[1] = values[3];
- values[3] = Operand(v1);
- } else {
- enabled_channels = 0x9;
+ /* create array of components */
+ unsigned components_split = 0;
+ std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
+ bool has_vgprs = false;
+ for (unsigned i = 0; i < num_vals;) {
+ Temp tmp[num_vals];
+ unsigned num_tmps = 0;
+ unsigned tmp_size = 0;
+ RegType reg_type = RegType::sgpr;
+ while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
+ if (vals[i].type() == RegType::vgpr)
+ reg_type = RegType::vgpr;
+ tmp_size += vals[i].bytes();
+ tmp[num_tmps++] = vals[i++];
+ }
+ if (num_tmps > 1) {
+ aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+ aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
+ for (unsigned j = 0; j < num_tmps; j++)
+ vec->operands[j] = Operand(tmp[j]);
+ tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
+ vec->definitions[0] = Definition(tmp[0]);
+ bld.insert(std::move(vec));
}
- break;
-
- case V_028714_SPI_SHADER_FP16_ABGR:
- enabled_channels = 0x5;
- compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
- break;
- case V_028714_SPI_SHADER_UNORM16_ABGR:
- enabled_channels = 0x5;
- compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
- break;
+ if (tmp[0].bytes() % component_size) {
+ /* trim tmp[0] */
+ assert(i == num_vals);
+ RegClass new_rc = RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
+ tmp[0] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand(0u));
+ }
- case V_028714_SPI_SHADER_SNORM16_ABGR:
- enabled_channels = 0x5;
- compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
- break;
+ RegClass elem_rc = RegClass::get(reg_type, component_size);
- case V_028714_SPI_SHADER_UINT16_ABGR: {
- enabled_channels = 0x5;
- compr_op = aco_opcode::v_cvt_pk_u16_u32;
- if (is_int8 || is_int10) {
- /* clamp */
- uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
- Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
+ unsigned start = components_split;
- for (unsigned i = 0; i < 4; i++) {
- if ((write_mask >> i) & 1) {
- values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
- i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val),
- values[i]);
- }
+ if (tmp_size == elem_rc.bytes()) {
+ allocated_vec[components_split++] = tmp[0];
+ } else {
+ assert(tmp_size % elem_rc.bytes() == 0);
+ aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
+ aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
+ for (unsigned i = 0; i < split->definitions.size(); i++) {
+ Temp component = bld.tmp(elem_rc);
+ allocated_vec[components_split++] = component;
+ split->definitions[i] = Definition(component);
}
+ split->operands[0] = Operand(tmp[0]);
+ bld.insert(std::move(split));
}
- break;
- }
-
- case V_028714_SPI_SHADER_SINT16_ABGR:
- enabled_channels = 0x5;
- compr_op = aco_opcode::v_cvt_pk_i16_i32;
- if (is_int8 || is_int10) {
- /* clamp */
- uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
- uint32_t min_rgb = is_int8 ? -128 :is_int10 ? -512 : 0;
- Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
- Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb));
- for (unsigned i = 0; i < 4; i++) {
- if ((write_mask >> i) & 1) {
- values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
- i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val),
- values[i]);
- values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
- i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val),
- values[i]);
- }
- }
+ /* try to p_as_uniform early so we can create more optimizable code and
+ * also update allocated_vec */
+ for (unsigned j = start; j < components_split; j++) {
+ if (allocated_vec[j].bytes() % 4 == 0 && info->dst.type() == RegType::sgpr)
+ allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
+ has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
}
- break;
-
- case V_028714_SPI_SHADER_32_ABGR:
- enabled_channels = 0xF;
- break;
-
- default:
- break;
}
- if (target == V_008DFC_SQ_EXP_NULL)
- return;
+ /* concatenate components and p_as_uniform() result if needed */
+ if (info->dst.type() == RegType::vgpr || !has_vgprs)
+ ctx->allocated_vec.emplace(info->dst.id(), allocated_vec);
- if ((bool) compr_op) {
- for (int i = 0; i < 2; i++) {
- /* check if at least one of the values to be compressed is enabled */
- unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
- if (enabled) {
- enabled_channels |= enabled << (i*2);
- values[i] = bld.vop3(compr_op, bld.def(v1),
- values[i*2].isUndefined() ? Operand(0u) : values[i*2],
- values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]);
- } else {
- values[i] = Operand(v1);
- }
- }
- values[2] = Operand(v1);
- values[3] = Operand(v1);
+ int padding_bytes = MAX2((int)info->dst.bytes() - int(allocated_vec[0].bytes() * info->num_components), 0);
+
+ aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+ aco_opcode::p_create_vector, Format::PSEUDO, info->num_components + !!padding_bytes, 1)};
+ for (unsigned i = 0; i < info->num_components; i++)
+ vec->operands[i] = Operand(allocated_vec[i]);
+ if (padding_bytes)
+ vec->operands[info->num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
+ if (info->dst.type() == RegType::sgpr && has_vgprs) {
+ Temp tmp = bld.tmp(RegType::vgpr, info->dst.size());
+ vec->definitions[0] = Definition(tmp);
+ bld.insert(std::move(vec));
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(info->dst), tmp);
} else {
- for (int i = 0; i < 4; i++)
- values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
+ vec->definitions[0] = Definition(info->dst);
+ bld.insert(std::move(vec));
}
-
- bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
- enabled_channels, target, (bool) compr_op);
}
-Operand load_lds_size_m0(isel_context *ctx)
+Operand load_lds_size_m0(Builder& bld)
{
/* TODO: m0 does not need to be initialized on GFX9+ */
- Builder bld(ctx->program, ctx->block);
return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
}
-void load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
- Temp address, unsigned base_offset, unsigned align)
+Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info,
+ Temp offset, unsigned bytes_needed,
+ unsigned align, unsigned const_offset,
+ Temp dst_hint)
{
- assert(util_is_power_of_two_nonzero(align) && align >= 4);
+ offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
- Builder bld(ctx->program, ctx->block);
+ Operand m = load_lds_size_m0(bld);
- Operand m = load_lds_size_m0(ctx);
+ bool large_ds_read = bld.program->chip_class >= GFX7;
+ bool usable_read2 = bld.program->chip_class >= GFX7;
- unsigned num_components = dst.size() * 4u / elem_size_bytes;
- unsigned bytes_read = 0;
- unsigned result_size = 0;
- unsigned total_bytes = num_components * elem_size_bytes;
- std::array<Temp, NIR_MAX_VEC_COMPONENTS> result;
-
- while (bytes_read < total_bytes) {
- unsigned todo = total_bytes - bytes_read;
- bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
- bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
-
- aco_opcode op = aco_opcode::last_opcode;
- bool read2 = false;
- if (todo >= 16 && aligned16) {
- op = aco_opcode::ds_read_b128;
- todo = 16;
- } else if (todo >= 16 && aligned8) {
- op = aco_opcode::ds_read2_b64;
- read2 = true;
- todo = 16;
- } else if (todo >= 12 && aligned16) {
- op = aco_opcode::ds_read_b96;
- todo = 12;
- } else if (todo >= 8 && aligned8) {
- op = aco_opcode::ds_read_b64;
- todo = 8;
- } else if (todo >= 8) {
- op = aco_opcode::ds_read2_b32;
- read2 = true;
- todo = 8;
- } else if (todo >= 4) {
- op = aco_opcode::ds_read_b32;
- todo = 4;
- } else {
- assert(false);
- }
- assert(todo % elem_size_bytes == 0);
- unsigned num_elements = todo / elem_size_bytes;
- unsigned offset = base_offset + bytes_read;
- unsigned max_offset = read2 ? 1019 : 65535;
+ bool read2 = false;
+ unsigned size = 0;
+ aco_opcode op;
+ //TODO: use ds_read_u8_d16_hi/ds_read_u16_d16_hi if beneficial
+ if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
+ size = 16;
+ op = aco_opcode::ds_read_b128;
+ } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
+ size = 16;
+ read2 = true;
+ op = aco_opcode::ds_read2_b64;
+ } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
+ size = 12;
+ op = aco_opcode::ds_read_b96;
+ } else if (bytes_needed >= 8 && align % 8 == 0) {
+ size = 8;
+ op = aco_opcode::ds_read_b64;
+ } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0) {
+ size = 8;
+ read2 = true;
+ op = aco_opcode::ds_read2_b32;
+ } else if (bytes_needed >= 4 && align % 4 == 0) {
+ size = 4;
+ op = aco_opcode::ds_read_b32;
+ } else if (bytes_needed >= 2 && align % 2 == 0) {
+ size = 2;
+ op = aco_opcode::ds_read_u16;
+ } else {
+ size = 1;
+ op = aco_opcode::ds_read_u8;
+ }
- Temp address_offset = address;
- if (offset > max_offset) {
- address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
- offset = bytes_read;
- }
- assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
+ unsigned max_offset_plus_one = read2 ? 254 * (size / 2u) + 1 : 65536;
+ if (const_offset >= max_offset_plus_one) {
+ offset = bld.vadd32(bld.def(v1), offset, Operand(const_offset / max_offset_plus_one * max_offset_plus_one));
+ const_offset %= max_offset_plus_one;
+ }
- Temp res;
- if (num_components == 1 && dst.type() == RegType::vgpr)
- res = dst;
- else
- res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
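+ /* the two ds_read2 offsets are encoded in units of the individual element size */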
+ if (read2)
+ const_offset /= (size / 2u);
- if (read2)
- res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1);
- else
- res = bld.ds(op, Definition(res), address_offset, m, offset);
+ RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
+ Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
+ if (read2)
+ bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
+ else
+ bld.ds(op, Definition(val), offset, m, const_offset);
- if (num_components == 1) {
- assert(todo == total_bytes);
- if (dst.type() == RegType::sgpr)
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
- return;
- }
+ if (size < 4)
+ val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
- if (dst.type() == RegType::sgpr)
- res = bld.as_uniform(res);
+ return val;
+}
- if (num_elements == 1) {
- result[result_size++] = res;
- } else {
- assert(res != dst && res.size() % num_elements == 0);
- aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
- split->operands[0] = Operand(res);
- for (unsigned i = 0; i < num_elements; i++)
- split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
- ctx->block->instructions.emplace_back(std::move(split));
+static auto emit_lds_load = emit_load<lds_load_callback, false, true, UINT32_MAX>;
+
+Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info,
+ Temp offset, unsigned bytes_needed,
+ unsigned align, unsigned const_offset,
+ Temp dst_hint)
+{
+ unsigned size = 0;
+ aco_opcode op;
+ if (bytes_needed <= 4) {
+ size = 1;
+ op = info->resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
+ } else if (bytes_needed <= 8) {
+ size = 2;
+ op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
+ } else if (bytes_needed <= 16) {
+ size = 4;
+ op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
+ } else if (bytes_needed <= 32) {
+ size = 8;
+ op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
+ } else {
+ size = 16;
+ op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
+ }
+ aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
+ if (info->resource.id()) {
+ load->operands[0] = Operand(info->resource);
+ load->operands[1] = Operand(offset);
+ } else {
+ load->operands[0] = Operand(offset);
+ load->operands[1] = Operand(0u);
+ }
+ RegClass rc(RegType::sgpr, size);
+ Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
+ load->definitions[0] = Definition(val);
+ load->glc = info->glc;
+ load->dlc = info->glc && bld.program->chip_class >= GFX10;
+ load->barrier = info->barrier;
+ load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
+ bld.insert(std::move(load));
+ return val;
+}
+
+static auto emit_smem_load = emit_load<smem_load_callback, true, false, 1024>;
+
+Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
+ Temp offset, unsigned bytes_needed,
+ unsigned align_, unsigned const_offset,
+ Temp dst_hint)
+{
+ Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+ Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
+
+ if (info->soffset.id()) {
+ if (soffset.isTemp())
+ vaddr = bld.copy(bld.def(v1), soffset);
+ soffset = Operand(info->soffset);
+ }
+
+ unsigned bytes_size = 0;
+ aco_opcode op;
+ if (bytes_needed == 1) {
+ bytes_size = 1;
+ op = aco_opcode::buffer_load_ubyte;
+ } else if (bytes_needed == 2) {
+ bytes_size = 2;
+ op = aco_opcode::buffer_load_ushort;
+ } else if (bytes_needed <= 4) {
+ bytes_size = 4;
+ op = aco_opcode::buffer_load_dword;
+ } else if (bytes_needed <= 8) {
+ bytes_size = 8;
+ op = aco_opcode::buffer_load_dwordx2;
+ } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
+ bytes_size = 12;
+ op = aco_opcode::buffer_load_dwordx3;
+ } else {
+ bytes_size = 16;
+ op = aco_opcode::buffer_load_dwordx4;
+ }
+ aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+ mubuf->operands[0] = Operand(info->resource);
+ mubuf->operands[1] = vaddr;
+ mubuf->operands[2] = soffset;
+ mubuf->offen = (offset.type() == RegType::vgpr);
+ mubuf->glc = info->glc;
+ mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
+ mubuf->barrier = info->barrier;
+ mubuf->can_reorder = info->can_reorder;
+ mubuf->offset = const_offset;
+ RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+ Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
+ mubuf->definitions[0] = Definition(val);
+ bld.insert(std::move(mubuf));
+
+ if (bytes_size < 4)
+ val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u));
+
+ return val;
+}
+
+static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
+
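+/* GFX6 lacks FLAT instructions, so global memory is accessed via MUBUF with a
+ * resource that spans the entire address range. */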
+Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
+{
+ uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+ if (addr.type() == RegType::vgpr)
+ return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf));
+ return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf));
+}
+
+Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
+ Temp offset, unsigned bytes_needed,
+ unsigned align_, unsigned const_offset,
+ Temp dst_hint)
+{
+ unsigned bytes_size = 0;
+ bool mubuf = bld.program->chip_class == GFX6;
+ bool global = bld.program->chip_class >= GFX9;
+ aco_opcode op;
+ if (bytes_needed == 1) {
+ bytes_size = 1;
+ op = mubuf ? aco_opcode::buffer_load_ubyte : global ? aco_opcode::global_load_ubyte : aco_opcode::flat_load_ubyte;
+ } else if (bytes_needed == 2) {
+ bytes_size = 2;
+ op = mubuf ? aco_opcode::buffer_load_ushort : global ? aco_opcode::global_load_ushort : aco_opcode::flat_load_ushort;
+ } else if (bytes_needed <= 4) {
+ bytes_size = 4;
+ op = mubuf ? aco_opcode::buffer_load_dword : global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
+ } else if (bytes_needed <= 8) {
+ bytes_size = 8;
+ op = mubuf ? aco_opcode::buffer_load_dwordx2 : global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
+ } else if (bytes_needed <= 12 && !mubuf) {
+ bytes_size = 12;
+ op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
+ } else {
+ bytes_size = 16;
+ op = mubuf ? aco_opcode::buffer_load_dwordx4 : global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
+ }
+ RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+ Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
+ if (mubuf) {
+ aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+ mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
+ mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+ mubuf->operands[2] = Operand(0u);
+ mubuf->glc = info->glc;
+ mubuf->dlc = false;
+ mubuf->offset = 0;
+ mubuf->addr64 = offset.type() == RegType::vgpr;
+ mubuf->disable_wqm = false;
+ mubuf->barrier = info->barrier;
+ mubuf->definitions[0] = Definition(val);
+ bld.insert(std::move(mubuf));
+ } else {
+ offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
+
+ aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
+ flat->operands[0] = Operand(offset);
+ flat->operands[1] = Operand(s1);
+ flat->glc = info->glc;
+ flat->dlc = info->glc && bld.program->chip_class >= GFX10;
+ flat->barrier = info->barrier;
+ flat->offset = 0u;
+ flat->definitions[0] = Definition(val);
+ bld.insert(std::move(flat));
+ }
+
+ if (bytes_size < 4)
+ val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u));
+
+ return val;
+}
+
+static auto emit_global_load = emit_load<global_load_callback, true, true, 1>;
+
+Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
+ Temp address, unsigned base_offset, unsigned align)
+{
+ assert(util_is_power_of_two_nonzero(align));
+
+ Builder bld(ctx->program, ctx->block);
+
+ unsigned num_components = dst.bytes() / elem_size_bytes;
+ LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
+ info.align_mul = align;
+ info.align_offset = 0;
+ info.barrier = barrier_shared;
+ info.can_reorder = false;
+ info.const_offset = base_offset;
+ emit_lds_load(ctx, bld, &info);
+
+ return dst;
+}
+
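+/* Splits the store data into count pieces at the given byte offsets, reusing an
+ * existing vector decomposition from ctx->allocated_vec when possible. */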
+void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src)
+{
+ if (!count)
+ return;
+
+ Builder bld(ctx->program, ctx->block);
+
+ ASSERTED bool is_subdword = false;
+ for (unsigned i = 0; i < count; i++)
+ is_subdword |= offsets[i] % 4;
+ is_subdword |= (src.bytes() - offsets[count - 1]) % 4;
+ assert(!is_subdword || dst_type == RegType::vgpr);
+
+ /* count == 1 fast path */
+ if (count == 1) {
+ if (dst_type == RegType::sgpr)
+ dst[0] = bld.as_uniform(src);
+ else
+ dst[0] = as_vgpr(ctx, src);
+ return;
+ }
+
+ for (unsigned i = 0; i < count - 1; i++)
+ dst[i] = bld.tmp(RegClass::get(dst_type, offsets[i + 1] - offsets[i]));
+ dst[count - 1] = bld.tmp(RegClass::get(dst_type, src.bytes() - offsets[count - 1]));
+
+ if (is_subdword && src.type() == RegType::sgpr) {
+ src = as_vgpr(ctx, src);
+ } else {
+ /* use allocated_vec if possible */
+ auto it = ctx->allocated_vec.find(src.id());
+ if (it != ctx->allocated_vec.end()) {
+ unsigned total_size = 0;
+ for (unsigned i = 0; (i < NIR_MAX_VEC_COMPONENTS) && it->second[i].bytes(); i++)
+ total_size += it->second[i].bytes();
+ if (total_size != src.bytes())
+ goto split;
+
+ unsigned elem_size = it->second[0].bytes();
+
+ for (unsigned i = 0; i < count; i++) {
+ if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
+ goto split;
+ }
+
+ for (unsigned i = 0; i < count; i++) {
+ unsigned start_idx = offsets[i] / elem_size;
+ unsigned op_count = dst[i].bytes() / elem_size;
+ if (op_count == 1) {
+ if (dst_type == RegType::sgpr)
+ dst[i] = bld.as_uniform(it->second[start_idx]);
+ else
+ dst[i] = as_vgpr(ctx, it->second[start_idx]);
+ continue;
+ }
+
+ aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
+ for (unsigned j = 0; j < op_count; j++) {
+ Temp tmp = it->second[start_idx + j];
+ if (dst_type == RegType::sgpr)
+ tmp = bld.as_uniform(tmp);
+ vec->operands[j] = Operand(tmp);
+ }
+ vec->definitions[0] = Definition(dst[i]);
+ bld.insert(std::move(vec));
+ }
+ return;
+ }
+ }
+
+ if (dst_type == RegType::sgpr)
+ src = bld.as_uniform(src);
+
+ split:
+ /* just split it */
+ aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
+ split->operands[0] = Operand(src);
+ for (unsigned i = 0; i < count; i++)
+ split->definitions[i] = Definition(dst[i]);
+ bld.insert(std::move(split));
+}
+
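+/* Finds the next consecutive run of bytes in todo_mask that are either all
+ * written or all skipped according to mask; returns false for a skipped run. */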
+bool scan_write_mask(uint32_t mask, uint32_t todo_mask,
+ int *start, int *count)
+{
+ unsigned start_elem = ffs(todo_mask) - 1;
+ bool skip = !(mask & (1 << start_elem));
+ if (skip)
+ mask = ~mask & todo_mask;
+
+ mask &= todo_mask;
+
+ u_bit_scan_consecutive_range(&mask, start, count);
+
+ return !skip;
+}
+
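+/* clears bits [0, start + count); bits below start are already processed since
+ * scanning always starts at the lowest remaining todo bit */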
+void advance_write_mask(uint32_t *todo_mask, int start, int count)
+{
+ *todo_mask &= ~u_bit_consecutive(0, count) << start;
+}
+
+void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
+ Temp address, unsigned base_offset, unsigned align)
+{
+ assert(util_is_power_of_two_nonzero(align));
+ assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
+
+ Builder bld(ctx->program, ctx->block);
+ bool large_ds_write = ctx->options->chip_class >= GFX7;
+ bool usable_write2 = ctx->options->chip_class >= GFX7;
+
+ unsigned write_count = 0;
+ Temp write_datas[32];
+ unsigned offsets[32];
+ aco_opcode opcodes[32];
+
+ wrmask = widen_mask(wrmask, elem_size_bytes);
+
+ uint32_t todo = u_bit_consecutive(0, data.bytes());
+ while (todo) {
+ int offset, bytes;
+ if (!scan_write_mask(wrmask, todo, &offset, &bytes)) {
+ offsets[write_count] = offset;
+ opcodes[write_count] = aco_opcode::num_opcodes;
+ write_count++;
+ advance_write_mask(&todo, offset, bytes);
+ continue;
+ }
+
+ bool aligned2 = offset % 2 == 0 && align % 2 == 0;
+ bool aligned4 = offset % 4 == 0 && align % 4 == 0;
+ bool aligned8 = offset % 8 == 0 && align % 8 == 0;
+ bool aligned16 = offset % 16 == 0 && align % 16 == 0;
+
+ //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
+ aco_opcode op = aco_opcode::num_opcodes;
+ if (bytes >= 16 && aligned16 && large_ds_write) {
+ op = aco_opcode::ds_write_b128;
+ bytes = 16;
+ } else if (bytes >= 12 && aligned16 && large_ds_write) {
+ op = aco_opcode::ds_write_b96;
+ bytes = 12;
+ } else if (bytes >= 8 && aligned8) {
+ op = aco_opcode::ds_write_b64;
+ bytes = 8;
+ } else if (bytes >= 4 && aligned4) {
+ op = aco_opcode::ds_write_b32;
+ bytes = 4;
+ } else if (bytes >= 2 && aligned2) {
+ op = aco_opcode::ds_write_b16;
+ bytes = 2;
+ } else if (bytes >= 1) {
+ op = aco_opcode::ds_write_b8;
+ bytes = 1;
+ } else {
+ assert(false);
+ }
+
+ offsets[write_count] = offset;
+ opcodes[write_count] = op;
+ write_count++;
+ advance_write_mask(&todo, offset, bytes);
+ }
+
+ Operand m = load_lds_size_m0(bld);
+
+ split_store_data(ctx, RegType::vgpr, write_count, write_datas, offsets, data);
+
+ for (unsigned i = 0; i < write_count; i++) {
+ aco_opcode op = opcodes[i];
+ if (op == aco_opcode::num_opcodes)
+ continue;
+
+ Temp data = write_datas[i];
+
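+ /* try to merge this write with a later one of the same size into ds_write2 */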
+ unsigned second = write_count;
+ if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
+ for (second = i + 1; second < write_count; second++) {
+ if (opcodes[second] == op && (offsets[second] - offsets[i]) % data.bytes() == 0) {
+ op = data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
+ opcodes[second] = aco_opcode::num_opcodes;
+ break;
+ }
+ }
+ }
+
+ bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
+ unsigned write2_off = (offsets[second] - offsets[i]) / data.bytes();
+
+ unsigned inline_offset = base_offset + offsets[i];
+ unsigned max_offset = write2 ? (255 - write2_off) * data.bytes() : 65535;
+ Temp address_offset = address;
+ if (inline_offset > max_offset) {
+ address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
+ inline_offset = offsets[i];
+ }
+ assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
+
+ if (write2) {
+ Temp second_data = write_datas[second];
+ inline_offset /= data.bytes();
+ bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
+ } else {
+ bld.ds(op, address_offset, data, m, inline_offset);
+ }
+ }
+}
+
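+/* the usable alignment is limited by the lowest set bit of the constant offset */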
+unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
+{
+ unsigned align = 16;
+ if (const_offset)
+ align = std::min(align, 1u << (ffs(const_offset) - 1));
+
+ return align;
+}
+
+
+aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
+{
+ switch (bytes) {
+ case 1:
+ assert(!smem);
+ return aco_opcode::buffer_store_byte;
+ case 2:
+ assert(!smem);
+ return aco_opcode::buffer_store_short;
+ case 4:
+ return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
+ case 8:
+ return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
+ case 12:
+ assert(!smem);
+ return aco_opcode::buffer_store_dwordx3;
+ case 16:
+ return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
+ }
+ unreachable("Unexpected store size");
+ return aco_opcode::num_opcodes;
+}
+
+void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type,
+ Temp data, unsigned writemask, int swizzle_element_size,
+ unsigned *write_count, Temp *write_datas, unsigned *offsets)
+{
+ unsigned write_count_with_skips = 0;
+ bool skips[16];
+
+ /* determine how to split the data */
+ unsigned todo = u_bit_consecutive(0, data.bytes());
+ while (todo) {
+ int offset, bytes;
+ skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &bytes);
+ offsets[write_count_with_skips] = offset;
+ if (skips[write_count_with_skips]) {
+ advance_write_mask(&todo, offset, bytes);
+ write_count_with_skips++;
+ continue;
+ }
+
+ /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
+ * larger than swizzle_element_size */
+ bytes = MIN2(bytes, swizzle_element_size);
+ if (bytes % 4)
+ bytes = bytes > 4 ? bytes & ~0x3 : MIN2(bytes, 2);
+
+ /* SMEM and GFX6 VMEM can't emit 12-byte stores */
+ if ((ctx->program->chip_class == GFX6 || smem) && bytes == 12)
+ bytes = 8;
+
+ /* dword or larger stores have to be dword-aligned */
+ unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
+ unsigned align_offset = instr ? nir_intrinsic_align_offset(instr) : 0;
+ bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0;
+ if (bytes >= 4 && !dword_aligned)
+ bytes = MIN2(bytes, 2);
+
+ advance_write_mask(&todo, offset, bytes);
+ write_count_with_skips++;
+ }
+
+ /* actually split data */
+ split_store_data(ctx, dst_type, write_count_with_skips, write_datas, offsets, data);
+
+ /* remove skips */
+ for (unsigned i = 0; i < write_count_with_skips; i++) {
+ if (skips[i])
+ continue;
+ write_datas[*write_count] = write_datas[i];
+ offsets[*write_count] = offsets[i];
+ (*write_count)++;
+ }
+}
+
+Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
+ unsigned split_cnt = 0u, Temp dst = Temp())
+{
+ Builder bld(ctx->program, ctx->block);
+ unsigned dword_size = elem_size_bytes / 4;
+
+ if (!dst.id())
+ dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
+
+ std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
+ aco_ptr<Pseudo_instruction> instr {create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
+ instr->definitions[0] = Definition(dst);
+
+ for (unsigned i = 0; i < cnt; ++i) {
+ if (arr[i].id()) {
+ assert(arr[i].size() == dword_size);
+ allocated_vec[i] = arr[i];
+ instr->operands[i] = Operand(arr[i]);
+ } else {
+ Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2));
+ allocated_vec[i] = zero;
+ instr->operands[i] = Operand(zero);
+ }
+ }
+
+ bld.insert(std::move(instr));
+
+ if (split_cnt)
+ emit_split_vector(ctx, dst, split_cnt);
+ else
+ ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
+
+ return dst;
+}
+
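+/* MUBUF instructions only encode a 12-bit immediate offset (0..4095), so any
+ * excess is folded into voffset. */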
+inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset)
+{
+ if (const_offset >= 4096) {
+ unsigned excess_const_offset = const_offset / 4096u * 4096u;
+ const_offset %= 4096u;
+
+ if (!voffset.id())
+ voffset = bld.copy(bld.def(v1), Operand(excess_const_offset));
+ else if (unlikely(voffset.regClass() == s1))
+ voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess_const_offset), Operand(voffset));
+ else if (likely(voffset.regClass() == v1))
+ voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess_const_offset));
+ else
+ unreachable("Unsupported register class of voffset");
+ }
+
+ return const_offset;
+}
+
+void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
+ unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
+{
+ assert(vdata.id());
+ assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
+ assert(vdata.size() >= 1 && vdata.size() <= 4);
+
+ Builder bld(ctx->program, ctx->block);
+ aco_opcode op = get_buffer_store_op(false, vdata.bytes());
+ const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
+
+ Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
+ Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
+ Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
+ /* offen */ !voffset_op.isUndefined(), /* idxen */ false, /* addr64 */ false,
+ /* disable_wqm */ false, /* glc */ true, /* dlc */ false, /* slc */ slc);
+
+ static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
+}
+
+void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
+ unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
+ bool allow_combining = true, bool reorder = true, bool slc = false)
+{
+ Builder bld(ctx->program, ctx->block);
+ assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+ assert(write_mask);
+ write_mask = widen_mask(write_mask, elem_size_bytes);
+
+ unsigned write_count = 0;
+ Temp write_datas[32];
+ unsigned offsets[32];
+ split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
+ allow_combining ? 16 : 4, &write_count, write_datas, offsets);
+
+ for (unsigned i = 0; i < write_count; i++) {
+ unsigned const_offset = offsets[i] + base_const_offset;
+ emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
+ }
+}
+
+void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
+ unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
+ unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
+{
+ assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+ assert((num_components * elem_size_bytes / 4) == dst.size());
+ assert(!!stride != allow_combining);
+
+ Builder bld(ctx->program, ctx->block);
+
+ LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
+ info.component_stride = allow_combining ? 0 : stride;
+ info.glc = true;
+ info.swizzle_component_size = allow_combining ? 0 : 4;
+ info.align_mul = MIN2(elem_size_bytes, 4);
+ info.align_offset = 0;
+ info.soffset = soffset;
+ info.const_offset = base_const_offset;
+ emit_mubuf_load(ctx, bld, &info);
+}
+
+std::pair<Temp, unsigned> offset_add_from_nir(isel_context *ctx, const std::pair<Temp, unsigned> &base_offset, nir_src *off_src, unsigned stride = 1u)
+{
+ Builder bld(ctx->program, ctx->block);
+ Temp offset = base_offset.first;
+ unsigned const_offset = base_offset.second;
+
+ if (!nir_src_is_const(*off_src)) {
+ Temp indirect_offset_arg = get_ssa_temp(ctx, off_src->ssa);
+ Temp with_stride;
+
+ /* Calculate indirect offset with stride */
+ if (likely(indirect_offset_arg.regClass() == v1))
+ with_stride = bld.v_mul24_imm(bld.def(v1), indirect_offset_arg, stride);
+ else if (indirect_offset_arg.regClass() == s1)
+ with_stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), indirect_offset_arg);
+ else
+ unreachable("Unsupported register class of indirect offset");
+
+ /* Add to the supplied base offset */
+ if (offset.id() == 0)
+ offset = with_stride;
+ else if (unlikely(offset.regClass() == s1 && with_stride.regClass() == s1))
+ offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), with_stride, offset);
+ else if (offset.size() == 1 && with_stride.size() == 1)
+ offset = bld.vadd32(bld.def(v1), with_stride, offset);
+ else
+ unreachable("Unsupported register class of indirect offset");
+ } else {
+ unsigned const_offset_arg = nir_src_as_uint(*off_src);
+ const_offset += const_offset_arg * stride;
+ }
+
+ return std::make_pair(offset, const_offset);
+}
+
+std::pair<Temp, unsigned> offset_add(isel_context *ctx, const std::pair<Temp, unsigned> &off1, const std::pair<Temp, unsigned> &off2)
+{
+ Builder bld(ctx->program, ctx->block);
+ Temp offset;
+
+ if (off1.first.id() && off2.first.id()) {
+ if (unlikely(off1.first.regClass() == s1 && off2.first.regClass() == s1))
+ offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), off1.first, off2.first);
+ else if (off1.first.size() == 1 && off2.first.size() == 1)
+ offset = bld.vadd32(bld.def(v1), off1.first, off2.first);
+ else
+ unreachable("Unsupported register class of indirect offset");
+ } else {
+ offset = off1.first.id() ? off1.first : off2.first;
+ }
+
+ return std::make_pair(offset, off1.second + off2.second);
+}
+
+std::pair<Temp, unsigned> offset_mul(isel_context *ctx, const std::pair<Temp, unsigned> &offs, unsigned multiplier)
+{
+ Builder bld(ctx->program, ctx->block);
+ unsigned const_offset = offs.second * multiplier;
+
+ if (!offs.first.id())
+ return std::make_pair(offs.first, const_offset);
+
+ Temp offset = unlikely(offs.first.regClass() == s1)
+ ? bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(multiplier), offs.first)
+ : bld.v_mul24_imm(bld.def(v1), offs.first, multiplier);
+
+ return std::make_pair(offset, const_offset);
+}
+
+std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride, unsigned component_stride)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ /* base is the driver_location, which is already multiplied by 4, so is in dwords */
+ unsigned const_offset = nir_intrinsic_base(instr) * base_stride;
+ /* component is in bytes */
+ const_offset += nir_intrinsic_component(instr) * component_stride;
+
+ /* offset should be interpreted in relation to the base, so the instruction effectively reads/writes another input/output when it has an offset */
+ nir_src *off_src = nir_get_io_offset_src(instr);
+ return offset_add_from_nir(ctx, std::make_pair(Temp(), const_offset), off_src, 4u * base_stride);
+}
+
+std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned stride = 1u)
+{
+ return get_intrinsic_io_basic_offset(ctx, instr, stride, stride);
+}
+
+Temp get_tess_rel_patch_id(isel_context *ctx)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ switch (ctx->shader->info.stage) {
+ case MESA_SHADER_TESS_CTRL:
+ return bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffu),
+ get_arg(ctx, ctx->args->ac.tcs_rel_ids));
+ case MESA_SHADER_TESS_EVAL:
+ return get_arg(ctx, ctx->args->tes_rel_patch_id);
+ default:
+ unreachable("Unsupported stage in get_tess_rel_patch_id");
+ }
+}
+
+std::pair<Temp, unsigned> get_tcs_per_vertex_input_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+ Builder bld(ctx->program, ctx->block);
+
+ uint32_t tcs_in_patch_stride = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 4;
+ uint32_t tcs_in_vertex_stride = ctx->tcs_num_inputs * 4;
+
+ std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr);
+
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ offs = offset_add_from_nir(ctx, offs, vertex_index_src, tcs_in_vertex_stride);
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp tcs_in_current_patch_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, tcs_in_patch_stride);
+ offs = offset_add(ctx, offs, std::make_pair(tcs_in_current_patch_offset, 0));
+
+ return offset_mul(ctx, offs, 4u);
+}
+
+std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, bool per_vertex = false)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+ Builder bld(ctx->program, ctx->block);
+
+ uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
+ uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
+ uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
+ uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
+
+ std::pair<Temp, unsigned> offs = instr
+ ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
+ : std::make_pair(Temp(), 0u);
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, output_patch_stride);
+
+ if (per_vertex) {
+ assert(instr);
+
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ offs = offset_add_from_nir(ctx, offs, vertex_index_src, output_vertex_size);
+
+ uint32_t output_patch0_offset = (input_patch_size * ctx->tcs_num_patches);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_offset));
+ } else {
+ uint32_t output_patch0_patch_data_offset = (input_patch_size * ctx->tcs_num_patches + pervertex_output_patch_size);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_patch_data_offset));
+ }
+
+ return offs;
+}
+
+std::pair<Temp, unsigned> get_tcs_per_vertex_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ unsigned vertices_per_patch = ctx->shader->info.tess.tcs_vertices_out;
+ unsigned attr_stride = vertices_per_patch * ctx->tcs_num_patches;
+
+ std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u);
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, vertices_per_patch * 16u);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, 0u));
+
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ offs = offset_add_from_nir(ctx, offs, vertex_index_src, 16u);
+
+ return offs;
+}
+
+std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, unsigned const_base_offset = 0u)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ unsigned output_vertex_size = ctx->tcs_num_outputs * 16;
+ unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
+ unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
+ unsigned attr_stride = ctx->tcs_num_patches;
+
+ std::pair<Temp, unsigned> offs = instr
+ ? get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u)
+ : std::make_pair(Temp(), 0u);
+
+ if (const_base_offset)
+ offs.second += const_base_offset * attr_stride;
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, 16u);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, per_patch_data_offset));
+
+ return offs;
+}
+
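+/* Checks whether the slot written by this intrinsic is set in the given
+ * API-level slot mask. When the offset is not a compile-time constant, the
+ * target slot is unknown; *indirect is set and false is returned.
+ */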
+bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect)
+{
+ assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+
+ if (mask == 0)
+ return false;
+
+ unsigned drv_loc = nir_intrinsic_base(instr);
+ nir_src *off_src = nir_get_io_offset_src(instr);
+
+ if (!nir_src_is_const(*off_src)) {
+ *indirect = true;
+ return false;
+ }
+
+ *indirect = false;
+ uint64_t slot = per_vertex
+ ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4]
+ : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0);
+ return (((uint64_t) 1) << slot) & mask;
+}
+
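+/* Tries to record the stored value in the ctx->outputs temporaries instead of
+ * emitting a store. Only possible when the offset is a compile-time constant.
+ */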
+bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ unsigned write_mask = nir_intrinsic_write_mask(instr);
+ unsigned component = nir_intrinsic_component(instr);
+ unsigned idx = nir_intrinsic_base(instr) + component;
+
+ nir_instr *off_instr = instr->src[1].ssa->parent_instr;
+ if (off_instr->type != nir_instr_type_load_const)
+ return false;
+
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+ idx += nir_src_as_uint(instr->src[1]) * 4u;
+
+ if (instr->src[0].ssa->bit_size == 64)
+ write_mask = widen_mask(write_mask, 2);
+
+ for (unsigned i = 0; i < 8; ++i) {
+ if (write_mask & (1 << i)) {
+ ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
+ ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1);
}
+ idx++;
+ }
+
+ return true;
+}
+
+bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst)
+{
+ /* Only TCS per-vertex inputs are supported by this function.
+ * Per-vertex inputs only line up between the VS and TCS invocation IDs
+ * when both stages have the same number of invocations.
+ */
+ if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
+ return false;
+
+ nir_src *off_src = nir_get_io_offset_src(instr);
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
+ bool can_use_temps = nir_src_is_const(*off_src) &&
+ vertex_index_instr->type == nir_instr_type_intrinsic &&
+ nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
+
+ if (!can_use_temps)
+ return false;
- bytes_read += todo;
- }
+ unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
+ Temp *src = &ctx->inputs.temps[idx];
+ create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
- assert(result_size == num_components && result_size > 1);
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
- for (unsigned i = 0; i < result_size; i++)
- vec->operands[i] = Operand(result[i]);
- vec->definitions[0] = Definition(dst);
- ctx->block->instructions.emplace_back(std::move(vec));
- ctx->allocated_vec.emplace(dst.id(), result);
+ return true;
}
-Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type)
+void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
{
- if (start == 0 && size == data.size())
- return type == RegType::vgpr ? as_vgpr(ctx, data) : data;
+ Builder bld(ctx->program, ctx->block);
- unsigned size_hint = 1;
- auto it = ctx->allocated_vec.find(data.id());
- if (it != ctx->allocated_vec.end())
- size_hint = it->second[0].size();
- if (size % size_hint || start % size_hint)
- size_hint = 1;
+ if (ctx->tcs_in_out_eq && store_output_to_temps(ctx, instr)) {
+ /* When the TCS reads this output only directly (without an indirect
+ * offset) and only for the vertex matching its invocation ID, storing
+ * the VS output to LDS is unnecessary. */
+ bool indirect_write;
+ bool temp_only_input = tcs_driver_location_matches_api_mask(ctx, instr, true, ctx->tcs_temp_only_inputs, &indirect_write);
+ if (temp_only_input && !indirect_write)
+ return;
+ }
- start /= size_hint;
- size /= size_hint;
+ std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, 4u);
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+ unsigned write_mask = nir_intrinsic_write_mask(instr);
+ unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
- Temp elems[size];
- for (unsigned i = 0; i < size; i++)
- elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint));
+ if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) {
+ /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
+ Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
+ Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
+ store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
+ } else {
+ Temp lds_base;
+
+ if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
+ /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
+ unsigned itemsize = ctx->stage == vertex_geometry_gs
+ ? ctx->program->info->vs.es_info.esgs_itemsize
+ : ctx->program->info->tes.es_info.esgs_itemsize;
+ Temp thread_id = emit_mbcnt(ctx, bld.def(v1));
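+ /* The s_bfe_u32 operand encodes (width << 16 | offset); this extracts the
+ * 4-bit wave index starting at bit 24 of merged_wave_info. */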
+ Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
+ Temp vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), thread_id,
+ bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
+ lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
+ } else if (ctx->stage == vertex_ls || ctx->stage == vertex_tess_control_hs) {
+ /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS.
+ * GFX9+: LS is merged into HS, but still uses the same LDS layout.
+ */
+ Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
+ lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u);
+ } else {
+ unreachable("Invalid LS or ES stage");
+ }
- if (size == 1)
- return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0];
+ offs = offset_add(ctx, offs, std::make_pair(lds_base, 0u));
+ unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
+ store_lds(ctx, elem_size_bytes, src, write_mask, offs.first, offs.second, lds_align);
+ }
+}
+
+bool tcs_output_is_tess_factor(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
+{
+ if (per_vertex)
+ return false;
+
+ unsigned off = nir_intrinsic_base(instr) * 4u;
+ return off == ctx->tcs_tess_lvl_out_loc ||
+ off == ctx->tcs_tess_lvl_in_loc;
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
- for (unsigned i = 0; i < size; i++)
- vec->operands[i] = Operand(elems[i]);
- Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)};
- vec->definitions[0] = Definition(res);
- ctx->block->instructions.emplace_back(std::move(vec));
- return res;
}
-void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align)
+bool tcs_output_is_read_by_tes(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
{
- Builder bld(ctx->program, ctx->block);
- unsigned bytes_written = 0;
- while (bytes_written < total_size * 4) {
- unsigned todo = total_size * 4 - bytes_written;
- bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
- bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
-
- aco_opcode op = aco_opcode::last_opcode;
- bool write2 = false;
- unsigned size = 0;
- if (todo >= 16 && aligned16) {
- op = aco_opcode::ds_write_b128;
- size = 4;
- } else if (todo >= 16 && aligned8) {
- op = aco_opcode::ds_write2_b64;
- write2 = true;
- size = 4;
- } else if (todo >= 12 && aligned16) {
- op = aco_opcode::ds_write_b96;
- size = 3;
- } else if (todo >= 8 && aligned8) {
- op = aco_opcode::ds_write_b64;
- size = 2;
- } else if (todo >= 8) {
- op = aco_opcode::ds_write2_b32;
- write2 = true;
- size = 2;
- } else if (todo >= 4) {
- op = aco_opcode::ds_write_b32;
- size = 1;
- } else {
- assert(false);
- }
+ uint64_t mask = per_vertex
+ ? ctx->program->info->tcs.tes_inputs_read
+ : ctx->program->info->tcs.tes_patch_inputs_read;
- unsigned offset = offset0 + offset1 + bytes_written;
- unsigned max_offset = write2 ? 1020 : 65535;
- Temp address_offset = address;
- if (offset > max_offset) {
- address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
- offset = offset1 + bytes_written;
- }
- assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
+ bool indirect_write = false;
+ bool output_read_by_tes = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
+ return indirect_write || output_read_by_tes;
+}
- if (write2) {
- Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr);
- Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr);
- bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1);
- } else {
- Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr);
- bld.ds(op, address_offset, val, m, offset);
- }
+bool tcs_output_is_read_by_tcs(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
+{
+ uint64_t mask = per_vertex
+ ? ctx->shader->info.outputs_read
+ : ctx->shader->info.patch_outputs_read;
- bytes_written += size * 4;
- }
+ bool indirect_write = false;
+ bool output_read = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
+ return indirect_write || output_read;
}
-void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
- Temp address, unsigned base_offset, unsigned align)
+void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
{
- assert(util_is_power_of_two_nonzero(align) && align >= 4);
+ assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
- Operand m = load_lds_size_m0(ctx);
+ Builder bld(ctx->program, ctx->block);
- /* we need at most two stores for 32bit variables */
- int start[2], count[2];
- u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]);
- u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]);
- assert(wrmask == 0);
+ Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
+ unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+ unsigned write_mask = nir_intrinsic_write_mask(instr);
- /* one combined store is sufficient */
- if (count[0] == count[1]) {
- Builder bld(ctx->program, ctx->block);
+ bool is_tess_factor = tcs_output_is_tess_factor(ctx, instr, per_vertex);
+ bool write_to_vmem = !is_tess_factor && tcs_output_is_read_by_tes(ctx, instr, per_vertex);
+ bool write_to_lds = is_tess_factor || tcs_output_is_read_by_tcs(ctx, instr, per_vertex);
- Temp address_offset = address;
- if ((base_offset >> 2) + start[1] > 255) {
- address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
- base_offset = 0;
- }
+ if (write_to_vmem) {
+ std::pair<Temp, unsigned> vmem_offs = per_vertex
+ ? get_tcs_per_vertex_output_vmem_offset(ctx, instr)
+ : get_tcs_per_patch_output_vmem_offset(ctx, instr);
- assert(count[0] == 1);
- Temp val0 = emit_extract_vector(ctx, data, start[0], v1);
- Temp val1 = emit_extract_vector(ctx, data, start[1], v1);
- aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
- base_offset = base_offset / elem_size_bytes;
- bld.ds(op, address_offset, val0, val1, m,
- base_offset + start[0], base_offset + start[1]);
- return;
+ Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
+ Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
+ store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false);
}
- for (unsigned i = 0; i < 2; i++) {
- if (count[i] == 0)
- continue;
-
- unsigned elem_size_words = elem_size_bytes / 4;
- ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words,
- base_offset, start[i] * elem_size_bytes, align);
+ if (write_to_lds) {
+ std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
+ unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
+ store_lds(ctx, elem_size_bytes, store_val, write_mask, lds_offs.first, lds_offs.second, lds_align);
}
- return;
+}
+
+void visit_load_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
+{
+ assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+
+ Builder bld(ctx->program, ctx->block);
+
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
+ unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
+ unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
+
+ load_lds(ctx, elem_size_bytes, dst, lds_offs.first, lds_offs.second, lds_align);
}
void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
{
- if (ctx->stage == vertex_vs) {
- visit_store_vs_output(ctx, instr);
- } else if (ctx->stage == fragment_fs) {
- visit_store_fs_output(ctx, instr);
+ if (ctx->stage == vertex_vs ||
+ ctx->stage == tess_eval_vs ||
+ ctx->stage == fragment_fs ||
+ ctx->stage == ngg_vertex_gs ||
+ ctx->stage == ngg_tess_eval_gs ||
+ ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
+ bool stored_to_temps = store_output_to_temps(ctx, instr);
+ if (!stored_to_temps) {
+ fprintf(stderr, "Unimplemented output offset instruction:\n");
+ nir_print_instr(instr->src[1].ssa->parent_instr, stderr);
+ fprintf(stderr, "\n");
+ abort();
+ }
+ } else if (ctx->stage == vertex_es ||
+ ctx->stage == vertex_ls ||
+ ctx->stage == tess_eval_es ||
+ (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
+ (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
+ (ctx->stage == tess_eval_geometry_gs && ctx->shader->info.stage == MESA_SHADER_TESS_EVAL)) {
+ visit_store_ls_or_es_output(ctx, instr);
+ } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
+ visit_store_tcs_output(ctx, instr, false);
} else {
unreachable("Shader stage not implemented");
}
}
+void visit_load_output(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ visit_load_tcs_output(ctx, instr, false);
+}
+
void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
{
Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
Builder bld(ctx->program, ctx->block);
- Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
- bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
+ Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
+ if (ctx->program->has_16bank_lds)
+ interp_p1.instr->operands[0].setLateKill(true);
+ bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, component);
}
void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
}
}
-unsigned get_num_channels_from_data_format(unsigned data_format)
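+/* Checks whether a vertex fetch of the given size is usable: 8-bit and 16-bit
+ * channels have no 3-component hardware format, and on GFX6 and GFX10 the
+ * attribute offset and stride must be aligned to the total fetch size. */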
+bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
+ unsigned offset, unsigned stride, unsigned channels)
+{
+ unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
+ if (vtx_info->chan_byte_size != 4 && channels == 3)
+ return false;
+ return (ctx->options->chip_class != GFX6 && ctx->options->chip_class != GFX10) ||
+ (offset % vertex_byte_size == 0 && stride % vertex_byte_size == 0);
+}
+
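+/* Picks a hardware data format that check_vertex_fetch_size accepts, adjusting
+ * the channel count if needed. For example, a 3-channel 16-bit fetch has no
+ * 16_16_16 format, so it is widened to 16_16_16_16 when that stays within the
+ * stride, otherwise narrowed (at the cost of extra fetches).
+ */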
+uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
+ unsigned offset, unsigned stride, unsigned *channels)
{
- switch (data_format) {
+ if (!vtx_info->chan_byte_size) {
+ *channels = vtx_info->num_channels;
+ return vtx_info->chan_format;
+ }
+
+ unsigned num_channels = *channels;
+ if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
+ unsigned new_channels = num_channels + 1;
+ /* first, assume that more loads are worse and try using a larger data format */
+ while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
+ new_channels++;
+ /* don't make the attribute potentially out-of-bounds */
+ if (offset + new_channels * vtx_info->chan_byte_size > stride)
+ new_channels = 5;
+ }
+
+ if (new_channels == 5) {
+ /* then try decreasing load size (at the cost of more loads) */
+ new_channels = *channels;
+ while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
+ new_channels--;
+ }
+
+ if (new_channels < *channels)
+ *channels = new_channels;
+ num_channels = new_channels;
+ }
+
+ switch (vtx_info->chan_format) {
case V_008F0C_BUF_DATA_FORMAT_8:
+ return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
+ V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
case V_008F0C_BUF_DATA_FORMAT_16:
+ return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
+ V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
case V_008F0C_BUF_DATA_FORMAT_32:
- return 1;
- case V_008F0C_BUF_DATA_FORMAT_8_8:
- case V_008F0C_BUF_DATA_FORMAT_16_16:
- case V_008F0C_BUF_DATA_FORMAT_32_32:
- return 2;
- case V_008F0C_BUF_DATA_FORMAT_10_11_11:
- case V_008F0C_BUF_DATA_FORMAT_11_11_10:
- case V_008F0C_BUF_DATA_FORMAT_32_32_32:
- return 3;
- case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
- case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
- case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
- case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
- case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
- return 4;
- default:
- break;
+ return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
+ V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
}
-
- return 4;
+ unreachable("shouldn't reach here");
+ return V_008F0C_BUF_DATA_FORMAT_INVALID;
}
/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
{
Builder bld(ctx->program, ctx->block);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- if (ctx->stage & sw_vs) {
+ if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
nir_instr *off_instr = instr->src[0].ssa->parent_instr;
if (off_instr->type != nir_instr_type_load_const) {
unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
unsigned dfmt = attrib_format & 0xf;
-
unsigned nfmt = (attrib_format >> 4) & 0x7;
- unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
+ const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt);
+
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
- unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
+ unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
if (post_shuffle)
num_channels = MAX2(num_channels, 3);
- Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u));
+ Operand off = bld.copy(bld.def(s1), Operand(attrib_binding * 16u));
+ Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
Temp index;
if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
if (divisor) {
- ctx->needs_instance_id = true;
Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
if (divisor != 1) {
Temp divided = bld.tmp(v1);
get_arg(ctx, ctx->args->ac.vertex_id));
}
- if (attrib_stride != 0 && attrib_offset > attrib_stride) {
- index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
- attrib_offset = attrib_offset % attrib_stride;
- }
+ Temp channels[num_channels];
+ unsigned channel_start = 0;
+ bool direct_fetch = false;
+
+ /* skip unused channels at the start */
+ if (vtx_info->chan_byte_size && !post_shuffle) {
+ channel_start = ffs(mask) - 1;
+ for (unsigned i = 0; i < channel_start; i++)
+ channels[i] = Temp(0, s1);
+ } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
+ num_channels = 3 - (ffs(mask) - 1);
+ }
+
+ /* load channels */
+ while (channel_start < num_channels) {
+ unsigned fetch_size = num_channels - channel_start;
+ unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
+ bool expanded = false;
+
+ /* prefer MUBUF when possible to avoid potential alignment issues */
+ /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
+ bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT ||
+ nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
+ nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
+ vtx_info->chan_byte_size == 4;
+ unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
+ if (!use_mubuf) {
+ fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size);
+ } else {
+ if (fetch_size == 3 && ctx->options->chip_class == GFX6) {
+ /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
+ fetch_size = 4;
+ expanded = true;
+ }
+ }
- Operand soffset(0u);
- if (attrib_offset >= 4096) {
- soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
- attrib_offset = 0;
- }
+ Temp fetch_index = index;
+ if (attrib_stride != 0 && fetch_offset > attrib_stride) {
+ fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
+ fetch_offset = fetch_offset % attrib_stride;
+ }
- aco_opcode opcode;
- switch (num_channels) {
- case 1:
- opcode = aco_opcode::tbuffer_load_format_x;
- break;
- case 2:
- opcode = aco_opcode::tbuffer_load_format_xy;
- break;
- case 3:
- opcode = aco_opcode::tbuffer_load_format_xyz;
- break;
- case 4:
- opcode = aco_opcode::tbuffer_load_format_xyzw;
- break;
- default:
- unreachable("Unimplemented load_input vector size");
- }
+ Operand soffset(0u);
+ if (fetch_offset >= 4096) {
+ soffset = bld.copy(bld.def(s1), Operand(fetch_offset / 4096 * 4096));
+ fetch_offset %= 4096;
+ }
- Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
+ aco_opcode opcode;
+ switch (fetch_size) {
+ case 1:
+ opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
+ break;
+ case 2:
+ opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
+ break;
+ case 3:
+ assert(ctx->options->chip_class >= GFX7 ||
+ (!use_mubuf && ctx->options->chip_class == GFX6));
+ opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
+ break;
+ case 4:
+ opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
+ break;
+ default:
+ unreachable("Unimplemented load_input vector size");
+ }
- aco_ptr<MTBUF_instruction> mubuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
- mubuf->operands[0] = Operand(index);
- mubuf->operands[1] = Operand(list);
- mubuf->operands[2] = soffset;
- mubuf->definitions[0] = Definition(tmp);
- mubuf->idxen = true;
- mubuf->can_reorder = true;
- mubuf->dfmt = dfmt;
- mubuf->nfmt = nfmt;
- assert(attrib_offset < 4096);
- mubuf->offset = attrib_offset;
- ctx->block->instructions.emplace_back(std::move(mubuf));
+ Temp fetch_dst;
+ if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle &&
+ !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE ||
+ num_channels <= 3)) {
+ direct_fetch = true;
+ fetch_dst = dst;
+ } else {
+ fetch_dst = bld.tmp(RegType::vgpr, fetch_size);
+ }
+
+ if (use_mubuf) {
+ Instruction *mubuf = bld.mubuf(opcode,
+ Definition(fetch_dst), list, fetch_index, soffset,
+ fetch_offset, false, true).instr;
+ static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
+ } else {
+ Instruction *mtbuf = bld.mtbuf(opcode,
+ Definition(fetch_dst), list, fetch_index, soffset,
+ fetch_dfmt, nfmt, fetch_offset, false, true).instr;
+ static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
+ }
+
+ emit_split_vector(ctx, fetch_dst, fetch_dst.size());
+
+ if (fetch_size == 1) {
+ channels[channel_start] = fetch_dst;
+ } else {
+ for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++)
+ channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1);
+ }
- emit_split_vector(ctx, tmp, tmp.size());
+ channel_start += fetch_size;
+ }
- if (tmp.id() != dst.id()) {
+ if (!direct_fetch) {
bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
+ unsigned num_temp = 0;
for (unsigned i = 0; i < dst.size(); i++) {
unsigned idx = i + component;
- if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
- Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
- vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
- } else if (idx < num_channels) {
- vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
+ if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
+ Temp channel = channels[swizzle[idx]];
+ if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE)
+ channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
+ vec->operands[i] = Operand(channel);
+
+ num_temp++;
+ elems[i] = channel;
} else if (is_float && idx == 3) {
vec->operands[i] = Operand(0x3f800000u);
} else if (!is_float && idx == 3) {
vec->definitions[0] = Definition(dst);
ctx->block->instructions.emplace_back(std::move(vec));
emit_split_vector(ctx, dst, dst.size());
- }
- } else if (ctx->stage == fragment_fs) {
- nir_instr *off_instr = instr->src[0].ssa->parent_instr;
+ if (num_temp == dst.size())
+ ctx->allocated_vec.emplace(dst.id(), elems);
+ }
+ } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
+ unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1;
+ nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr;
if (off_instr->type != nir_instr_type_load_const ||
nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
}
Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
- nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
+ nir_const_value* offset = nir_src_as_const_value(instr->src[offset_idx]);
if (offset) {
assert(offset->u32 == 0);
} else {
/* the lower 15 bits of the prim_mask contain the offset into LDS
* while the upper bits contain the number of prims */
- Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa);
+ Temp offset_src = get_ssa_temp(ctx, instr->src[offset_idx].ssa);
assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
Builder bld(ctx->program, ctx->block);
Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
unsigned idx = nir_intrinsic_base(instr);
unsigned component = nir_intrinsic_component(instr);
+ unsigned vertex_id = 2; /* P0 */
+
+ if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
+ nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
+ switch (src0->u32) {
+ case 0:
+ vertex_id = 2; /* P0 */
+ break;
+ case 1:
+ vertex_id = 0; /* P10 */
+ break;
+ case 2:
+ vertex_id = 1; /* P20 */
+ break;
+ default:
+ unreachable("invalid vertex index");
+ }
+ }
if (dst.size() == 1) {
- bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component);
+ bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), bld.m0(prim_mask), idx, component);
} else {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
for (unsigned i = 0; i < dst.size(); i++)
- vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i);
+ vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), bld.m0(prim_mask), idx, component + i);
vec->definitions[0] = Definition(dst);
bld.insert(std::move(vec));
}
+ } else if (ctx->shader->info.stage == MESA_SHADER_TESS_EVAL) {
+ Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
+ Temp soffset = get_arg(ctx, ctx->args->oc_lds);
+ std::pair<Temp, unsigned> offs = get_tcs_per_patch_output_vmem_offset(ctx, instr);
+ unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8u;
+
+ load_vmem_mubuf(ctx, dst, ring, offs.first, soffset, offs.second, elem_size_bytes, instr->dest.ssa.num_components);
} else {
unreachable("Shader stage not implemented");
}
}
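+/* Computes the address of a GS per-vertex input using the per-vertex offsets
+ * passed in from the previous stage. On the merged GS stages, two 16-bit
+ * vertex offsets are packed into each gs_vtx_offset register, hence the
+ * unpacking below.
+ */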
+std::pair<Temp, unsigned> get_gs_per_vertex_input_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride = 1u)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
+
+ Builder bld(ctx->program, ctx->block);
+ nir_src *vertex_src = nir_get_io_vertex_index_src(instr);
+ Temp vertex_offset;
+
+ if (!nir_src_is_const(*vertex_src)) {
+ /* better code could be created, but this case probably doesn't happen
+ * much in practice */
+ Temp indirect_vertex = as_vgpr(ctx, get_ssa_temp(ctx, vertex_src->ssa));
+ for (unsigned i = 0; i < ctx->shader->info.gs.vertices_in; i++) {
+ Temp elem;
+
+ if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
+ elem = get_arg(ctx, ctx->args->gs_vtx_offset[i / 2u * 2u]);
+ if (i % 2u)
+ elem = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), elem);
+ } else {
+ elem = get_arg(ctx, ctx->args->gs_vtx_offset[i]);
+ }
+
+ if (vertex_offset.id()) {
+ Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)),
+ Operand(i), indirect_vertex);
+ vertex_offset = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), vertex_offset, elem, cond);
+ } else {
+ vertex_offset = elem;
+ }
+ }
+
+ if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
+ vertex_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), vertex_offset);
+ } else {
+ unsigned vertex = nir_src_as_uint(*vertex_src);
+ if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
+ vertex_offset = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
+ get_arg(ctx, ctx->args->gs_vtx_offset[vertex / 2u * 2u]),
+ Operand((vertex % 2u) * 16u), Operand(16u));
+ else
+ vertex_offset = get_arg(ctx, ctx->args->gs_vtx_offset[vertex]);
+ }
+
+ std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, base_stride);
+ offs = offset_add(ctx, offs, std::make_pair(vertex_offset, 0u));
+ return offset_mul(ctx, offs, 4u);
+}
+
+void visit_load_gs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
+
+ Builder bld(ctx->program, ctx->block);
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
+
+ if (ctx->stage == geometry_gs) {
+ std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr, ctx->program->wave_size);
+ Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_GS * 16u));
+ load_vmem_mubuf(ctx, dst, ring, offs.first, Temp(), offs.second, elem_size_bytes, instr->dest.ssa.num_components, 4u * ctx->program->wave_size, false, true);
+ } else if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
+ std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr);
+ unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
+ load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
+ } else {
+ unreachable("Unsupported GS stage.");
+ }
+}
+
+void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+
+ Builder bld(ctx->program, ctx->block);
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+ if (load_input_from_temps(ctx, instr, dst))
+ return;
+
+ std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr);
+ unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
+ unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
+
+ load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
+}
+
+void visit_load_tes_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
+
+ Builder bld(ctx->program, ctx->block);
+
+ Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
+ Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+ unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
+ std::pair<Temp, unsigned> offs = get_tcs_per_vertex_output_vmem_offset(ctx, instr);
+
+ load_vmem_mubuf(ctx, dst, ring, offs.first, oc_lds, offs.second, elem_size_bytes, instr->dest.ssa.num_components, 0u, true, true);
+}
+
+void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ switch (ctx->shader->info.stage) {
+ case MESA_SHADER_GEOMETRY:
+ visit_load_gs_per_vertex_input(ctx, instr);
+ break;
+ case MESA_SHADER_TESS_CTRL:
+ visit_load_tcs_per_vertex_input(ctx, instr);
+ break;
+ case MESA_SHADER_TESS_EVAL:
+ visit_load_tes_per_vertex_input(ctx, instr);
+ break;
+ default:
+ unreachable("Unimplemented shader stage");
+ }
+}
+
+void visit_load_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ visit_load_tcs_output(ctx, instr, true);
+}
+
+void visit_store_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+
+ visit_store_tcs_output(ctx, instr, true);
+}
+
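+/* Loads the tessellation coordinate of the current TES invocation.
+ * For triangles, the third coordinate is derived as w = 1.0 - u - v.
+ */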
+void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
+
+ Builder bld(ctx->program, ctx->block);
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+ Operand tes_u(get_arg(ctx, ctx->args->tes_u));
+ Operand tes_v(get_arg(ctx, ctx->args->tes_v));
+ Operand tes_w(0u);
+
+ if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
+ Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
+ tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0x3f800000u /* 1.0f */), tmp);
+ tes_w = Operand(tmp);
+ }
+
+ Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
+ emit_split_vector(ctx, tess_coord, 3);
+}
+
Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
{
if (ctx->program->info->need_indirect_descriptor_sets) {
Builder bld(ctx->program, ctx->block);
Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
- return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false);
+ Operand off = bld.copy(bld.def(s1), Operand(desc_set << 2));
+ return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);//, false, false, false);
}
return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
if (nir_const_index && const_index == 0) {
index = desc_ptr;
- } else if (index.type() == RegType::vgpr) {
- index = bld.vadd32(bld.def(v1),
- nir_const_index ? Operand(const_index) : Operand(index),
- Operand(desc_ptr));
- } else {
- index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
- nir_const_index ? Operand(const_index) : Operand(index),
- Operand(desc_ptr));
- }
-
- bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
-}
-
-void load_buffer(isel_context *ctx, unsigned num_components, Temp dst,
- Temp rsrc, Temp offset, bool glc=false, bool readonly=true)
-{
- Builder bld(ctx->program, ctx->block);
-
- unsigned num_bytes = dst.size() * 4;
- bool dlc = glc && ctx->options->chip_class >= GFX10;
-
- aco_opcode op;
- if (dst.type() == RegType::vgpr || (ctx->options->chip_class < GFX8 && !readonly)) {
- Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
- Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
- unsigned const_offset = 0;
-
- Temp lower = Temp();
- if (num_bytes > 16) {
- assert(num_components == 3 || num_components == 4);
- op = aco_opcode::buffer_load_dwordx4;
- lower = bld.tmp(v4);
- aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
- mubuf->definitions[0] = Definition(lower);
- mubuf->operands[0] = vaddr;
- mubuf->operands[1] = Operand(rsrc);
- mubuf->operands[2] = soffset;
- mubuf->offen = (offset.type() == RegType::vgpr);
- mubuf->glc = glc;
- mubuf->dlc = dlc;
- mubuf->barrier = readonly ? barrier_none : barrier_buffer;
- mubuf->can_reorder = readonly;
- bld.insert(std::move(mubuf));
- emit_split_vector(ctx, lower, 2);
- num_bytes -= 16;
- const_offset = 16;
- }
-
- switch (num_bytes) {
- case 4:
- op = aco_opcode::buffer_load_dword;
- break;
- case 8:
- op = aco_opcode::buffer_load_dwordx2;
- break;
- case 12:
- op = aco_opcode::buffer_load_dwordx3;
- break;
- case 16:
- op = aco_opcode::buffer_load_dwordx4;
- break;
- default:
- unreachable("Load SSBO not implemented for this size.");
- }
- aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
- mubuf->operands[0] = vaddr;
- mubuf->operands[1] = Operand(rsrc);
- mubuf->operands[2] = soffset;
- mubuf->offen = (offset.type() == RegType::vgpr);
- mubuf->glc = glc;
- mubuf->dlc = dlc;
- mubuf->barrier = readonly ? barrier_none : barrier_buffer;
- mubuf->can_reorder = readonly;
- mubuf->offset = const_offset;
- aco_ptr<Instruction> instr = std::move(mubuf);
-
- if (dst.size() > 4) {
- assert(lower != Temp());
- Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
- instr->definitions[0] = Definition(upper);
- bld.insert(std::move(instr));
- if (dst.size() == 8)
- emit_split_vector(ctx, upper, 2);
- instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
- instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
- instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
- instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
- if (dst.size() == 8)
- instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
- }
-
- if (dst.type() == RegType::sgpr) {
- Temp vec = bld.tmp(RegType::vgpr, dst.size());
- instr->definitions[0] = Definition(vec);
- bld.insert(std::move(instr));
- expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1);
- } else {
- instr->definitions[0] = Definition(dst);
- bld.insert(std::move(instr));
- emit_split_vector(ctx, dst, num_components);
- }
- } else {
- switch (num_bytes) {
- case 4:
- op = aco_opcode::s_buffer_load_dword;
- break;
- case 8:
- op = aco_opcode::s_buffer_load_dwordx2;
- break;
- case 12:
- case 16:
- op = aco_opcode::s_buffer_load_dwordx4;
- break;
- case 24:
- case 32:
- op = aco_opcode::s_buffer_load_dwordx8;
- break;
- default:
- unreachable("Load SSBO not implemented for this size.");
- }
- aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
- load->operands[0] = Operand(rsrc);
- load->operands[1] = Operand(bld.as_uniform(offset));
- assert(load->operands[1].getTemp().type() == RegType::sgpr);
- load->definitions[0] = Definition(dst);
- load->glc = glc;
- load->dlc = dlc;
- load->barrier = readonly ? barrier_none : barrier_buffer;
- load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
- assert(ctx->options->chip_class >= GFX8 || !glc);
-
- /* trim vector */
- if (dst.size() == 3) {
- Temp vec = bld.tmp(s4);
- load->definitions[0] = Definition(vec);
- bld.insert(std::move(load));
- emit_split_vector(ctx, vec, 4);
-
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
- emit_extract_vector(ctx, vec, 0, s1),
- emit_extract_vector(ctx, vec, 1, s1),
- emit_extract_vector(ctx, vec, 2, s1));
- } else if (dst.size() == 6) {
- Temp vec = bld.tmp(s8);
- load->definitions[0] = Definition(vec);
- bld.insert(std::move(load));
- emit_split_vector(ctx, vec, 4);
-
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
- emit_extract_vector(ctx, vec, 0, s2),
- emit_extract_vector(ctx, vec, 1, s2),
- emit_extract_vector(ctx, vec, 2, s2));
- } else {
- bld.insert(std::move(load));
- }
- emit_split_vector(ctx, dst, num_components);
+ } else if (index.type() == RegType::vgpr) {
+ index = bld.vadd32(bld.def(v1),
+ nir_const_index ? Operand(const_index) : Operand(index),
+ Operand(desc_ptr));
+ } else {
+ index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+ nir_const_index ? Operand(const_index) : Operand(index),
+ Operand(desc_ptr));
}
+
+ bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
+}
+
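+/* Unified buffer load helper: uses SMEM when the result is uniform and a
+ * scalar load is safe (read-only, or GFX8+ with dword-sized components),
+ * otherwise falls back to MUBUF.
+ */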
+void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
+ Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
+ bool glc=false, bool readonly=true)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ bool use_smem = dst.type() != RegType::vgpr && ((ctx->options->chip_class >= GFX8 && component_size >= 4) || readonly);
+ if (use_smem)
+ offset = bld.as_uniform(offset);
+
+ LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
+ info.glc = glc;
+ info.barrier = readonly ? barrier_none : barrier_buffer;
+ info.can_reorder = readonly;
+ info.align_mul = align_mul;
+ info.align_offset = align_offset;
+ if (use_smem)
+ emit_smem_load(ctx, bld, &info);
+ else
+ emit_mubuf_load(ctx, bld, &info);
}
void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
rsrc = convert_pointer_to_64_bit(ctx, rsrc);
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
}
-
- load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
+ unsigned size = instr->dest.ssa.bit_size / 8;
+ load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
+ nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
}
void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
{
Builder bld(ctx->program, ctx->block);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-
unsigned offset = nir_intrinsic_base(instr);
+ unsigned count = instr->dest.ssa.num_components;
nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
- if (index_cv && instr->dest.ssa.bit_size == 32) {
- unsigned count = instr->dest.ssa.num_components;
+ if (index_cv && instr->dest.ssa.bit_size == 32) {
unsigned start = (offset + index_cv->u32) / 4u;
start -= ctx->args->ac.base_inline_push_consts;
if (start + count <= ctx->args->ac.num_inline_push_consts) {
Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
Temp vec = dst;
bool trim = false;
+ bool aligned = true;
+
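+ /* Sub-dword loads: SMEM operates on dwords, so when the constant is not
+ * dword-aligned, load into a scratch temporary and let byte_align_scalar
+ * shift the bytes into place. */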
+ if (instr->dest.ssa.bit_size == 8) {
+ aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
+ bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
+ if (!aligned)
+ vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
+ } else if (instr->dest.ssa.bit_size == 16) {
+ aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
+ if (!aligned)
+ vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
+ }
+
aco_opcode op;
- switch (dst.size()) {
+ switch (vec.size()) {
case 1:
op = aco_opcode::s_load_dword;
break;
bld.smem(op, Definition(vec), ptr, index);
+ if (!aligned) {
+ Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
+ byte_align_scalar(ctx, vec, byte_offset, dst);
+ return;
+ }
+
if (trim) {
emit_split_vector(ctx, vec, 4);
RegClass rc = dst.size() == 3 ? s1 : s2;
bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
Operand(MIN2(base + range, ctx->shader->constant_data_size)),
Operand(desc_type));
-
- load_buffer(ctx, instr->num_components, dst, rsrc, offset);
+ unsigned size = instr->dest.ssa.bit_size / 8;
+ // TODO: get alignment information for subdword constants
+ load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
}
void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
{
if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
- ctx->cf_info.exec_potentially_empty = true;
+ ctx->cf_info.exec_potentially_empty_discard = true;
ctx->program->needs_exact = true;
Builder bld(ctx->program, ctx->block);
if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
- ctx->cf_info.exec_potentially_empty = true;
+ ctx->cf_info.exec_potentially_empty_discard = true;
bool divergent = ctx->cf_info.parent_if.is_divergent ||
ctx->cf_info.parent_loop.has_divergent_continue;
ctx->block->kind |= block_kind_break;
unsigned idx = ctx->block->index;
+ ctx->cf_info.parent_loop.has_divergent_branch = true;
+ ctx->cf_info.nir_to_aco[instr->instr.block->index] = idx;
+
/* remove critical edges from linear CFG */
bld.branch(aco_opcode::p_branch);
Block* break_block = ctx->program->create_and_insert_block();
Operand off;
if (!index_set) {
- off = Operand(offset);
+ off = bld.copy(bld.def(s1), Operand(offset));
} else {
off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
* The sample index should be adjusted as follows:
* sample_index = (fmask >> (sample_index * 4)) & 0xF;
*/
-static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr)
+static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vector<Temp>& coords, Operand sample_index, Temp fmask_desc_ptr)
{
Builder bld(ctx->program, ctx->block);
Temp fmask = bld.tmp(v1);
? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
: 0;
- aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
- load->operands[0] = Operand(coords);
- load->operands[1] = Operand(fmask_desc_ptr);
+ Temp coord = da ? bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), coords[0], coords[1], coords[2]) :
+ bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), coords[0], coords[1]);
+ aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 3, 1)};
+ load->operands[0] = Operand(fmask_desc_ptr);
+ load->operands[1] = Operand(s4); /* no sampler */
+ load->operands[2] = Operand(coord);
load->definitions[0] = Definition(fmask);
load->glc = false;
load->dlc = false;
ctx->block->instructions.emplace_back(std::move(load));
Operand sample_index4;
- if (sample_index.isConstant() && sample_index.constantValue() < 16) {
- sample_index4 = Operand(sample_index.constantValue() << 2);
+ if (sample_index.isConstant()) {
+ if (sample_index.constantValue() < 16) {
+ sample_index4 = Operand(sample_index.constantValue() << 2);
+ } else {
+ sample_index4 = Operand(0u);
+ }
} else if (sample_index.regClass() == s1) {
sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
} else {
bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
int count = image_type_to_components_count(dim, is_array);
- std::vector<Operand> coords(count);
+ std::vector<Temp> coords(count);
+ Builder bld(ctx->program, ctx->block);
if (is_ms) {
- Operand sample_index;
- nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
- if (sample_cv)
- sample_index = Operand(sample_cv->u32);
- else
- sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1));
-
+ count--;
+ Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa);
+ /* get sample index */
if (instr->intrinsic == nir_intrinsic_image_deref_load) {
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)};
- for (unsigned i = 0; i < vec->operands.size(); i++)
- vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
- Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2};
- vec->definitions[0] = Definition(fmask_load_address);
- ctx->block->instructions.emplace_back(std::move(vec));
+ nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
+ Operand sample_index = sample_cv ? Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1));
+ std::vector<Temp> fmask_load_address;
+ for (unsigned i = 0; i < (is_array ? 3 : 2); i++)
+ fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1));
Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
- sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr));
+ coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr);
+ } else {
+ coords[count] = emit_extract_vector(ctx, src2, 0, v1);
}
- count--;
- coords[count] = sample_index;
}
- if (count == 1 && !gfx9_1d)
- return emit_extract_vector(ctx, src0, 0, v1);
-
if (gfx9_1d) {
- coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1));
+ coords[0] = emit_extract_vector(ctx, src0, 0, v1);
coords.resize(coords.size() + 1);
- coords[1] = Operand((uint32_t) 0);
+ coords[1] = bld.copy(bld.def(v1), Operand(0u));
if (is_array)
- coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1));
+ coords[2] = emit_extract_vector(ctx, src0, 1, v1);
} else {
for (int i = 0; i < count; i++)
- coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
+ coords[i] = emit_extract_vector(ctx, src0, i, v1);
}
if (instr->intrinsic == nir_intrinsic_image_deref_load ||
bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
if (!level_zero)
- coords.emplace_back(Operand(get_ssa_temp(ctx, instr->src[lod_index].ssa)));
+ coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
}
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
for (unsigned i = 0; i < coords.size(); i++)
- vec->operands[i] = coords[i];
+ vec->operands[i] = Operand(coords[i]);
Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
vec->definitions[0] = Definition(res);
ctx->block->instructions.emplace_back(std::move(vec));
unreachable(">4 channel buffer image load");
}
aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
- load->operands[0] = Operand(vindex);
- load->operands[1] = Operand(rsrc);
+ load->operands[0] = Operand(rsrc);
+ load->operands[1] = Operand(vindex);
load->operands[2] = Operand((uint32_t) 0);
Temp tmp;
if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
- aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 2, 1)};
- load->operands[0] = Operand(coords);
- load->operands[1] = Operand(resource);
+ aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1)};
+ load->operands[0] = Operand(resource);
+ load->operands[1] = Operand(s4); /* no sampler */
+ load->operands[2] = Operand(coords);
load->definitions[0] = Definition(tmp);
load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
unreachable(">4 channel buffer image store");
}
aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
- store->operands[0] = Operand(vindex);
- store->operands[1] = Operand(rsrc);
+ store->operands[0] = Operand(rsrc);
+ store->operands[1] = Operand(vindex);
store->operands[2] = Operand((uint32_t) 0);
store->operands[3] = Operand(data);
store->idxen = true;
bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
- aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 4, 0)};
- store->operands[0] = Operand(coords);
- store->operands[1] = Operand(resource);
- store->operands[2] = Operand(s4);
- store->operands[3] = Operand(data);
+ aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 0)};
+ store->operands[0] = Operand(resource);
+ store->operands[1] = Operand(data);
+ store->operands[2] = Operand(coords);
store->glc = glc;
store->dlc = false;
store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
//assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
- mubuf->operands[0] = Operand(vindex);
- mubuf->operands[1] = Operand(resource);
+ mubuf->operands[0] = Operand(resource);
+ mubuf->operands[1] = Operand(vindex);
mubuf->operands[2] = Operand((uint32_t)0);
mubuf->operands[3] = Operand(data);
if (return_previous)
Temp coords = get_image_coords(ctx, instr, type);
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
- aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
- mimg->operands[0] = Operand(coords);
- mimg->operands[1] = Operand(resource);
- mimg->operands[2] = Operand(s4); /* no sampler */
- mimg->operands[3] = Operand(data);
+ aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 3, return_previous ? 1 : 0)};
+ mimg->operands[0] = Operand(resource);
+ mimg->operands[1] = Operand(data);
+ mimg->operands[2] = Operand(coords);
if (return_previous)
mimg->definitions[0] = Definition(dst);
mimg->glc = return_previous;
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
- mimg->operands[0] = Operand(lod);
- mimg->operands[1] = Operand(resource);
+ aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)};
+ mimg->operands[0] = Operand(resource);
+ mimg->operands[1] = Operand(s4); /* no sampler */
+ mimg->operands[2] = Operand(lod);
uint8_t& dmask = mimg->dmask;
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
- load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc, false);
+ unsigned size = instr->dest.ssa.bit_size / 8;
+ load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
+ nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false);
}
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
Builder bld(ctx->program, ctx->block);
Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
- unsigned writemask = nir_intrinsic_write_mask(instr);
+ unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
- ctx->options->chip_class >= GFX8;
+ ctx->options->chip_class >= GFX8 &&
+ elem_size_bytes >= 4;
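+ /* sub-dword stores must take the VMEM path: there are no byte/short SMEM stores */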
if (smem)
offset = bld.as_uniform(offset);
bool smem_nonfs = smem && ctx->stage != fragment_fs;
- while (writemask) {
- int start, count;
- u_bit_scan_consecutive_range(&writemask, &start, &count);
- if (count == 3 && smem) {
- writemask |= 1u << (start + 2);
- count = 2;
- }
- int num_bytes = count * elem_size_bytes;
-
- if (num_bytes > 16) {
- assert(elem_size_bytes == 8);
- writemask |= (((count - 2) << 1) - 1) << (start + 2);
- count = 2;
- num_bytes = 16;
- }
-
- // TODO: check alignment of sub-dword stores
- // TODO: split 3 bytes. there is no store instruction for that
-
- Temp write_data;
- if (count != instr->num_components) {
- emit_split_vector(ctx, data, instr->num_components);
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
- for (int i = 0; i < count; i++) {
- Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
- vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
- }
- write_data = bld.tmp(!smem ? RegType::vgpr : smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
- vec->definitions[0] = Definition(write_data);
- ctx->block->instructions.emplace_back(std::move(vec));
- } else if (!smem && data.type() != RegType::vgpr) {
- assert(num_bytes % 4 == 0);
- write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
- } else if (smem_nonfs && data.type() == RegType::vgpr) {
- assert(num_bytes % 4 == 0);
- write_data = bld.as_uniform(data);
- } else {
- write_data = data;
- }
+ unsigned write_count = 0;
+ Temp write_datas[32];
+ unsigned offsets[32];
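+ /* split the writemask into contiguous chunks of at most 16 bytes that map to store opcodes */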
+ split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
+ data, writemask, 16, &write_count, write_datas, offsets);
- aco_opcode vmem_op, smem_op;
- switch (num_bytes) {
- case 4:
- vmem_op = aco_opcode::buffer_store_dword;
- smem_op = aco_opcode::s_buffer_store_dword;
- break;
- case 8:
- vmem_op = aco_opcode::buffer_store_dwordx2;
- smem_op = aco_opcode::s_buffer_store_dwordx2;
- break;
- case 12:
- vmem_op = aco_opcode::buffer_store_dwordx3;
- smem_op = aco_opcode::last_opcode;
- assert(!smem);
- break;
- case 16:
- vmem_op = aco_opcode::buffer_store_dwordx4;
- smem_op = aco_opcode::s_buffer_store_dwordx4;
- break;
- default:
- unreachable("Store SSBO not implemented for this size.");
- }
- if (ctx->stage == fragment_fs)
- smem_op = aco_opcode::p_fs_buffer_store_smem;
+ for (unsigned i = 0; i < write_count; i++) {
+ aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
+ if (smem && ctx->stage == fragment_fs)
+ op = aco_opcode::p_fs_buffer_store_smem;
if (smem) {
- aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
+ aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
store->operands[0] = Operand(rsrc);
- if (start) {
+ if (offsets[i]) {
Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
- offset, Operand(start * elem_size_bytes));
+ offset, Operand(offsets[i]));
store->operands[1] = Operand(off);
} else {
store->operands[1] = Operand(offset);
}
- if (smem_op != aco_opcode::p_fs_buffer_store_smem)
+ if (op != aco_opcode::p_fs_buffer_store_smem)
store->operands[1].setFixed(m0);
- store->operands[2] = Operand(write_data);
+ store->operands[2] = Operand(write_datas[i]);
store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
store->dlc = false;
store->disable_wqm = true;
store->barrier = barrier_buffer;
ctx->block->instructions.emplace_back(std::move(store));
ctx->program->wb_smem_l1_on_end = true;
- if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
+ if (op == aco_opcode::p_fs_buffer_store_smem) {
ctx->block->kind |= block_kind_needs_lowering;
ctx->program->needs_exact = true;
}
} else {
- aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
- store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
- store->operands[1] = Operand(rsrc);
+ aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
+ store->operands[0] = Operand(rsrc);
+ store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
- store->operands[3] = Operand(write_data);
- store->offset = start * elem_size_bytes;
+ store->operands[3] = Operand(write_datas[i]);
+ store->offset = offsets[i];
store->offen = (offset.type() == RegType::vgpr);
store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
store->dlc = false;
}
aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
- mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
- mubuf->operands[1] = Operand(rsrc);
+ mubuf->operands[0] = Operand(rsrc);
+ mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
mubuf->operands[3] = Operand(data);
if (return_previous)
{
Builder bld(ctx->program, ctx->block);
unsigned num_components = instr->num_components;
- unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
+ unsigned component_size = instr->dest.ssa.bit_size / 8;
+
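+ /* describe the load once; the emit_*_load helpers handle splitting, alignment and opcode selection */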
+ LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
+ get_ssa_temp(ctx, &instr->dest.ssa),
+ num_components, component_size};
+ info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
+ info.align_mul = nir_intrinsic_align_mul(instr);
+ info.align_offset = nir_intrinsic_align_offset(instr);
+ info.barrier = barrier_buffer;
+ info.can_reorder = false;
+ /* VMEM stores don't update the SMEM cache and it's difficult to prove that
+ * it's safe to use SMEM */
+ bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
+ if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || !can_use_smem) {
+ emit_global_load(ctx, bld, &info);
+ } else {
+ info.offset = Operand(bld.as_uniform(info.offset));
+ emit_smem_load(ctx, bld, &info);
+ }
+}
- Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
+void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ Builder bld(ctx->program, ctx->block);
+ unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+ unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
- bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
- bool dlc = glc && ctx->options->chip_class >= GFX10;
- aco_opcode op;
- if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
- bool global = ctx->options->chip_class >= GFX9;
- aco_opcode op;
- switch (num_bytes) {
- case 4:
- op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
- break;
- case 8:
- op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
- break;
- case 12:
- op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
- break;
- case 16:
- op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
- break;
- default:
- unreachable("load_global not implemented for this size.");
- }
- aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
- flat->operands[0] = Operand(addr);
- flat->operands[1] = Operand(s1);
- flat->glc = glc;
- flat->dlc = dlc;
- flat->barrier = barrier_buffer;
+ Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+ Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
+ bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+
+ if (ctx->options->chip_class >= GFX7)
+ addr = as_vgpr(ctx, addr);
+
+ unsigned write_count = 0;
+ Temp write_datas[32];
+ unsigned offsets[32];
+ split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
+ 16, &write_count, write_datas, offsets);
+
+ for (unsigned i = 0; i < write_count; i++) {
+ if (ctx->options->chip_class >= GFX7) {
+ unsigned offset = offsets[i];
+ Temp store_addr = addr;
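+ /* flat_* on GFX7/GFX8 lack an immediate offset field, so fold it into the address up front */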
+ if (offset > 0 && ctx->options->chip_class < GFX9) {
+ Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
+ Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
+ Temp carry = bld.tmp(bld.lm);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
+
+ bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
+ Operand(offset), addr0);
+ bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
+ Operand(0u), addr1,
+ carry).def(1).setHint(vcc);
+
+ store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
+
+ offset = 0;
+ }
- if (dst.type() == RegType::sgpr) {
- Temp vec = bld.tmp(RegType::vgpr, dst.size());
- flat->definitions[0] = Definition(vec);
- ctx->block->instructions.emplace_back(std::move(flat));
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
- } else {
- flat->definitions[0] = Definition(dst);
- ctx->block->instructions.emplace_back(std::move(flat));
- }
- emit_split_vector(ctx, dst, num_components);
- } else {
- switch (num_bytes) {
+ bool global = ctx->options->chip_class >= GFX9;
+ aco_opcode op;
+ switch (write_datas[i].bytes()) {
+ case 1:
+ op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte;
+ break;
+ case 2:
+ op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short;
+ break;
case 4:
- op = aco_opcode::s_load_dword;
+ op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
break;
case 8:
- op = aco_opcode::s_load_dwordx2;
+ op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
break;
case 12:
+ op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
+ break;
case 16:
- op = aco_opcode::s_load_dwordx4;
+ op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
break;
default:
- unreachable("load_global not implemented for this size.");
- }
- aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
- load->operands[0] = Operand(addr);
- load->operands[1] = Operand(0u);
- load->definitions[0] = Definition(dst);
- load->glc = glc;
- load->dlc = dlc;
- load->barrier = barrier_buffer;
- assert(ctx->options->chip_class >= GFX8 || !glc);
-
- if (dst.size() == 3) {
- /* trim vector */
- Temp vec = bld.tmp(s4);
- load->definitions[0] = Definition(vec);
- ctx->block->instructions.emplace_back(std::move(load));
- emit_split_vector(ctx, vec, 4);
+ unreachable("store_global not implemented for this size.");
+ }
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
- emit_extract_vector(ctx, vec, 0, s1),
- emit_extract_vector(ctx, vec, 1, s1),
- emit_extract_vector(ctx, vec, 2, s1));
+ aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
+ flat->operands[0] = Operand(store_addr);
+ flat->operands[1] = Operand(s1);
+ flat->operands[2] = Operand(write_datas[i]);
+ flat->glc = glc;
+ flat->dlc = false;
+ flat->offset = offset;
+ flat->disable_wqm = true;
+ flat->barrier = barrier_buffer;
+ ctx->program->needs_exact = true;
+ ctx->block->instructions.emplace_back(std::move(flat));
} else {
- ctx->block->instructions.emplace_back(std::move(load));
- }
- }
-}
-
-void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
-{
- Builder bld(ctx->program, ctx->block);
- unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
-
- Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
- Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
-
- unsigned writemask = nir_intrinsic_write_mask(instr);
- while (writemask) {
- int start, count;
- u_bit_scan_consecutive_range(&writemask, &start, &count);
- unsigned num_bytes = count * elem_size_bytes;
+ assert(ctx->options->chip_class == GFX6);
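+ /* GFX6 has no FLAT instructions: emulate global stores with MUBUF + addr64 */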
- Temp write_data = data;
- if (count != instr->num_components) {
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
- for (int i = 0; i < count; i++)
- vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
- write_data = bld.tmp(RegType::vgpr, count);
- vec->definitions[0] = Definition(write_data);
- ctx->block->instructions.emplace_back(std::move(vec));
- }
-
- unsigned offset = start * elem_size_bytes;
- if (offset > 0 && ctx->options->chip_class < GFX9) {
- Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
- Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
- Temp carry = bld.tmp(bld.lm);
- bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
+ aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
- bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
- Operand(offset), addr0);
- bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
- Operand(0u), addr1,
- carry).def(1).setHint(vcc);
+ Temp rsrc = get_gfx6_global_rsrc(bld, addr);
- addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
-
- offset = 0;
- }
-
- bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
- bool global = ctx->options->chip_class >= GFX9;
- aco_opcode op;
- switch (num_bytes) {
- case 4:
- op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
- break;
- case 8:
- op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
- break;
- case 12:
- op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
- break;
- case 16:
- op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
- break;
- default:
- unreachable("store_global not implemented for this size.");
+ aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
+ mubuf->operands[0] = Operand(rsrc);
+ mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
+ mubuf->operands[2] = Operand(0u);
+ mubuf->operands[3] = Operand(write_datas[i]);
+ mubuf->glc = glc;
+ mubuf->dlc = false;
+ mubuf->offset = offsets[i];
+ mubuf->addr64 = addr.type() == RegType::vgpr;
+ mubuf->disable_wqm = true;
+ mubuf->barrier = barrier_buffer;
+ ctx->program->needs_exact = true;
+ ctx->block->instructions.emplace_back(std::move(mubuf));
}
- aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
- flat->operands[0] = Operand(addr);
- flat->operands[1] = Operand(s1);
- flat->operands[2] = Operand(data);
- flat->glc = glc;
- flat->dlc = false;
- flat->offset = offset;
- flat->disable_wqm = true;
- flat->barrier = barrier_buffer;
- ctx->program->needs_exact = true;
- ctx->block->instructions.emplace_back(std::move(flat));
}
}
}
Builder bld(ctx->program, ctx->block);
- Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+ Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
+ if (ctx->options->chip_class >= GFX7)
+ addr = as_vgpr(ctx, addr);
+
if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
get_ssa_temp(ctx, instr->src[2].ssa), data);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- bool global = ctx->options->chip_class >= GFX9;
aco_opcode op32, op64;
- switch (instr->intrinsic) {
- case nir_intrinsic_global_atomic_add:
- op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
- op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
- break;
- case nir_intrinsic_global_atomic_imin:
- op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
- op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
- break;
- case nir_intrinsic_global_atomic_umin:
- op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
- op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
- break;
- case nir_intrinsic_global_atomic_imax:
- op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
- op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
- break;
- case nir_intrinsic_global_atomic_umax:
- op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
- op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
- break;
- case nir_intrinsic_global_atomic_and:
- op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
- op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
- break;
- case nir_intrinsic_global_atomic_or:
- op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
- op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
- break;
- case nir_intrinsic_global_atomic_xor:
- op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
- op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
- break;
- case nir_intrinsic_global_atomic_exchange:
- op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
- op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
- break;
- case nir_intrinsic_global_atomic_comp_swap:
- op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
- op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
- break;
- default:
- unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
+
+ if (ctx->options->chip_class >= GFX7) {
+ bool global = ctx->options->chip_class >= GFX9;
+ switch (instr->intrinsic) {
+ case nir_intrinsic_global_atomic_add:
+ op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
+ op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
+ break;
+ case nir_intrinsic_global_atomic_imin:
+ op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
+ op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
+ break;
+ case nir_intrinsic_global_atomic_umin:
+ op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
+ op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
+ break;
+ case nir_intrinsic_global_atomic_imax:
+ op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
+ op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
+ break;
+ case nir_intrinsic_global_atomic_umax:
+ op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
+ op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
+ break;
+ case nir_intrinsic_global_atomic_and:
+ op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
+ op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
+ break;
+ case nir_intrinsic_global_atomic_or:
+ op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
+ op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
+ break;
+ case nir_intrinsic_global_atomic_xor:
+ op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
+ op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
+ break;
+ case nir_intrinsic_global_atomic_exchange:
+ op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
+ op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
+ break;
+ case nir_intrinsic_global_atomic_comp_swap:
+ op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
+ op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
+ break;
+ default:
+ unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
+ }
+
+ aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
+ aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
+ flat->operands[0] = Operand(addr);
+ flat->operands[1] = Operand(s1);
+ flat->operands[2] = Operand(data);
+ if (return_previous)
+ flat->definitions[0] = Definition(dst);
+ flat->glc = return_previous;
+ flat->dlc = false; /* Not needed for atomics */
+ flat->offset = 0;
+ flat->disable_wqm = true;
+ flat->barrier = barrier_buffer;
+ ctx->program->needs_exact = true;
+ ctx->block->instructions.emplace_back(std::move(flat));
+ } else {
+ assert(ctx->options->chip_class == GFX6);
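+ /* GFX6: no FLAT atomics either, so use buffer atomics with the 64-bit address in vaddr */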
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_global_atomic_add:
+ op32 = aco_opcode::buffer_atomic_add;
+ op64 = aco_opcode::buffer_atomic_add_x2;
+ break;
+ case nir_intrinsic_global_atomic_imin:
+ op32 = aco_opcode::buffer_atomic_smin;
+ op64 = aco_opcode::buffer_atomic_smin_x2;
+ break;
+ case nir_intrinsic_global_atomic_umin:
+ op32 = aco_opcode::buffer_atomic_umin;
+ op64 = aco_opcode::buffer_atomic_umin_x2;
+ break;
+ case nir_intrinsic_global_atomic_imax:
+ op32 = aco_opcode::buffer_atomic_smax;
+ op64 = aco_opcode::buffer_atomic_smax_x2;
+ break;
+ case nir_intrinsic_global_atomic_umax:
+ op32 = aco_opcode::buffer_atomic_umax;
+ op64 = aco_opcode::buffer_atomic_umax_x2;
+ break;
+ case nir_intrinsic_global_atomic_and:
+ op32 = aco_opcode::buffer_atomic_and;
+ op64 = aco_opcode::buffer_atomic_and_x2;
+ break;
+ case nir_intrinsic_global_atomic_or:
+ op32 = aco_opcode::buffer_atomic_or;
+ op64 = aco_opcode::buffer_atomic_or_x2;
+ break;
+ case nir_intrinsic_global_atomic_xor:
+ op32 = aco_opcode::buffer_atomic_xor;
+ op64 = aco_opcode::buffer_atomic_xor_x2;
+ break;
+ case nir_intrinsic_global_atomic_exchange:
+ op32 = aco_opcode::buffer_atomic_swap;
+ op64 = aco_opcode::buffer_atomic_swap_x2;
+ break;
+ case nir_intrinsic_global_atomic_comp_swap:
+ op32 = aco_opcode::buffer_atomic_cmpswap;
+ op64 = aco_opcode::buffer_atomic_cmpswap_x2;
+ break;
+ default:
+ unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
+ }
+
+ Temp rsrc = get_gfx6_global_rsrc(bld, addr);
+
+ aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
+
+ aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
+ mubuf->operands[0] = Operand(rsrc);
+ mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
+ mubuf->operands[2] = Operand(0u);
+ mubuf->operands[3] = Operand(data);
+ if (return_previous)
+ mubuf->definitions[0] = Definition(dst);
+ mubuf->glc = return_previous;
+ mubuf->dlc = false;
+ mubuf->offset = 0;
+ mubuf->addr64 = addr.type() == RegType::vgpr;
+ mubuf->disable_wqm = true;
+ mubuf->barrier = barrier_buffer;
+ ctx->program->needs_exact = true;
+ ctx->block->instructions.emplace_back(std::move(mubuf));
}
- aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
- aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
- flat->operands[0] = Operand(addr);
- flat->operands[1] = Operand(s1);
- flat->operands[2] = Operand(data);
- if (return_previous)
- flat->definitions[0] = Definition(dst);
- flat->glc = return_previous;
- flat->dlc = false; /* Not needed for atomics */
- flat->offset = 0;
- flat->disable_wqm = true;
- flat->barrier = barrier_buffer;
- ctx->program->needs_exact = true;
- ctx->block->instructions.emplace_back(std::move(flat));
}
void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
switch(instr->intrinsic) {
case nir_intrinsic_group_memory_barrier:
case nir_intrinsic_memory_barrier:
- bld.barrier(aco_opcode::p_memory_barrier_all);
- break;
- case nir_intrinsic_memory_barrier_atomic_counter:
- bld.barrier(aco_opcode::p_memory_barrier_atomic);
+ bld.barrier(aco_opcode::p_memory_barrier_common);
break;
case nir_intrinsic_memory_barrier_buffer:
bld.barrier(aco_opcode::p_memory_barrier_buffer);
break;
case nir_intrinsic_memory_barrier_image:
bld.barrier(aco_opcode::p_memory_barrier_image);
break;
+ case nir_intrinsic_memory_barrier_tcs_patch:
case nir_intrinsic_memory_barrier_shared:
bld.barrier(aco_opcode::p_memory_barrier_shared);
break;
{
// TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
Builder bld(ctx->program, ctx->block);
Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
- assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
{
unsigned offset = nir_intrinsic_base(instr);
- Operand m = load_lds_size_m0(ctx);
+ Builder bld(ctx->program, ctx->block);
+ Operand m = load_lds_size_m0(bld);
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
}
if (offset > 65535) {
- Builder bld(ctx->program, ctx->block);
address = bld.vadd32(bld.def(v1), Operand(offset), address);
offset = 0;
}
}
/* older generations need element size = 16 bytes. element size removed in GFX9 */
- if (ctx->program->chip_class <= GFX8)
- rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
-
- return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
-}
-
-void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
- assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
- Builder bld(ctx->program, ctx->block);
- Temp rsrc = get_scratch_resource(ctx);
- Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
- Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-
- aco_opcode op;
- switch (dst.size()) {
- case 1:
- op = aco_opcode::buffer_load_dword;
- break;
- case 2:
- op = aco_opcode::buffer_load_dwordx2;
- break;
- case 3:
- op = aco_opcode::buffer_load_dwordx3;
- break;
- case 4:
- op = aco_opcode::buffer_load_dwordx4;
- break;
- case 6:
- case 8: {
- std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
- Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
- bld.def(v4), offset, rsrc,
- ctx->program->scratch_offset, 0, true);
- Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
- aco_opcode::buffer_load_dwordx4,
- dst.size() == 6 ? bld.def(v2) : bld.def(v4),
- offset, rsrc, ctx->program->scratch_offset, 16, true);
- emit_split_vector(ctx, lower, 2);
- elems[0] = emit_extract_vector(ctx, lower, 0, v2);
- elems[1] = emit_extract_vector(ctx, lower, 1, v2);
- if (dst.size() == 8) {
- emit_split_vector(ctx, upper, 2);
- elems[2] = emit_extract_vector(ctx, upper, 0, v2);
- elems[3] = emit_extract_vector(ctx, upper, 1, v2);
- } else {
- elems[2] = upper;
- }
-
- aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
- Format::PSEUDO, dst.size() / 2, 1)};
- for (unsigned i = 0; i < dst.size() / 2; i++)
- vec->operands[i] = Operand(elems[i]);
- vec->definitions[0] = Definition(dst);
- bld.insert(std::move(vec));
- ctx->allocated_vec.emplace(dst.id(), elems);
- return;
- }
- default:
- unreachable("Wrong dst size for nir_intrinsic_load_scratch");
- }
+ if (ctx->program->chip_class <= GFX8)
+ rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
+
+ return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
+}
+
+void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
+ Builder bld(ctx->program, ctx->block);
+ Temp rsrc = get_scratch_resource(ctx);
+ Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- bld.mubuf(op, Definition(dst), offset, rsrc, ctx->program->scratch_offset, 0, true);
- emit_split_vector(ctx, dst, instr->num_components);
+ LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
+ instr->dest.ssa.bit_size / 8u, rsrc};
+ info.align_mul = nir_intrinsic_align_mul(instr);
+ info.align_offset = nir_intrinsic_align_offset(instr);
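+ /* scratch is swizzled in 16-byte elements: keep each load within one element */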
+ info.swizzle_component_size = 16;
+ info.can_reorder = false;
+ info.soffset = ctx->program->scratch_offset;
+ emit_mubuf_load(ctx, bld, &info);
}
void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
- assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
Builder bld(ctx->program, ctx->block);
Temp rsrc = get_scratch_resource(ctx);
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
- unsigned writemask = nir_intrinsic_write_mask(instr);
-
- while (writemask) {
- int start, count;
- u_bit_scan_consecutive_range(&writemask, &start, &count);
- int num_bytes = count * elem_size_bytes;
+ unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
- if (num_bytes > 16) {
- assert(elem_size_bytes == 8);
- writemask |= (((count - 2) << 1) - 1) << (start + 2);
- count = 2;
- num_bytes = 16;
- }
-
- // TODO: check alignment of sub-dword stores
- // TODO: split 3 bytes. there is no store instruction for that
-
- Temp write_data;
- if (count != instr->num_components) {
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
- for (int i = 0; i < count; i++) {
- Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
- vec->operands[i] = Operand(elem);
- }
- write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
- vec->definitions[0] = Definition(write_data);
- ctx->block->instructions.emplace_back(std::move(vec));
- } else {
- write_data = data;
- }
-
- aco_opcode op;
- switch (num_bytes) {
- case 4:
- op = aco_opcode::buffer_store_dword;
- break;
- case 8:
- op = aco_opcode::buffer_store_dwordx2;
- break;
- case 12:
- op = aco_opcode::buffer_store_dwordx3;
- break;
- case 16:
- op = aco_opcode::buffer_store_dwordx4;
- break;
- default:
- unreachable("Invalid data size for nir_intrinsic_store_scratch.");
- }
+ unsigned write_count = 0;
+ Temp write_datas[32];
+ unsigned offsets[32];
+ split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
+ 16, &write_count, write_datas, offsets);
- bld.mubuf(op, offset, rsrc, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
+ for (unsigned i = 0; i < write_count; i++) {
+ aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
+ bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
}
}
bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
}
+void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) {
+ Builder bld(ctx->program, ctx->block);
+
+ unsigned stream = nir_intrinsic_stream_id(instr);
+ Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+ next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
+ nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]);
+
+ /* get GSVS ring */
+ Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u));
+
+ unsigned num_components =
+ ctx->program->info->gs.num_stream_output_components[stream];
+ assert(num_components);
+
+ unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
+ unsigned stream_offset = 0;
+ for (unsigned i = 0; i < stream; i++) {
+ unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out;
+ stream_offset += prev_stride * ctx->program->wave_size;
+ }
+
+ /* Limit on the stride field for <= GFX7. */
+ assert(stride < (1 << 14));
+
+ Temp gsvs_dwords[4];
+ for (unsigned i = 0; i < 4; i++)
+ gsvs_dwords[i] = bld.tmp(s1);
+ bld.pseudo(aco_opcode::p_split_vector,
+ Definition(gsvs_dwords[0]),
+ Definition(gsvs_dwords[1]),
+ Definition(gsvs_dwords[2]),
+ Definition(gsvs_dwords[3]),
+ gsvs_ring);
+
+ if (stream_offset) {
+ Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset));
+
+ Temp carry = bld.tmp(s1);
+ gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp);
+ gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry));
+ }
+
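+ /* fold the ring stride into dword1 and set num_records (dword2) to the wave size */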
+ gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride)));
+ gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size));
+
+ gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
+ gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]);
+
+ unsigned offset = 0;
+ for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
+ if (ctx->program->info->gs.output_streams[i] != stream)
+ continue;
+
+ for (unsigned j = 0; j < 4; j++) {
+ if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
+ continue;
+
+ if (ctx->outputs.mask[i] & (1 << j)) {
+ Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
+ unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
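+ /* the MTBUF offset field is 12 bits, so move anything >= 4096 into vaddr */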
+ if (const_offset >= 4096u) {
+ if (vaddr_offset.isUndefined())
+ vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u));
+ else
+ vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset);
+ const_offset %= 4096u;
+ }
+
+ aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
+ mtbuf->operands[0] = Operand(gsvs_ring);
+ mtbuf->operands[1] = vaddr_offset;
+ mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset));
+ mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
+ mtbuf->offen = !vaddr_offset.isUndefined();
+ mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
+ mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
+ mtbuf->offset = const_offset;
+ mtbuf->glc = true;
+ mtbuf->slc = true;
+ mtbuf->barrier = barrier_gs_data;
+ mtbuf->can_reorder = true;
+ bld.insert(std::move(mtbuf));
+ }
+
+ offset += ctx->shader->info.gs.vertices_out;
+ }
+
+ /* outputs for the next vertex are undefined and keeping them around can
+ * create invalid IR with control flow */
+ ctx->outputs.mask[i] = 0;
+ }
+
+ bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
+}
+
Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
{
Builder bld(ctx->program, ctx->block);
} else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
//subgroupAnd(val) -> (exec & ~val) == 0
Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
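+ /* compute "any active lane is false" in WQM, then negate for the subgroupAnd result */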
- return bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp));
+ Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
+ return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
} else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
//subgroupOr(val) -> (val & exec) != 0
Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
emit_split_vector(ctx, dst, 2);
break;
}
+ case nir_intrinsic_load_barycentric_model: {
+ Temp model = get_arg(ctx, ctx->args->ac.pull_model);
+
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ Temp p1 = emit_extract_vector(ctx, model, 0, v1);
+ Temp p2 = emit_extract_vector(ctx, model, 1, v1);
+ Temp p3 = emit_extract_vector(ctx, model, 2, v1);
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+ Operand(p1), Operand(p2), Operand(p3));
+ emit_split_vector(ctx, dst, 3);
+ break;
+ }
case nir_intrinsic_load_barycentric_at_sample: {
uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
switch (ctx->options->key.fs.num_samples) {
offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
}
- sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(offset));
+
+ Operand off = bld.copy(bld.def(s1), Operand(offset));
+ sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
} else if (ctx->options->chip_class >= GFX9) {
addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset);
- } else {
+ } else if (ctx->options->chip_class >= GFX7) {
/* addr += private_segment_buffer + sample_pos_offset */
Temp tmp0 = bld.tmp(s1);
Temp tmp1 = bld.tmp(s1);
/* sample_pos = flat_load_dwordx2 addr */
sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
+ } else {
+ assert(ctx->options->chip_class == GFX6);
+
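+ /* GFX6 has no FLAT: do the load through an addr64 MUBUF access with a bare resource descriptor */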
+ uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+ Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(0u), Operand(rsrc_conf));
+
+ addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
+ addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand(0u));
+
+ sample_pos = bld.tmp(v2);
+
+ aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
+ load->definitions[0] = Definition(sample_pos);
+ load->operands[0] = Operand(rsrc);
+ load->operands[1] = Operand(addr);
+ load->operands[2] = Operand(0u);
+ load->offset = sample_pos_offset;
+ load->offen = 0;
+ load->addr64 = true;
+ load->glc = false;
+ load->dlc = false;
+ load->disable_wqm = false;
+ load->barrier = barrier_none;
+ load->can_reorder = true;
+ ctx->block->instructions.emplace_back(std::move(load));
}
/* sample_pos -= 0.5 */
Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc);
break;
}
- case nir_intrinsic_load_view_index:
- case nir_intrinsic_load_layer_id: {
- if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
+ case nir_intrinsic_load_view_index: {
+ if (ctx->stage & (sw_vs | sw_gs | sw_tcs | sw_tes)) {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
break;
}
+ /* fallthrough */
+ }
+ case nir_intrinsic_load_layer_id: {
unsigned idx = nir_intrinsic_base(instr);
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
break;
}
+ case nir_intrinsic_load_tess_coord:
+ visit_load_tess_coord(ctx, instr);
+ break;
case nir_intrinsic_load_interpolated_input:
visit_load_interpolated_input(ctx, instr);
break;
visit_store_output(ctx, instr);
break;
case nir_intrinsic_load_input:
+ case nir_intrinsic_load_input_vertex:
visit_load_input(ctx, instr);
break;
+ case nir_intrinsic_load_output:
+ visit_load_output(ctx, instr);
+ break;
+ case nir_intrinsic_load_per_vertex_input:
+ visit_load_per_vertex_input(ctx, instr);
+ break;
+ case nir_intrinsic_load_per_vertex_output:
+ visit_load_per_vertex_output(ctx, instr);
+ break;
+ case nir_intrinsic_store_per_vertex_output:
+ visit_store_per_vertex_output(ctx, instr);
+ break;
case nir_intrinsic_load_ubo:
visit_load_ubo(ctx, instr);
break;
case nir_intrinsic_get_buffer_size:
visit_get_buffer_size(ctx, instr);
break;
- case nir_intrinsic_barrier: {
- unsigned* bsize = ctx->program->info->cs.block_size;
- unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
- if (workgroup_size > ctx->program->wave_size)
+ case nir_intrinsic_control_barrier: {
+ if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
+ /* GFX6 only (thanks to a hw bug workaround):
+ * The real barrier instruction isn't needed, because an entire patch
+ * always fits into a single wave.
+ */
+ break;
+ }
+
+ if (ctx->program->workgroup_size > ctx->program->wave_size)
bld.sopp(aco_opcode::s_barrier);
+
break;
}
+ case nir_intrinsic_memory_barrier_tcs_patch:
case nir_intrinsic_group_memory_barrier:
case nir_intrinsic_memory_barrier:
- case nir_intrinsic_memory_barrier_atomic_counter:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier_shared:
assert(dst.regClass() == bld.lm);
Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
- Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp));
- emit_wqm(ctx, val, dst);
+ Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
+ bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
break;
}
case nir_intrinsic_vote_any: {
assert(src.regClass() == bld.lm);
assert(dst.regClass() == bld.lm);
- Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
- Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), Operand(0u), bld.scc(tmp));
- emit_wqm(ctx, val, dst);
+ Temp tmp = bool_to_scalar_condition(ctx, src);
+ bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
break;
}
case nir_intrinsic_reduce:
break;
}
case nir_intrinsic_demote:
- bld.pseudo(aco_opcode::p_demote_to_helper);
+ bld.pseudo(aco_opcode::p_demote_to_helper, Operand(-1u));
+
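+ /* a demote inside divergent control flow can leave the exec mask empty */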
+ if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
+ ctx->cf_info.exec_potentially_empty_discard = true;
ctx->block->kind |= block_kind_uses_demote;
ctx->program->needs_exact = true;
break;
assert(src.regClass() == bld.lm);
Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
bld.pseudo(aco_opcode::p_demote_to_helper, cond);
+
+ if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
+ ctx->cf_info.exec_potentially_empty_discard = true;
ctx->block->kind |= block_kind_uses_demote;
ctx->program->needs_exact = true;
break;
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
break;
}
+ case nir_intrinsic_load_invocation_id: {
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+ if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
+ if (ctx->options->chip_class >= GFX10)
+ bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id));
+ else
+ bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
+ } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
+ bld.vop3(aco_opcode::v_bfe_u32, Definition(dst),
+ get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(8u), Operand(5u));
+ } else {
+ unreachable("Unsupported stage for load_invocation_id");
+ }
+
+ break;
+ }
+ case nir_intrinsic_load_primitive_id: {
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+ switch (ctx->shader->info.stage) {
+ case MESA_SHADER_GEOMETRY:
+ bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
+ break;
+ case MESA_SHADER_TESS_CTRL:
+ bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
+ break;
+ case MESA_SHADER_TESS_EVAL:
+ bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
+ break;
+ default:
+ unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
+ }
+
+ break;
+ }
+ case nir_intrinsic_load_patch_vertices_in: {
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
+ ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
+
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ bld.copy(Definition(dst), Operand(ctx->args->options->key.tcs.input_vertices));
+ break;
+ }
+ case nir_intrinsic_emit_vertex_with_counter: {
+ visit_emit_vertex_with_counter(ctx, instr);
+ break;
+ }
+ case nir_intrinsic_end_primitive_with_counter: {
+ unsigned stream = nir_intrinsic_stream_id(instr);
+ bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
+ break;
+ }
+ case nir_intrinsic_set_vertex_count: {
+ /* unused, the HW keeps track of this for us */
+ break;
+ }
default:
fprintf(stderr, "Unimplemented intrinsic instr: ");
nir_print_instr(&instr->instr, stderr);
*res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
} else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
*res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
+ } else if (instr->op == nir_texop_fragment_mask_fetch) {
+ *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
} else {
*res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
}
*out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
}
-void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
+void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
{
Builder bld(ctx->program, ctx->block);
- Temp coord_args[4], ma, tc, sc, id;
- for (unsigned i = 0; i < (is_array ? 4 : 3); i++)
- coord_args[i] = emit_extract_vector(ctx, *coords, i, v1);
+ Temp ma, tc, sc, id;
if (is_array) {
- coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]);
+ coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
// see comment in ac_prepare_cube_coords()
if (ctx->options->chip_class <= GFX8)
- coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]);
+ coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coords[3]);
}
- ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
+ ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
vop3a->operands[0] = Operand(ma);
vop3a->definitions[0] = Definition(invma);
ctx->block->instructions.emplace_back(std::move(vop3a));
- sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
+ sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
if (!is_deriv)
sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
- tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
+ tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
if (!is_deriv)
tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
- id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
+ id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
if (is_deriv) {
sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
}
if (is_array)
- id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/));
- *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id);
-
-}
-
-Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx)
-{
- Temp coord_vec[3];
- for (unsigned i = 0; i < coords.size(); i++)
- coord_vec[i] = emit_extract_vector(ctx, coords, i, v1);
-
- Builder bld(ctx->program, ctx->block);
- coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]);
-
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
- for (unsigned i = 0; i < coords.size(); i++)
- vec->operands[i] = Operand(coord_vec[i]);
- Temp res = bld.tmp(RegType::vgpr, coords.size());
- vec->definitions[0] = Definition(res);
- ctx->block->instructions.emplace_back(std::move(vec));
- return res;
+ id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
+ coords.resize(3);
+ coords[0] = sc;
+ coords[1] = tc;
+ coords[2] = id;
}
void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
Builder bld(ctx->program, ctx->block);
bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
- Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(),
- lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp();
+ Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
+ lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp();
+ std::vector<Temp> coords;
+ std::vector<Temp> derivs;
nir_const_value *sample_index_cv = NULL;
nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
enum glsl_base_type stype;
for (unsigned i = 0; i < instr->num_srcs; i++) {
switch (instr->src[i].src_type) {
- case nir_tex_src_coord:
- coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa));
+ case nir_tex_src_coord: {
+ Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
+ for (unsigned i = 0; i < coord.size(); i++)
+ coords.emplace_back(emit_extract_vector(ctx, coord, i, v1));
break;
+ }
case nir_tex_src_bias:
if (instr->op == nir_texop_txb) {
bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
break;
}
}
-// TODO: all other cases: structure taken from ac_nir_to_llvm.c
+
if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
- Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
+ Operand default_sample = Operand(1u);
+ if (ctx->options->robust_buffer_access) {
+ /* Extract the second dword of the descriptor, if it's
+ * all zero, then it's a null descriptor.
+ */
+ Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
+ Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u));
+ default_sample = Operand(is_non_null_descriptor);
+ }
+
+ Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
- samples, Operand(1u), bld.scc(is_msaa));
+ samples, default_sample, bld.scc(is_msaa));
return;
}
}
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
- prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
+ prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
/* pack derivatives */
if (has_ddx || has_ddy) {
if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
- derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4),
- ddx, Operand(0u), ddy, Operand(0u));
+ assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
+ Temp zero = bld.copy(bld.def(v1), Operand(0u));
+ derivs = {ddx, zero, ddy, zero};
} else {
- derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy);
+ for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
+ derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
+ for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
+ derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
}
has_derivs = true;
}
instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
instr->is_array &&
instr->op != nir_texop_txf)
- coords = apply_round_slice(ctx, coords, 1);
+ coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
if (instr->coord_components > 2 &&
(instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
instr->is_array &&
- instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms)
- coords = apply_round_slice(ctx, coords, 2);
+ instr->op != nir_texop_txf &&
+ instr->op != nir_texop_txf_ms &&
+ instr->op != nir_texop_fragment_fetch &&
+ instr->op != nir_texop_fragment_mask_fetch)
+ coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
if (ctx->options->chip_class == GFX9 &&
instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
instr->op != nir_texop_lod && instr->coord_components) {
assert(coords.size() > 0 && coords.size() < 3);
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)};
- vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1));
- vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000);
- if (coords.size() > 1)
- vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1));
- coords = bld.tmp(RegType::vgpr, coords.size() + 1);
- vec->definitions[0] = Definition(coords);
- ctx->block->instructions.emplace_back(std::move(vec));
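+ /* GFX9 treats 1D as 2D: insert y = 0 for texel fetches or y = 0.5 for filtered accesses */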
+ coords.insert(std::next(coords.begin()), bld.copy(bld.def(v1), instr->op == nir_texop_txf ?
+ Operand((uint32_t) 0) :
+ Operand((uint32_t) 0x3f000000)));
}
bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
- instr->op != nir_texop_txs) {
+ instr->op != nir_texop_txs &&
+ instr->op != nir_texop_fragment_fetch &&
+ instr->op != nir_texop_fragment_mask_fetch) {
assert(has_sample_index);
Operand op(sample_index);
if (sample_index_cv)
}
if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
- Temp split_coords[coords.size()];
- emit_split_vector(ctx, coords, coords.size());
- for (unsigned i = 0; i < coords.size(); i++)
- split_coords[i] = emit_extract_vector(ctx, coords, i, v1);
-
- unsigned i = 0;
- for (; i < std::min(offset.size(), instr->coord_components); i++) {
+ for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
Temp off = emit_extract_vector(ctx, offset, i, v1);
- split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off);
+ coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
}
-
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
- for (unsigned i = 0; i < coords.size(); i++)
- vec->operands[i] = Operand(split_coords[i]);
- coords = bld.tmp(coords.regClass());
- vec->definitions[0] = Definition(coords);
- ctx->block->instructions.emplace_back(std::move(vec));
-
has_offset = false;
}
if (tmp_dst.id() == dst.id() && div_by_6)
tmp_dst = bld.tmp(tmp_dst.regClass());
- tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
- tex->operands[0] = Operand(as_vgpr(ctx,lod));
- tex->operands[1] = Operand(resource);
+ tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
+ tex->operands[0] = Operand(resource);
+ tex->operands[1] = Operand(s4); /* no sampler */
+ tex->operands[2] = Operand(as_vgpr(ctx,lod));
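+ /* MIMG operand order: resource, sampler (undef s4 when unused), VGPR address. */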
if (ctx->options->chip_class == GFX9 &&
instr->op == nir_texop_txs &&
instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
Temp tg4_compare_cube_wa64 = Temp();
if (tg4_integer_workarounds) {
- tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
- tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
- tex->operands[1] = Operand(resource);
+ tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
+ tex->operands[0] = Operand(resource);
+ tex->operands[1] = Operand(s4); /* no sampler */
+ tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
tex->dim = dim;
tex->dmask = 0x3;
tex->da = da;
half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
}
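+ /* Offset the gather coordinates by -0.5 texel; the unmodified coordinates are still needed for the cube workaround below. */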
- Temp orig_coords[2] = {
- emit_extract_vector(ctx, coords, 0, v1),
- emit_extract_vector(ctx, coords, 1, v1)};
Temp new_coords[2] = {
- bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]),
- bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1])
+ bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
+ bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])
};
if (tg4_integer_cube_workaround) {
ctx->block->instructions.emplace_back(std::move(vec));
new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
- new_coords[0], orig_coords[0], tg4_compare_cube_wa64);
+ new_coords[0], coords[0], tg4_compare_cube_wa64);
new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
- new_coords[1], orig_coords[1], tg4_compare_cube_wa64);
- }
-
- if (coords.size() == 3) {
- coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3),
- new_coords[0], new_coords[1],
- emit_extract_vector(ctx, coords, 2, v1));
- } else {
- assert(coords.size() == 2);
- coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
- new_coords[0], new_coords[1]);
- }
- }
-
- std::vector<Operand> args;
- if (has_offset)
- args.emplace_back(Operand(offset));
- if (has_bias)
- args.emplace_back(Operand(bias));
- if (has_compare)
- args.emplace_back(Operand(compare));
- if (has_derivs)
- args.emplace_back(Operand(derivs));
- args.emplace_back(Operand(coords));
- if (has_sample_index)
- args.emplace_back(Operand(sample_index));
- if (has_lod)
- args.emplace_back(lod);
-
- Temp arg;
- if (args.size() > 1) {
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
- unsigned size = 0;
- for (unsigned i = 0; i < args.size(); i++) {
- size += args[i].size();
- vec->operands[i] = args[i];
+ new_coords[1], coords[1], tg4_compare_cube_wa64);
}
- RegClass rc = RegClass(RegType::vgpr, size);
- Temp tmp = bld.tmp(rc);
- vec->definitions[0] = Definition(tmp);
- ctx->block->instructions.emplace_back(std::move(vec));
- arg = tmp;
- } else {
- assert(args[0].isTemp());
- arg = as_vgpr(ctx, args[0].getTemp());
+ coords[0] = new_coords[0];
+ coords[1] = new_coords[1];
}
- /* we don't need the bias, sample index, compare value or offset to be
- * computed in WQM but if the p_create_vector copies the coordinates, then it
- * needs to be in WQM */
- if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
- instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
- instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
- arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
-
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
//FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
tmp_dst = bld.tmp(RegType::vgpr, last_bit);
aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
- mubuf->operands[0] = Operand(coords);
- mubuf->operands[1] = Operand(resource);
+ mubuf->operands[0] = Operand(resource);
+ mubuf->operands[1] = Operand(coords[0]);
mubuf->operands[2] = Operand((uint32_t) 0);
mubuf->definitions[0] = Definition(tmp_dst);
mubuf->idxen = true;
return;
}
+ /* gather MIMG address components */
+ std::vector<Temp> args;
+ if (has_offset)
+ args.emplace_back(offset);
+ if (has_bias)
+ args.emplace_back(bias);
+ if (has_compare)
+ args.emplace_back(compare);
+ if (has_derivs)
+ args.insert(args.end(), derivs.begin(), derivs.end());
+
+ args.insert(args.end(), coords.begin(), coords.end());
+ if (has_sample_index)
+ args.emplace_back(sample_index);
+ if (has_lod)
+ args.emplace_back(lod);
+
+ Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size()));
+ aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
+ vec->definitions[0] = Definition(arg);
+ for (unsigned i = 0; i < args.size(); i++)
+ vec->operands[i] = Operand(args[i]);
+ ctx->block->instructions.emplace_back(std::move(vec));
+
if (instr->op == nir_texop_txf ||
instr->op == nir_texop_txf_ms ||
- instr->op == nir_texop_samples_identical) {
- aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
- tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1));
- tex->operands[0] = Operand(arg);
- tex->operands[1] = Operand(resource);
+ instr->op == nir_texop_samples_identical ||
+ instr->op == nir_texop_fragment_fetch ||
+ instr->op == nir_texop_fragment_mask_fetch) {
+ aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
+ tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 3, 1));
+ tex->operands[0] = Operand(resource);
+ tex->operands[1] = Operand(s4); /* no sampler */
+ tex->operands[2] = Operand(arg);
tex->dim = dim;
tex->dmask = dmask;
tex->unrm = true;
opcode = aco_opcode::image_get_lod;
}
+ /* we don't need the bias, sample index, compare value or offset to be
+ * computed in WQM, but if the p_create_vector copies the coordinates, then it
+ * needs to be in WQM */
+ if (ctx->stage == fragment_fs &&
+ !has_derivs && !has_lod && !level_zero &&
+ instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
+ instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
+ arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
+
tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
- tex->operands[0] = Operand(arg);
- tex->operands[1] = Operand(resource);
- tex->operands[2] = Operand(sampler);
+ tex->operands[0] = Operand(resource);
+ tex->operands[1] = Operand(sampler);
+ tex->operands[2] = Operand(arg);
tex->dim = dim;
tex->dmask = dmask;
tex->da = da;
std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
unsigned num_operands = 0;
- Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size())];
+ Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1];
unsigned num_defined = 0;
unsigned cur_pred_idx = 0;
for (std::pair<unsigned, nir_ssa_def *> src : phi_src) {
continue;
}
}
+ /* Handle missing predecessors at the end. This shouldn't happen with loop
+ * headers, and we can't ignore these sources for loop-header phis anyway. */
+ if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
+ continue;
cur_pred_idx++;
Operand op = get_phi_operand(ctx, src.second);
operands[num_operands++] = op;
while (cur_pred_idx++ < preds.size())
operands[num_operands++] = Operand(dst.regClass());
+ /* If the loop ends with a break, still add a linear continue edge in case
+ * that break is divergent or continue_or_break is used. We'll either remove
+ * this operand later in visit_loop() if it's not necessary or replace the
+ * undef with something correct. */
+ if (!logical && ctx->block->kind & block_kind_loop_header) {
+ nir_loop *loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
+ nir_block *last = nir_loop_last_block(loop);
+ if (last->successors[0] != instr->instr.block)
+ operands[num_operands++] = Operand(RegClass());
+ }
+
if (num_defined == 0) {
Builder bld(ctx->program, ctx->block);
if (dst.regClass() == s1) {
abort();
}
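+ /* A divergent break can leave the exec mask empty until the loop exits, so
+ * remember the loop nest depth where this becomes possible. */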
+ if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
+ ctx->cf_info.exec_potentially_empty_break = true;
+ ctx->cf_info.exec_potentially_empty_break_depth = ctx->cf_info.loop_nest_depth;
+ }
+
/* remove critical edges from linear CFG */
bld.branch(aco_opcode::p_branch);
Block* break_block = ctx->program->create_and_insert_block();
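+/* Computes the value a loop-header phi has at the bottom of the loop by
+ * walking the loop body and inserting linear phis where predecessors disagree.
+ * The result replaces the undef operand added for the extra continue edge of
+ * loops that end in a break. */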
+static Operand create_continue_phis(isel_context *ctx, unsigned first, unsigned last,
+ aco_ptr<Instruction>& header_phi, Operand *vals)
+{
+ vals[0] = Operand(header_phi->definitions[0].getTemp());
+ RegClass rc = vals[0].regClass();
+
+ unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
+
+ unsigned next_pred = 1;
+
+ for (unsigned idx = first + 1; idx <= last; idx++) {
+ Block& block = ctx->program->blocks[idx];
+ if (block.loop_nest_depth != loop_nest_depth) {
+ vals[idx - first] = vals[idx - 1 - first];
+ continue;
+ }
+
+ if (block.kind & block_kind_continue) {
+ vals[idx - first] = header_phi->operands[next_pred];
+ next_pred++;
+ continue;
+ }
+
+ bool all_same = true;
+ for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
+ all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
+
+ Operand val;
+ if (all_same) {
+ val = vals[block.linear_preds[0] - first];
+ } else {
+ aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
+ aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
+ for (unsigned i = 0; i < block.linear_preds.size(); i++)
+ phi->operands[i] = vals[block.linear_preds[i] - first];
+ val = Operand(Temp(ctx->program->allocateId(), rc));
+ phi->definitions[0] = Definition(val.getTemp());
+ block.instructions.emplace(block.instructions.begin(), std::move(phi));
+ }
+ vals[idx - first] = val;
+ }
+
+ return vals[last - first];
+}
+
static void visit_loop(isel_context *ctx, nir_loop *loop)
{
+ //TODO: we might want to wrap the loop in a branch if exec_potentially_empty=true
append_logical_end(ctx->block);
ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
Builder bld(ctx->program, ctx->block);
unsigned loop_header_idx = loop_header->index;
loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
append_logical_start(ctx->block);
- visit_cf_list(ctx, &loop->body);
+ bool unreachable = visit_cf_list(ctx, &loop->body);
 //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
if (!ctx->cf_info.has_branch) {
append_logical_end(ctx->block);
- if (ctx->cf_info.exec_potentially_empty) {
+ if (ctx->cf_info.exec_potentially_empty_discard || ctx->cf_info.exec_potentially_empty_break) {
/* Discards can result in code running with an empty exec mask.
* This would result in divergent breaks not ever being taken. As a
* workaround, break the loop when the loop mask is empty instead of
add_linear_edge(block_idx, continue_block);
add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
- add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
+ if (!ctx->cf_info.parent_loop.has_divergent_branch)
+ add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
ctx->block = &ctx->program->blocks[block_idx];
} else {
ctx->block->kind |= (block_kind_continue | block_kind_uniform);
bld.branch(aco_opcode::p_branch);
}
- /* fixup phis in loop header from unreachable blocks */
- if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) {
+ /* Fix up phis in the loop header from unreachable blocks.
+ * has_branch/has_divergent_branch also indicate whether the loop ends with a
+ * break/continue instruction, but we don't emit those if unreachable=true. */
+ if (unreachable) {
+ assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
bool linear = ctx->cf_info.has_branch;
bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
}
}
+ /* Fix up linear phis in the loop header that expect a continue. This fixup
+ * and the previous one shouldn't both happen at once, because a break in the
+ * merge block would get CSE'd. */
+ if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
+ unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
+ Operand vals[num_vals];
+ for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
+ if (instr->opcode == aco_opcode::p_linear_phi) {
+ if (ctx->cf_info.has_branch)
+ instr->operands.pop_back();
+ else
+ instr->operands.back() = create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
+ } else if (!is_phi(instr)) {
+ break;
+ }
+ }
+ }
+
ctx->cf_info.has_branch = false;
 // TODO: if the loop doesn't have a single exit, we must add one
ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
- ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty;
+ ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
+ ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
+ ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
ctx->cf_info.parent_if.is_divergent = true;
- ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
+
+ /* divergent branches use cbranch_execz */
+ ctx->cf_info.exec_potentially_empty_discard = false;
+ ctx->cf_info.exec_potentially_empty_break = false;
+ ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
/** emit logical then block */
Block* BB_then_logical = ctx->program->create_and_insert_block();
branch->operands[0] = Operand(ic->cond);
ctx->block->instructions.push_back(std::move(branch));
- ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty;
- ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
+ ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
+ ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
+ ic->exec_potentially_empty_break_depth_old =
+ std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
+ /* divergent branches use cbranch_execz */
+ ctx->cf_info.exec_potentially_empty_discard = false;
+ ctx->cf_info.exec_potentially_empty_break = false;
+ ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
/** emit logical else block */
Block* BB_else_logical = ctx->program->create_and_insert_block();
ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
- ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old;
+ ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
+ ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
+ ctx->cf_info.exec_potentially_empty_break_depth =
+ std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
+ if (ctx->cf_info.loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
+ !ctx->cf_info.parent_if.is_divergent) {
+ ctx->cf_info.exec_potentially_empty_break = false;
+ ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
+ }
/* uniform control flow never has an empty exec-mask */
- if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
- ctx->cf_info.exec_potentially_empty = false;
+ if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
+ ctx->cf_info.exec_potentially_empty_discard = false;
+ ctx->cf_info.exec_potentially_empty_break = false;
+ ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
+ }
+}
+
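+/* Helpers for uniform control flow: branch on an SCC condition and build the
+ * then/else/endif blocks without modifying the exec mask. */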
+static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond)
+{
+ assert(cond.regClass() == s1);
+
+ append_logical_end(ctx->block);
+ ctx->block->kind |= block_kind_uniform;
+
+ aco_ptr<Pseudo_branch_instruction> branch;
+ aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
+ branch.reset(create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 0));
+ branch->operands[0] = Operand(cond);
+ branch->operands[0].setFixed(scc);
+ ctx->block->instructions.emplace_back(std::move(branch));
+
+ ic->BB_if_idx = ctx->block->index;
+ ic->BB_endif = Block();
+ ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
+ ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
+
+ ctx->cf_info.has_branch = false;
+ ctx->cf_info.parent_loop.has_divergent_branch = false;
+
+ /** emit then block */
+ Block* BB_then = ctx->program->create_and_insert_block();
+ BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
+ add_edge(ic->BB_if_idx, BB_then);
+ append_logical_start(BB_then);
+ ctx->block = BB_then;
+}
+
+static void begin_uniform_if_else(isel_context *ctx, if_context *ic)
+{
+ Block *BB_then = ctx->block;
+
+ ic->uniform_has_then_branch = ctx->cf_info.has_branch;
+ ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
+
+ if (!ic->uniform_has_then_branch) {
+ append_logical_end(BB_then);
+ /* branch from then block to endif block */
+ aco_ptr<Pseudo_branch_instruction> branch;
+ branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
+ BB_then->instructions.emplace_back(std::move(branch));
+ add_linear_edge(BB_then->index, &ic->BB_endif);
+ if (!ic->then_branch_divergent)
+ add_logical_edge(BB_then->index, &ic->BB_endif);
+ BB_then->kind |= block_kind_uniform;
+ }
+
+ ctx->cf_info.has_branch = false;
+ ctx->cf_info.parent_loop.has_divergent_branch = false;
+
+ /** emit else block */
+ Block* BB_else = ctx->program->create_and_insert_block();
+ BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
+ add_edge(ic->BB_if_idx, BB_else);
+ append_logical_start(BB_else);
+ ctx->block = BB_else;
+}
+
+static void end_uniform_if(isel_context *ctx, if_context *ic)
+{
+ Block *BB_else = ctx->block;
+
+ if (!ctx->cf_info.has_branch) {
+ append_logical_end(BB_else);
+ /* branch from then block to endif block */
+ aco_ptr<Pseudo_branch_instruction> branch;
+ branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
+ BB_else->instructions.emplace_back(std::move(branch));
+ add_linear_edge(BB_else->index, &ic->BB_endif);
+ if (!ctx->cf_info.parent_loop.has_divergent_branch)
+ add_logical_edge(BB_else->index, &ic->BB_endif);
+ BB_else->kind |= block_kind_uniform;
+ }
+
+ ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
+ ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
+
+ /** emit endif merge block */
+ if (!ctx->cf_info.has_branch) {
+ ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
+ append_logical_start(ctx->block);
+ }
}
-static void visit_if(isel_context *ctx, nir_if *if_stmt)
+static bool visit_if(isel_context *ctx, nir_if *if_stmt)
{
Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
Builder bld(ctx->program, ctx->block);
aco_ptr<Pseudo_branch_instruction> branch;
+ if_context ic;
if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
/**
* to the loop exit/entry block. Otherwise, it branches to the next
* merge block.
**/
- append_logical_end(ctx->block);
- ctx->block->kind |= block_kind_uniform;
- /* emit branch */
- assert(cond.regClass() == bld.lm);
// TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
+ assert(cond.regClass() == ctx->program->lane_mask);
cond = bool_to_scalar_condition(ctx, cond);
- branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
- branch->operands[0] = Operand(cond);
- branch->operands[0].setFixed(scc);
- ctx->block->instructions.emplace_back(std::move(branch));
-
- unsigned BB_if_idx = ctx->block->index;
- Block BB_endif = Block();
- BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
- BB_endif.kind |= ctx->block->kind & block_kind_top_level;
-
- /** emit then block */
- Block* BB_then = ctx->program->create_and_insert_block();
- BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
- add_edge(BB_if_idx, BB_then);
- append_logical_start(BB_then);
- ctx->block = BB_then;
+ begin_uniform_if_then(ctx, &ic, cond);
visit_cf_list(ctx, &if_stmt->then_list);
- BB_then = ctx->block;
- bool then_branch = ctx->cf_info.has_branch;
- bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
-
- if (!then_branch) {
- append_logical_end(BB_then);
- /* branch from then block to endif block */
- branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
- BB_then->instructions.emplace_back(std::move(branch));
- add_linear_edge(BB_then->index, &BB_endif);
- if (!then_branch_divergent)
- add_logical_edge(BB_then->index, &BB_endif);
- BB_then->kind |= block_kind_uniform;
- }
-
- ctx->cf_info.has_branch = false;
- ctx->cf_info.parent_loop.has_divergent_branch = false;
- /** emit else block */
- Block* BB_else = ctx->program->create_and_insert_block();
- BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
- add_edge(BB_if_idx, BB_else);
- append_logical_start(BB_else);
- ctx->block = BB_else;
+ begin_uniform_if_else(ctx, &ic);
visit_cf_list(ctx, &if_stmt->else_list);
- BB_else = ctx->block;
-
- if (!ctx->cf_info.has_branch) {
- append_logical_end(BB_else);
- /* branch from then block to endif block */
- branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
- BB_else->instructions.emplace_back(std::move(branch));
- add_linear_edge(BB_else->index, &BB_endif);
- if (!ctx->cf_info.parent_loop.has_divergent_branch)
- add_logical_edge(BB_else->index, &BB_endif);
- BB_else->kind |= block_kind_uniform;
- }
- ctx->cf_info.has_branch &= then_branch;
- ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
-
- /** emit endif merge block */
- if (!ctx->cf_info.has_branch) {
- ctx->block = ctx->program->insert_block(std::move(BB_endif));
- append_logical_start(ctx->block);
- }
+ end_uniform_if(ctx, &ic);
} else { /* non-uniform condition */
/**
* To maintain a logical and linear CFG without critical edges,
* *) Exceptions may be due to break and continue statements within loops
**/
- if_context ic;
-
begin_divergent_if_then(ctx, &ic, cond);
visit_cf_list(ctx, &if_stmt->then_list);
end_divergent_if(ctx, &ic);
}
+
+ return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
}
-static void visit_cf_list(isel_context *ctx,
+static bool visit_cf_list(isel_context *ctx,
struct exec_list *list)
{
foreach_list_typed(nir_cf_node, node, node, list) {
visit_block(ctx, nir_cf_node_as_block(node));
break;
case nir_cf_node_if:
- visit_if(ctx, nir_cf_node_as_if(node));
+ if (!visit_if(ctx, nir_cf_node_as_if(node)))
+ return true;
break;
case nir_cf_node_loop:
visit_loop(ctx, nir_cf_node_as_loop(node));
unreachable("unimplemented cf list type");
}
}
+ return false;
+}
+
+static void create_null_export(isel_context *ctx)
+{
+ /* Some shader stages always need to have exports.
+ * So when there is none, we need to add a null export.
+ */
+
+ unsigned dest = (ctx->program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
+ bool vm = (ctx->program->stage & hw_fs) || ctx->program->chip_class >= GFX10;
+ Builder bld(ctx->program, ctx->block);
+ bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
+ /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, vm);
}
-static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
+static bool export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
{
- int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
- uint64_t mask = ctx->vs_output.mask[slot];
+ assert(ctx->stage == vertex_vs ||
+ ctx->stage == tess_eval_vs ||
+ ctx->stage == gs_copy_vs ||
+ ctx->stage == ngg_vertex_gs ||
+ ctx->stage == ngg_tess_eval_gs);
+
+ int offset = (ctx->stage & sw_tes)
+ ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
+ : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
+ uint64_t mask = ctx->outputs.mask[slot];
if (!is_pos && !mask)
- return;
+ return false;
if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
- return;
+ return false;
aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
exp->enabled_mask = mask;
for (unsigned i = 0; i < 4; ++i) {
if (mask & (1 << i))
- exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
+ exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
else
exp->operands[i] = Operand(v1);
}
else
exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
ctx->block->instructions.emplace_back(std::move(exp));
+
+ return true;
}
static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
exp->enabled_mask = 0;
for (unsigned i = 0; i < 4; ++i)
exp->operands[i] = Operand(v1);
- if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
- exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
+ if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
+ exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
exp->enabled_mask |= 0x1;
}
- if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
- exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
+ if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
+ exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
exp->enabled_mask |= 0x4;
}
- if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
+ if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
if (ctx->options->chip_class < GFX9) {
- exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
+ exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
exp->enabled_mask |= 0x8;
} else {
Builder bld(ctx->program, ctx->block);
Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
- Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
+ Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
if (exp->operands[2].isTemp())
out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
ctx->block->instructions.emplace_back(std::move(exp));
}
+static void create_export_phis(isel_context *ctx)
+{
+ /* Used when exports are needed, but the output temps are defined in a preceding block.
+ * This function will set up phis in order to access the outputs in the next block.
+ */
+
+ assert(ctx->block->instructions.back()->opcode == aco_opcode::p_logical_start);
+ aco_ptr<Instruction> logical_start = aco_ptr<Instruction>(ctx->block->instructions.back().release());
+ ctx->block->instructions.pop_back();
+
+ Builder bld(ctx->program, ctx->block);
+
+ for (unsigned slot = 0; slot <= VARYING_SLOT_VAR31; ++slot) {
+ uint64_t mask = ctx->outputs.mask[slot];
+ for (unsigned i = 0; i < 4; ++i) {
+ if (!(mask & (1 << i)))
+ continue;
+
+ Temp old = ctx->outputs.temps[slot * 4 + i];
+ Temp phi = bld.pseudo(aco_opcode::p_phi, bld.def(v1), old, Operand(v1));
+ ctx->outputs.temps[slot * 4 + i] = phi;
+ }
+ }
+
+ bld.insert(std::move(logical_start));
+}
+
static void create_vs_exports(isel_context *ctx)
{
- radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
+ assert(ctx->stage == vertex_vs ||
+ ctx->stage == tess_eval_vs ||
+ ctx->stage == gs_copy_vs ||
+ ctx->stage == ngg_vertex_gs ||
+ ctx->stage == ngg_tess_eval_gs);
+
+ radv_vs_output_info *outinfo = (ctx->stage & sw_tes)
+ ? &ctx->program->info->tes.outinfo
+ : &ctx->program->info->vs.outinfo;
+
+ if (outinfo->export_prim_id && !(ctx->stage & hw_ngg_gs)) {
+ ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
+ ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->vs_prim_id);
+ }
+
+ if (ctx->options->key.has_multiview_view_index) {
+ ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
+ ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
+ }
+
+ /* the order these position exports are created is important */
+ int next_pos = 0;
+ bool exported_pos = export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
+ if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
+ export_vs_psiz_layer_viewport(ctx, &next_pos);
+ exported_pos = true;
+ }
+ if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
+ exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
+ if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
+ exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
+
+ if (ctx->export_clip_dists) {
+ if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
+ export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
+ if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
+ export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
+ }
+
+ for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
+ if (i < VARYING_SLOT_VAR0 &&
+ i != VARYING_SLOT_LAYER &&
+ i != VARYING_SLOT_PRIMITIVE_ID &&
+ i != VARYING_SLOT_VIEWPORT)
+ continue;
+
+ export_vs_varying(ctx, i, false, NULL);
+ }
+
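+ /* The hardware requires that a HW VS always exports at least one position. */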
+ if (!exported_pos)
+ create_null_export(ctx);
+}
+
+static bool export_fs_mrt_z(isel_context *ctx)
+{
+ Builder bld(ctx->program, ctx->block);
+ unsigned enabled_channels = 0;
+ bool compr = false;
+ Operand values[4];
+
+ for (unsigned i = 0; i < 4; ++i) {
+ values[i] = Operand(v1);
+ }
+
+ /* Both stencil and sample mask only need 16 bits. */
+ if (!ctx->program->info->ps.writes_z &&
+ (ctx->program->info->ps.writes_stencil ||
+ ctx->program->info->ps.writes_sample_mask)) {
+ compr = true; /* COMPR flag */
+
+ if (ctx->program->info->ps.writes_stencil) {
+ /* Stencil should be in X[23:16]. */
+ values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
+ values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]);
+ enabled_channels |= 0x3;
+ }
+
+ if (ctx->program->info->ps.writes_sample_mask) {
+ /* SampleMask should be in Y[15:0]. */
+ values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
+ enabled_channels |= 0xc;
+ }
+ } else {
+ if (ctx->program->info->ps.writes_z) {
+ values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
+ enabled_channels |= 0x1;
+ }
+
+ if (ctx->program->info->ps.writes_stencil) {
+ values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
+ enabled_channels |= 0x2;
+ }
+
+ if (ctx->program->info->ps.writes_sample_mask) {
+ values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
+ enabled_channels |= 0x4;
+ }
+ }
+
+ /* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the X
+ * writemask component.
+ */
+ if (ctx->options->chip_class == GFX6 &&
+ ctx->options->family != CHIP_OLAND &&
+ ctx->options->family != CHIP_HAINAN) {
+ enabled_channels |= 0x1;
+ }
+
+ bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
+ enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr);
+
+ return true;
+}
+
+static bool export_fs_mrt_color(isel_context *ctx, int slot)
+{
+ Builder bld(ctx->program, ctx->block);
+ unsigned write_mask = ctx->outputs.mask[slot];
+ Operand values[4];
+
+ for (unsigned i = 0; i < 4; ++i) {
+ if (write_mask & (1 << i)) {
+ values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
+ } else {
+ values[i] = Operand(v1);
+ }
+ }
+
+ unsigned target, col_format;
+ unsigned enabled_channels = 0;
+ aco_opcode compr_op = (aco_opcode)0;
+
+ slot -= FRAG_RESULT_DATA0;
+ target = V_008DFC_SQ_EXP_MRT + slot;
+ col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf;
+
+ bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
+ bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
+
+ switch (col_format)
+ {
+ case V_028714_SPI_SHADER_ZERO:
+ enabled_channels = 0; /* writemask */
+ target = V_008DFC_SQ_EXP_NULL;
+ break;
+
+ case V_028714_SPI_SHADER_32_R:
+ enabled_channels = 1;
+ break;
+
+ case V_028714_SPI_SHADER_32_GR:
+ enabled_channels = 0x3;
+ break;
+
+ case V_028714_SPI_SHADER_32_AR:
+ if (ctx->options->chip_class >= GFX10) {
+ /* Special case: on GFX10, the outputs are different for 32_AR */
+ enabled_channels = 0x3;
+ values[1] = values[3];
+ values[3] = Operand(v1);
+ } else {
+ enabled_channels = 0x9;
+ }
+ break;
+
+ case V_028714_SPI_SHADER_FP16_ABGR:
+ enabled_channels = 0x5;
+ compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
+ break;
+
+ case V_028714_SPI_SHADER_UNORM16_ABGR:
+ enabled_channels = 0x5;
+ compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
+ break;
+
+ case V_028714_SPI_SHADER_SNORM16_ABGR:
+ enabled_channels = 0x5;
+ compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
+ break;
+
+ case V_028714_SPI_SHADER_UINT16_ABGR: {
+ enabled_channels = 0x5;
+ compr_op = aco_opcode::v_cvt_pk_u16_u32;
+ if (is_int8 || is_int10) {
+ /* clamp */
+ uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
+ Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
+
+ for (unsigned i = 0; i < 4; i++) {
+ if ((write_mask >> i) & 1) {
+ values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
+ i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val),
+ values[i]);
+ }
+ }
+ }
+ break;
+ }
+
+ case V_028714_SPI_SHADER_SINT16_ABGR:
+ enabled_channels = 0x5;
+ compr_op = aco_opcode::v_cvt_pk_i16_i32;
+ if (is_int8 || is_int10) {
+ /* clamp */
+ uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
+ uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
+ Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
+ Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb));
+
+ for (unsigned i = 0; i < 4; i++) {
+ if ((write_mask >> i) & 1) {
+ values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
+ i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val),
+ values[i]);
+ values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
+ i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val),
+ values[i]);
+ }
+ }
+ }
+ break;
+
+ case V_028714_SPI_SHADER_32_ABGR:
+ enabled_channels = 0xF;
+ break;
+
+ default:
+ break;
+ }
+
+ if (target == V_008DFC_SQ_EXP_NULL)
+ return false;
+
+ if ((bool) compr_op) {
+ for (int i = 0; i < 2; i++) {
+ /* check if at least one of the values to be compressed is enabled */
+ unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
+ if (enabled) {
+ enabled_channels |= enabled << (i*2);
+ values[i] = bld.vop3(compr_op, bld.def(v1),
+ values[i*2].isUndefined() ? Operand(0u) : values[i*2],
+ values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]);
+ } else {
+ values[i] = Operand(v1);
+ }
+ }
+ values[2] = Operand(v1);
+ values[3] = Operand(v1);
+ } else {
+ for (int i = 0; i < 4; i++)
+ values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
+ }
+
+ bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
+ enabled_channels, target, (bool) compr_op);
+ return true;
+}
+
+static void create_fs_exports(isel_context *ctx)
+{
+ bool exported = false;
+
+ /* Export depth, stencil and sample mask. */
+ if (ctx->outputs.mask[FRAG_RESULT_DEPTH] ||
+ ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
+ ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
+ exported |= export_fs_mrt_z(ctx);
+
+ /* Export all color render targets. */
+ for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
+ if (ctx->outputs.mask[i])
+ exported |= export_fs_mrt_color(ctx, i);
+
+ if (!exported)
+ create_null_export(ctx);
+}
+
+static void write_tcs_tess_factors(isel_context *ctx)
+{
+ unsigned outer_comps;
+ unsigned inner_comps;
+
+ switch (ctx->args->options->key.tcs.primitive_mode) {
+ case GL_ISOLINES:
+ outer_comps = 2;
+ inner_comps = 0;
+ break;
+ case GL_TRIANGLES:
+ outer_comps = 3;
+ inner_comps = 1;
+ break;
+ case GL_QUADS:
+ outer_comps = 4;
+ inner_comps = 2;
+ break;
+ default:
+ return;
+ }
+
+ Builder bld(ctx->program, ctx->block);
+
+ bld.barrier(aco_opcode::p_memory_barrier_shared);
+ if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
+ bld.sopp(aco_opcode::s_barrier);
+
+ Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
+ Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
+
+ Temp invocation_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), invocation_id);
+ if_context ic_invocation_id_is_zero;
+ begin_divergent_if_then(ctx, &ic_invocation_id_is_zero, invocation_id_is_zero);
+ bld.reset(ctx->block);
+
+ Temp hs_ring_tess_factor = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_FACTOR * 16u));
+
+ std::pair<Temp, unsigned> lds_base = get_tcs_output_lds_offset(ctx);
+ unsigned stride = inner_comps + outer_comps;
+ unsigned lds_align = calculate_lds_alignment(ctx, lds_base.second);
+ Temp tf_inner_vec;
+ Temp tf_outer_vec;
+ Temp out[6];
+ assert(stride <= (sizeof(out) / sizeof(Temp)));
+
+ if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) {
+ // LINES reversal: for isolines the hardware expects the tess factors in reverse order
+ tf_outer_vec = load_lds(ctx, 4, bld.tmp(v2), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
+ out[1] = emit_extract_vector(ctx, tf_outer_vec, 0, v1);
+ out[0] = emit_extract_vector(ctx, tf_outer_vec, 1, v1);
+ } else {
+ tf_outer_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, outer_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
+ tf_inner_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, inner_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_in_loc, lds_align);
- if (outinfo->export_prim_id) {
- ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
- ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id);
+ for (unsigned i = 0; i < outer_comps; ++i)
+ out[i] = emit_extract_vector(ctx, tf_outer_vec, i, v1);
+ for (unsigned i = 0; i < inner_comps; ++i)
+ out[outer_comps + i] = emit_extract_vector(ctx, tf_inner_vec, i, v1);
}
- if (ctx->options->key.has_multiview_view_index) {
- ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
- ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
- }
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp tf_base = get_arg(ctx, ctx->args->tess_factor_offset);
+ Temp byte_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, stride * 4u);
+ unsigned tf_const_offset = 0;
- /* the order these position exports are created is important */
- int next_pos = 0;
- export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
- if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
- export_vs_psiz_layer_viewport(ctx, &next_pos);
- }
- if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
- export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
- if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
- export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
+ if (ctx->program->chip_class <= GFX8) {
+ Temp rel_patch_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), rel_patch_id);
+ if_context ic_rel_patch_id_is_zero;
+ begin_divergent_if_then(ctx, &ic_rel_patch_id_is_zero, rel_patch_id_is_zero);
+ bld.reset(ctx->block);
- if (ctx->options->key.vs_common_out.export_clip_dists) {
- if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
- export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
- if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
- export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
+ /* Store the dynamic HS control word. */
+ Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
+ bld.mubuf(aco_opcode::buffer_store_dword,
+ /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
+ /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
+ /* disable_wqm */ false, /* glc */ true);
+ tf_const_offset += 4;
+
+ begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
+ end_divergent_if(ctx, &ic_rel_patch_id_is_zero);
+ bld.reset(ctx->block);
}
- for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
- if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
- i != VARYING_SLOT_PRIMITIVE_ID)
- continue;
+ assert(stride == 2 || stride == 4 || stride == 6);
+ Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
+ store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
- export_vs_varying(ctx, i, false, NULL);
+ /* Store to offchip for TES to read - only if TES reads them */
+ if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
+ Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
+ Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
+
+ std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc);
+ store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
+
+ if (likely(inner_comps)) {
+ std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc);
+ store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
+ }
}
+
+ begin_divergent_if_else(ctx, &ic_invocation_id_is_zero);
+ end_divergent_if(ctx, &ic_invocation_id_is_zero);
}
static void emit_stream_output(isel_context *ctx,
const struct radv_stream_output *output)
{
unsigned num_comps = util_bitcount(output->component_mask);
+ unsigned writemask = (1 << num_comps) - 1;
unsigned loc = output->location;
unsigned buf = output->buffer;
- unsigned offset = output->offset;
assert(num_comps && num_comps <= 4);
if (!num_comps || num_comps > 4)
Temp out[4];
bool all_undef = true;
- assert(ctx->stage == vertex_vs);
+ assert(ctx->stage & hw_vs);
for (unsigned i = 0; i < num_comps; i++) {
- out[i] = ctx->vs_output.outputs[loc][start + i];
+ out[i] = ctx->outputs.temps[loc * 4 + start + i];
all_undef = all_undef && !out[i].id();
}
if (all_undef)
return;
- Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)};
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)};
- for (unsigned i = 0; i < num_comps; ++i)
- vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u);
- vec->definitions[0] = Definition(write_data);
- ctx->block->instructions.emplace_back(std::move(vec));
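+ /* Emit one buffer store per run of consecutive enabled components. */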
+ while (writemask) {
+ int start, count;
+ u_bit_scan_consecutive_range(&writemask, &start, &count);
+ if (count == 3 && ctx->options->chip_class == GFX6) {
+ /* GFX6 doesn't support storing vec3, split it. */
+ writemask |= 1u << (start + 2);
+ count = 2;
+ }
- aco_opcode opcode;
- switch (num_comps) {
- case 1:
- opcode = aco_opcode::buffer_store_dword;
- break;
- case 2:
- opcode = aco_opcode::buffer_store_dwordx2;
- break;
- case 3:
- opcode = aco_opcode::buffer_store_dwordx3;
- break;
- case 4:
- opcode = aco_opcode::buffer_store_dwordx4;
- break;
- }
+ unsigned offset = output->offset + start * 4;
- aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
- store->operands[0] = Operand(so_write_offset[buf]);
- store->operands[1] = Operand(so_buffers[buf]);
- store->operands[2] = Operand((uint32_t) 0);
- store->operands[3] = Operand(write_data);
- if (offset > 4095) {
- /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
- Builder bld(ctx->program, ctx->block);
- store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
- } else {
- store->offset = offset;
+ Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, count)};
+ aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
+ for (int i = 0; i < count; ++i)
+ vec->operands[i] = (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u);
+ vec->definitions[0] = Definition(write_data);
+ ctx->block->instructions.emplace_back(std::move(vec));
+
+ aco_opcode opcode;
+ switch (count) {
+ case 1:
+ opcode = aco_opcode::buffer_store_dword;
+ break;
+ case 2:
+ opcode = aco_opcode::buffer_store_dwordx2;
+ break;
+ case 3:
+ opcode = aco_opcode::buffer_store_dwordx3;
+ break;
+ case 4:
+ opcode = aco_opcode::buffer_store_dwordx4;
+ break;
+ default:
+ unreachable("Unsupported dword count.");
+ }
+
+ aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
+ store->operands[0] = Operand(so_buffers[buf]);
+ store->operands[1] = Operand(so_write_offset[buf]);
+ store->operands[2] = Operand((uint32_t) 0);
+ store->operands[3] = Operand(write_data);
+ if (offset > 4095) {
+ /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
+ Builder bld(ctx->program, ctx->block);
+ store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
+ } else {
+ store->offset = offset;
+ }
+ store->offen = true;
+ store->glc = true;
+ store->dlc = false;
+ store->slc = true;
+ store->can_reorder = true;
+ ctx->block->instructions.emplace_back(std::move(store));
}
- store->offen = true;
- store->glc = true;
- store->dlc = false;
- store->slc = true;
- store->can_reorder = true;
- ctx->block->instructions.emplace_back(std::move(store));
}
static void emit_streamout(isel_context *ctx, unsigned stream)
if (!stride)
continue;
- so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u));
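+ /* Materialize the offset in an SGPR first; presumably the literal SMEM
+ * offset can't be used on every chip (GFX6 encodes it differently). */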
+ Operand off = bld.copy(bld.def(s1), Operand(i * 16u));
+ so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off);
}
Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
} /* end namespace */
+void fix_ls_vgpr_init_bug(isel_context *ctx, Pseudo_instruction *startpgm)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
+ Builder bld(ctx->program, ctx->block);
+ constexpr unsigned hs_idx = 1u;
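+ /* The HS thread count lives in bits [15:8] of merged_wave_info. */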
+ Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+ get_arg(ctx, ctx->args->merged_wave_info),
+ Operand((8u << 16) | (hs_idx * 8u)));
+ Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
+
+ /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
+
+ Temp instance_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
+ get_arg(ctx, ctx->args->rel_auto_id),
+ get_arg(ctx, ctx->args->ac.instance_id),
+ ls_has_nonzero_hs_threads);
+ Temp rel_auto_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
+ get_arg(ctx, ctx->args->ac.tcs_rel_ids),
+ get_arg(ctx, ctx->args->rel_auto_id),
+ ls_has_nonzero_hs_threads);
+ Temp vertex_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
+ get_arg(ctx, ctx->args->ac.tcs_patch_id),
+ get_arg(ctx, ctx->args->ac.vertex_id),
+ ls_has_nonzero_hs_threads);
+
+ ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
+ ctx->arg_temps[ctx->args->rel_auto_id.arg_index] = rel_auto_id;
+ ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
+}
+
void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm)
{
/* Split all arguments except for the first (ring_offsets) and the last
ctx->block->fp_mode = program->next_fp_mode;
}
+void cleanup_cfg(Program *program)
+{
+ /* create linear_succs/logical_succs */
+ for (Block& BB : program->blocks) {
+ for (unsigned idx : BB.linear_preds)
+ program->blocks[idx].linear_succs.emplace_back(BB.index);
+ for (unsigned idx : BB.logical_preds)
+ program->blocks[idx].logical_succs.emplace_back(BB.index);
+ }
+}
+
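+/* Builds a lane mask with the low N bits set, where N is this shader's thread
+ * count taken from merged_wave_info. */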
+Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ /* The s_bfm only cares about s0.u[5:0], so we need neither s_bfe nor s_and here */
+ Temp count = i == 0
+ ? get_arg(ctx, ctx->args->merged_wave_info)
+ : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
+ get_arg(ctx, ctx->args->merged_wave_info), Operand(i * 8u));
+
+ Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand(0u));
+ Temp cond;
+
+ if (ctx->program->wave_size == 64) {
+ /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
+ Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */));
+ cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64));
+ } else {
+ /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of the register */
+ cond = emit_extract_vector(ctx, mask, 0, bld.lm);
+ }
+
+ return cond;
+}
+
+bool ngg_early_prim_export(isel_context *ctx)
+{
+ /* TODO: Check edge flags, and if they are written, return false. (Needed for OpenGL, not for Vulkan.) */
+ return true;
+}
+
+void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ /* It is recommended to do the GS_ALLOC_REQ as soon and as quickly as possible, so we set the maximum priority (3). */
+ bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
+
+ /* Get the id of the current wave within the threadgroup (workgroup) */
+ Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+ get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
+
+ /* Execute the following code only on the first wave (wave id 0),
+ * use the SCC def to tell if the wave id is zero or not.
+ */
+ Temp cond = wave_id_in_tg.def(1).getTemp();
+ if_context ic;
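+ /* The then-block is left empty: SCC is nonzero for every wave except wave 0,
+ * so the code below goes into the else-block and runs only on the first wave. */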
+ begin_uniform_if_then(ctx, &ic, cond);
+ begin_uniform_if_else(ctx, &ic);
+ bld.reset(ctx->block);
+
+ /* Number of vertices output by VS/TES */
+ Temp vtx_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+ get_arg(ctx, ctx->args->gs_tg_info), Operand(12u | (9u << 16u)));
+ /* Number of primitives output by VS/TES */
+ Temp prm_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+ get_arg(ctx, ctx->args->gs_tg_info), Operand(22u | (9u << 16u)));
+
+ /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
+ Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u));
+ tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
+
+ /* Request the SPI to allocate space for the primitives and vertices that will be exported by the threadgroup. */
+ bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
+
+ end_uniform_if(ctx, &ic);
+
+ /* After the GS_ALLOC_REQ is done, reset priority to default (0). */
+ bld.reset(ctx->block);
+ bld.sopp(aco_opcode::s_setprio, -1u, 0x0u);
+}
+
+Temp ngg_get_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[])
+{
+ Builder bld(ctx->program, ctx->block);
+
+ if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
+ return get_arg(ctx, ctx->args->gs_vtx_offset[0]);
+ }
+
+ Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
+ Temp tmp;
+
+ for (unsigned i = 0; i < num_vertices; ++i) {
+ assert(vtxindex[i].id());
+
+ if (i)
+ tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp);
+ else
+ tmp = vtxindex[i];
+
+ /* The initial edge flag is always false in tess eval shaders. */
+ if (ctx->stage == ngg_vertex_gs) {
+ Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8 + i), Operand(1u));
+ tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp);
+ }
+ }
+
+ /* TODO: Set isnull field in case of merged NGG VS+GS. */
+
+ return tmp;
+}
+
+void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[])
+{
+ Builder bld(ctx->program, ctx->block);
+ Temp prim_exp_arg = ngg_get_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex);
+
+ bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
+ 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */,
+ false /* compressed */, true/* done */, false /* valid mask */);
+}
+
+void ngg_emit_nogs_gsthreads(isel_context *ctx)
+{
+ /* Emit the things that NGG GS threads need to do, for shaders that don't have SW GS.
+ * These must always come before VS exports.
+ *
+ * It is recommended to do these as early as possible. They can be at the beginning when
+ * there is no SW GS and the shader doesn't write edge flags.
+ */
+
+ if_context ic;
+ Temp is_gs_thread = merged_wave_info_to_mask(ctx, 1);
+ begin_divergent_if_then(ctx, &ic, is_gs_thread);
+
+ Builder bld(ctx->program, ctx->block);
+ constexpr unsigned max_vertices_per_primitive = 3;
+ unsigned num_vertices_per_primitive = max_vertices_per_primitive;
+
+ if (ctx->stage == ngg_vertex_gs) {
+ /* TODO: optimize for points & lines */
+ } else if (ctx->stage == ngg_tess_eval_gs) {
+ if (ctx->shader->info.tess.point_mode)
+ num_vertices_per_primitive = 1;
+ else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
+ num_vertices_per_primitive = 2;
+ } else {
+ unreachable("Unsupported NGG shader stage");
+ }
+
+ Temp vtxindex[max_vertices_per_primitive];
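+ /* The vertex indices arrive packed as 16-bit pairs in gs_vtx_offset[0] and gs_vtx_offset[2]. */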
+ vtxindex[0] = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
+ get_arg(ctx, ctx->args->gs_vtx_offset[0]));
+ vtxindex[1] = num_vertices_per_primitive < 2 ? Temp(0, v1) :
+ bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
+ get_arg(ctx, ctx->args->gs_vtx_offset[0]), Operand(16u), Operand(16u));
+ vtxindex[2] = num_vertices_per_primitive < 3 ? Temp(0, v1) :
+ bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
+ get_arg(ctx, ctx->args->gs_vtx_offset[2]));
+
+ /* Export primitive data to the index buffer. */
+ ngg_emit_prim_export(ctx, num_vertices_per_primitive, vtxindex);
+
+ /* Export primitive ID. */
+ if (ctx->stage == ngg_vertex_gs && ctx->args->options->key.vs_common_out.export_prim_id) {
+ /* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */
+ Temp prim_id = get_arg(ctx, ctx->args->ac.gs_prim_id);
+ Temp provoking_vtx_index = vtxindex[0];
+ Temp addr = bld.v_mul_imm(bld.def(v1), provoking_vtx_index, 4u);
+
+ store_lds(ctx, 4, prim_id, 0x1u, addr, 0u, 4u);
+ }
+
+ begin_divergent_if_else(ctx, &ic);
+ end_divergent_if(ctx, &ic);
+}
+
+void ngg_emit_nogs_output(isel_context *ctx)
+{
+ /* Emits NGG GS output, for stages that don't have SW GS. */
+
+ if_context ic;
+ Builder bld(ctx->program, ctx->block);
+ bool late_prim_export = !ngg_early_prim_export(ctx);
+
+ /* NGG streamout is currently disabled by default. */
+ assert(!ctx->args->shader_info->so.num_outputs);
+
+ if (late_prim_export) {
+ /* VS exports are output to registers in a predecessor block. Emit phis to get them into this block. */
+ create_export_phis(ctx);
+ /* Do what we need to do in the GS threads. */
+ ngg_emit_nogs_gsthreads(ctx);
+
+ /* What comes next should be executed on ES threads. */
+ Temp is_es_thread = merged_wave_info_to_mask(ctx, 0);
+ begin_divergent_if_then(ctx, &ic, is_es_thread);
+ bld.reset(ctx->block);
+ }
+
+ /* Export VS outputs */
+ ctx->block->kind |= block_kind_export_end;
+ create_vs_exports(ctx);
+
+ /* Export primitive ID */
+ if (ctx->args->options->key.vs_common_out.export_prim_id) {
+ Temp prim_id;
+
+ if (ctx->stage == ngg_vertex_gs) {
+ /* Wait for GS threads to store primitive ID in LDS. */
+ bld.barrier(aco_opcode::p_memory_barrier_shared);
+ bld.sopp(aco_opcode::s_barrier);
+
+ /* Calculate LDS address where the GS threads stored the primitive ID. */
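+ /* Bits [27:24] of merged_wave_info hold this wave's index within the threadgroup. */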
+ Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+ get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
+ Temp thread_id_in_wave = emit_mbcnt(ctx, bld.def(v1));
+ Temp wave_id_mul = bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_id_in_tg), ctx->program->wave_size);
+ Temp thread_id_in_tg = bld.vadd32(bld.def(v1), Operand(wave_id_mul), Operand(thread_id_in_wave));
+ Temp addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, 4u);
+
+ /* Load primitive ID from LDS. */
+ prim_id = load_lds(ctx, 4, bld.tmp(v1), addr, 0u, 4u);
+ } else if (ctx->stage == ngg_tess_eval_gs) {
+ /* TES: Just use the patch ID as the primitive ID. */
+ prim_id = get_arg(ctx, ctx->args->ac.tes_patch_id);
+ } else {
+ unreachable("unsupported NGG shader stage.");
+ }
+
+ ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
+ ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = prim_id;
+
+ export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, nullptr);
+ }
+
+ if (late_prim_export) {
+ begin_divergent_if_else(ctx, &ic);
+ end_divergent_if(ctx, &ic);
+ bld.reset(ctx->block);
+ }
+}
+
void select_program(Program *program,
unsigned shader_count,
struct nir_shader *const *shaders,
ac_shader_config* config,
struct radv_shader_args *args)
{
- isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args);
+ isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
+ if_context ic_merged_wave_info;
+ bool ngg_no_gs = ctx.stage == ngg_vertex_gs || ctx.stage == ngg_tess_eval_gs;
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];
/* needs to be after init_context() for FS */
Pseudo_instruction *startpgm = add_startpgm(&ctx);
append_logical_start(ctx.block);
+
+ if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
+ fix_ls_vgpr_init_bug(&ctx, startpgm);
+
split_arguments(&ctx, startpgm);
}
- if_context ic;
- if (shader_count >= 2) {
- Builder bld(ctx.program, ctx.block);
- Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
- Temp thread_id = emit_mbcnt(&ctx, bld.def(v1));
- Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id);
+ if (ngg_no_gs) {
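+ /* Ask the HW to allocate space for our vertices and primitives (GS_ALLOC_REQ) before any exports. */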
+ ngg_emit_sendmsg_gs_alloc_req(&ctx);
+
+ if (ngg_early_prim_export(&ctx))
+ ngg_emit_nogs_gsthreads(&ctx);
+ }
+
+ /* In merged shaders (e.g. a VS+TCS HS), the first stage's implementation can be
+ completely empty, in which case the merged-wave-info guard around it is unnecessary. */
+ nir_function_impl *func = nir_shader_get_entrypoint(nir);
+ bool empty_shader = nir_cf_list_is_empty_block(&func->body) &&
+ ((nir->info.stage == MESA_SHADER_VERTEX &&
+ (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
+ (nir->info.stage == MESA_SHADER_TESS_EVAL &&
+ ctx.stage == tess_eval_geometry_gs));
- begin_divergent_if_then(&ctx, &ic, cond);
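+ /* With tcs_in_out_eq, the VS and TCS invocations overlap, so a single
+ * merged-wave-info guard is opened around the VS (i == 0) and only closed
+ * after the TCS (i == 1). */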
+ bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : ((shader_count >= 2 && !empty_shader) || ngg_no_gs);
+ bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
+ if (check_merged_wave_info) {
+ Temp cond = merged_wave_info_to_mask(&ctx, i);
+ begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
}
if (i) {
Builder bld(ctx.program, ctx.block);
- bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
+
+ bld.barrier(aco_opcode::p_memory_barrier_shared);
bld.sopp(aco_opcode::s_barrier);
- }
+
+ if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
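+ /* The GS wave id lives in bits [23:16] of merged_wave_info; define it into m0 for s_sendmsg. */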
+ ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
+ }
+ } else if (ctx.stage == geometry_gs)
+ ctx.gs_wave_id = get_arg(&ctx, args->gs_wave_id);
if (ctx.stage == fragment_fs)
handle_bc_optimize(&ctx);
- nir_function_impl *func = nir_shader_get_entrypoint(nir);
visit_cf_list(&ctx, &func->body);
- if (ctx.program->info->so.num_outputs/*&& !ctx->is_gs_copy_shader */)
+ if (ctx.program->info->so.num_outputs && (ctx.stage & hw_vs))
emit_streamout(&ctx, 0);
- if (ctx.stage == vertex_vs)
+ if (ctx.stage & hw_vs) {
create_vs_exports(&ctx);
+ ctx.block->kind |= block_kind_export_end;
+ } else if (ngg_no_gs && ngg_early_prim_export(&ctx)) {
+ ngg_emit_nogs_output(&ctx);
+ } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
+ Builder bld(ctx.program, ctx.block);
+ bld.barrier(aco_opcode::p_memory_barrier_gs_data);
+ bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
+ } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
+ write_tcs_tess_factors(&ctx);
+ }
- if (shader_count >= 2) {
- begin_divergent_if_else(&ctx, &ic);
- end_divergent_if(&ctx, &ic);
+ if (ctx.stage == fragment_fs) {
+ create_fs_exports(&ctx);
+ ctx.block->kind |= block_kind_export_end;
}
+ if (endif_merged_wave_info) {
+ begin_divergent_if_else(&ctx, &ic_merged_wave_info);
+ end_divergent_if(&ctx, &ic_merged_wave_info);
+ }
+
+ if (ngg_no_gs && !ngg_early_prim_export(&ctx))
+ ngg_emit_nogs_output(&ctx);
+
ralloc_free(ctx.divergent_vals);
+
+ if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
+ /* Outputs of the previous stage are inputs to the next stage */
+ ctx.inputs = ctx.outputs;
+ ctx.outputs = shader_io_state();
+ }
}
program->config->float_mode = program->blocks[0].fp_mode.val;
bld.smem(aco_opcode::s_dcache_wb, false);
bld.sopp(aco_opcode::s_endpgm);
- /* cleanup CFG */
- for (Block& BB : program->blocks) {
- for (unsigned idx : BB.linear_preds)
- program->blocks[idx].linear_succs.emplace_back(BB.index);
- for (unsigned idx : BB.logical_preds)
- program->blocks[idx].logical_succs.emplace_back(BB.index);
+ cleanup_cfg(program);
+}
+
+void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
+ ac_shader_config* config,
+ struct radv_shader_args *args)
+{
+ isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
+
+ program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
+ program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
+ program->next_fp_mode.must_flush_denorms32 = false;
+ program->next_fp_mode.must_flush_denorms16_64 = false;
+ program->next_fp_mode.care_about_round32 = false;
+ program->next_fp_mode.care_about_round16_64 = false;
+ program->next_fp_mode.denorm16_64 = fp_denorm_keep;
+ program->next_fp_mode.denorm32 = 0;
+ program->next_fp_mode.round32 = fp_round_ne;
+ program->next_fp_mode.round16_64 = fp_round_ne;
+ ctx.block->fp_mode = program->next_fp_mode;
+
+ add_startpgm(&ctx);
+ append_logical_start(ctx.block);
+
+ Builder bld(ctx.program, ctx.block);
+
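+ /* Load the GSVS ring buffer descriptor; each entry in the ring table is 16 bytes. */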
+ Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u));
+
+ Operand stream_id(0u);
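+ /* With streamout, the stream id is in bits [25:24] of the streamout config; otherwise only stream 0 matters. */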
+ if (args->shader_info->so.num_outputs)
+ stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+ get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u));
+
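+ /* Per-vertex dword offset into the GSVS ring. */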
+ Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id));
+
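+ /* The per-stream IF merge blocks are emitted in reverse order after the loop, so keep them on a stack. */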
+ std::stack<Block> endif_blocks;
+
+ for (unsigned stream = 0; stream < 4; stream++) {
+ if (stream_id.isConstant() && stream != stream_id.constantValue())
+ continue;
+
+ unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
+ if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
+ continue;
+
+ memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
+
+ unsigned BB_if_idx = ctx.block->index;
+ Block BB_endif = Block();
+ if (!stream_id.isConstant()) {
+ /* begin IF */
+ Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream));
+ append_logical_end(ctx.block);
+ ctx.block->kind |= block_kind_uniform;
+ bld.branch(aco_opcode::p_cbranch_z, cond);
+
+ BB_endif.kind |= ctx.block->kind & block_kind_top_level;
+
+ ctx.block = ctx.program->create_and_insert_block();
+ add_edge(BB_if_idx, ctx.block);
+ bld.reset(ctx.block);
+ append_logical_start(ctx.block);
+ }
+
+ unsigned offset = 0;
+ for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
+ if (args->shader_info->gs.output_streams[i] != stream)
+ continue;
+
+ unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
+ unsigned length = util_last_bit(output_usage_mask);
+ for (unsigned j = 0; j < length; ++j) {
+ if (!(output_usage_mask & (1 << j)))
+ continue;
+
+ unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
+ Temp voffset = vtx_offset;
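+ /* The MUBUF immediate offset is only 12 bits, so fold multiples of 4096 into the address VGPR. */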
+ if (const_offset >= 4096u) {
+ voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
+ const_offset %= 4096u;
+ }
+
+ aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
+ mubuf->definitions[0] = bld.def(v1);
+ mubuf->operands[0] = Operand(gsvs_ring);
+ mubuf->operands[1] = Operand(voffset);
+ mubuf->operands[2] = Operand(0u);
+ mubuf->offen = true;
+ mubuf->offset = const_offset;
+ mubuf->glc = true;
+ mubuf->slc = true;
+ mubuf->dlc = args->options->chip_class >= GFX10;
+ mubuf->barrier = barrier_none;
+ mubuf->can_reorder = true;
+
+ ctx.outputs.mask[i] |= 1 << j;
+ ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
+
+ bld.insert(std::move(mubuf));
+
+ offset++;
+ }
+ }
+
+ if (args->shader_info->so.num_outputs) {
+ emit_streamout(&ctx, stream);
+ bld.reset(ctx.block);
+ }
+
+ if (stream == 0) {
+ create_vs_exports(&ctx);
+ ctx.block->kind |= block_kind_export_end;
+ }
+
+ if (!stream_id.isConstant()) {
+ append_logical_end(ctx.block);
+
+ /* branch from then block to endif block */
+ bld.branch(aco_opcode::p_branch);
+ add_edge(ctx.block->index, &BB_endif);
+ ctx.block->kind |= block_kind_uniform;
+
+ /* emit else block */
+ ctx.block = ctx.program->create_and_insert_block();
+ add_edge(BB_if_idx, ctx.block);
+ bld.reset(ctx.block);
+ append_logical_start(ctx.block);
+
+ endif_blocks.push(std::move(BB_endif));
+ }
+ }
+
+ while (!endif_blocks.empty()) {
+ Block BB_endif = std::move(endif_blocks.top());
+ endif_blocks.pop();
+
+ Block *BB_else = ctx.block;
+
+ append_logical_end(BB_else);
+ /* branch from else block to endif block */
+ bld.branch(aco_opcode::p_branch);
+ add_edge(BB_else->index, &BB_endif);
+ BB_else->kind |= block_kind_uniform;
+
+ /** emit endif merge block */
+ ctx.block = program->insert_block(std::move(BB_endif));
+ bld.reset(ctx.block);
+ append_logical_start(ctx.block);
}
+
+ program->config->float_mode = program->blocks[0].fp_mode.val;
+
+ append_logical_end(ctx.block);
+ ctx.block->kind |= block_kind_uniform;
+ bld.sopp(aco_opcode::s_endpgm);
+
+ cleanup_cfg(program);
}
}