unsigned BB_if_idx;
unsigned invert_idx;
+ bool uniform_has_then_branch;
bool then_branch_divergent;
Block BB_invert;
Block BB_endif;
assert(idx == 0);
return src;
}
- assert(src.size() > idx);
+
+ assert(src.bytes() > (idx * dst_rc.bytes()));
Builder bld(ctx->program, ctx->block);
auto it = ctx->allocated_vec.find(src.id());
- /* the size check needs to be early because elements other than 0 may be garbage */
- if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
+ if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
if (it->second[idx].regClass() == dst_rc) {
return it->second[idx];
} else {
- assert(dst_rc.size() == it->second[idx].regClass().size());
+ assert(!dst_rc.is_subdword());
assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
return bld.copy(bld.def(dst_rc), it->second[idx]);
}
}
- if (src.size() == dst_rc.size()) {
+ if (dst_rc.is_subdword())
+ src = as_vgpr(ctx, src);
+
+ if (src.bytes() == dst_rc.bytes()) {
assert(idx == 0);
return bld.copy(bld.def(dst_rc), src);
} else {
aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
split->operands[0] = Operand(vec_src);
std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
+ RegClass rc;
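+ /* if there are more components than dwords, each element is smaller than a dword */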
+ if (num_components > vec_src.size()) {
+ if (vec_src.type() == RegType::sgpr)
+ return;
+
+ /* sub-dword split */
+ assert(vec_src.type() == RegType::vgpr);
+ rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
+ } else {
+ rc = RegClass(vec_src.type(), vec_src.size() / num_components);
+ }
for (unsigned i = 0; i < num_components; i++) {
- elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
+ elems[i] = {ctx->program->allocateId(), rc};
split->definitions[i] = Definition(elems[i]);
}
ctx->block->instructions.emplace_back(std::move(split));
ctx->allocated_vec.emplace(dst.id(), elems);
}
+/* adjust misaligned small bit size loads */
+void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
+{
+ Builder bld(ctx->program, ctx->block);
+ Operand shift;
+ Temp select = Temp();
+ if (offset.isConstant()) {
+ assert(offset.constantValue() && offset.constantValue() < 4);
+ shift = Operand(offset.constantValue() * 8);
+ } else {
+ /* bit_offset = 8 * (offset & 0x3) */
+ Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
+ select = bld.tmp(s1);
+ shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
+ }
+
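+ /* shift the loaded value right by the bit offset so the requested bytes start at bit 0 of dst */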
+ if (vec.size() == 1) {
+ bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
+ } else if (vec.size() == 2) {
+ Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
+ bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
+ if (tmp == dst)
+ emit_split_vector(ctx, dst, 2);
+ else
+ emit_extract_vector(ctx, tmp, 0, dst);
+ } else if (vec.size() == 4) {
+ Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
+ hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
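+ /* with a dynamic offset, zero the third dword's contribution when the byte offset is zero */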
+ if (select != Temp())
+ hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), select);
+ lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
+ Temp mid = bld.tmp(s1);
+ lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
+ hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
+ mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
+ emit_split_vector(ctx, dst, 2);
+ }
+}
+
+/* this function trims subdword vectors:
+ * if dst is vgpr - split the src and create a shrunk version according to the mask.
+ * if dst is sgpr - split the src, but move the original to sgpr. */
+void trim_subdword_vector(isel_context *ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
+{
+ assert(vec_src.type() == RegType::vgpr);
+ emit_split_vector(ctx, vec_src, num_components);
+
+ Builder bld(ctx->program, ctx->block);
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
+ unsigned component_size = vec_src.bytes() / num_components;
+ RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
+
+ unsigned k = 0;
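+ /* gather only the components selected by the mask */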
+ for (unsigned i = 0; i < num_components; i++) {
+ if (mask & (1 << i))
+ elems[k++] = emit_extract_vector(ctx, vec_src, i, rc);
+ }
+
+ if (dst.type() == RegType::vgpr) {
+ assert(dst.bytes() == k * component_size);
+ aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, k, 1)};
+ for (unsigned i = 0; i < k; i++)
+ vec->operands[i] = Operand(elems[i]);
+ vec->definitions[0] = Definition(dst);
+ bld.insert(std::move(vec));
+ } else {
+ // TODO: alignbyte if mask doesn't start with 1?
+ assert(mask & 1);
+ assert(dst.size() == vec_src.size());
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
+ }
+ ctx->allocated_vec.emplace(dst.id(), elems);
+}
+
Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
{
Builder bld(ctx->program, ctx->block);
}
Temp vec = get_ssa_temp(ctx, src.src.ssa);
- unsigned elem_size = vec.size() / src.src.ssa->num_components;
- assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
- assert(vec.size() % elem_size == 0);
+ unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
+ assert(elem_size > 0);
+ assert(vec.bytes() % elem_size == 0);
+
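+ /* sub-dword selection from an SGPR source: pick the containing dword, then extract the 8/16-bit component with s_bfe_u32 if needed */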
+ if (elem_size < 4 && vec.type() == RegType::sgpr) {
+ assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
+ assert(size == 1);
+ unsigned swizzle = src.swizzle[0];
+ if (vec.size() > 1) {
+ assert(src.src.ssa->bit_size == 16);
+ vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
+ swizzle = swizzle & 1;
+ }
+ if (swizzle == 0)
+ return vec;
+
+ Temp dst{ctx->program->allocateId(), s1};
+ aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 1)};
+ bfe->operands[0] = Operand(vec);
+ bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
+ bfe->definitions[0] = Definition(dst);
+ ctx->block->instructions.emplace_back(std::move(bfe));
+ return dst;
+ }
- RegClass elem_rc = RegClass(vec.type(), elem_size);
+ RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
if (size == 1) {
return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
} else {
elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
vec_instr->operands[i] = Operand{elems[i]};
}
- Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
+ Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
vec_instr->definitions[0] = Definition(dst);
ctx->block->instructions.emplace_back(std::move(vec_instr));
ctx->allocated_vec.emplace(dst.id(), elems);
if (dst.type() == RegType::vgpr) {
aco_ptr<Instruction> bcsel;
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ then = as_vgpr(ctx, then);
+ els = as_vgpr(ctx, els);
+
+ Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), els, then, cond);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
then = as_vgpr(ctx, then);
els = as_vgpr(ctx, els);
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
case nir_op_vec3:
case nir_op_vec4: {
std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
- for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
+ unsigned num = instr->dest.dest.ssa.num_components;
+ for (unsigned i = 0; i < num; ++i)
elems[i] = get_alu_src(ctx, instr->src[i]);
- vec->operands[i] = Operand{elems[i]};
+
+ if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
+ aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
+ for (unsigned i = 0; i < num; ++i)
+ vec->operands[i] = Operand{elems[i]};
+ vec->definitions[0] = Definition(dst);
+ ctx->block->instructions.emplace_back(std::move(vec));
+ ctx->allocated_vec.emplace(dst.id(), elems);
+ } else {
+ // TODO: that is a bit suboptimal..
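+ // pack the sub-dword SGPR components into dwords by masking, shifting and or-ing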
+ Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));
+ for (unsigned i = 0; i < num - 1; ++i)
+ if (((i+1) * instr->dest.dest.ssa.bit_size) % 32)
+ elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
+ for (unsigned i = 0; i < num; ++i) {
+ unsigned bit = i * instr->dest.dest.ssa.bit_size;
+ if (bit % 32 == 0) {
+ elems[bit / 32] = elems[i];
+ } else {
+ elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
+ elems[i], Operand((i * instr->dest.dest.ssa.bit_size) % 32));
+ elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]);
+ }
+ }
+ if (dst.size() == 1)
+ bld.copy(Definition(dst), elems[0]);
+ else
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]);
}
- vec->definitions[0] = Definition(dst);
- ctx->block->instructions.emplace_back(std::move(vec));
- ctx->allocated_vec.emplace(dst.id(), elems);
break;
}
case nir_op_mov: {
break;
}
case nir_op_fmul: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.tmp(v1);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, tmp, true);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
- } else if (dst.size() == 2) {
- bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_fadd: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.tmp(v1);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, tmp, true);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
- } else if (dst.size() == 2) {
- bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fsub: {
Temp src0 = get_alu_src(ctx, instr->src[0]);
- Temp src1 = get_alu_src(ctx, instr->src[1]);
- if (dst.size() == 1) {
+ Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.tmp(v1);
+ if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, tmp, false);
+ else
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, tmp, true);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
else
emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ src0, src1);
VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
sub->neg[1] = true;
} else {
break;
}
case nir_op_fmax: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
+ if (dst.regClass() == v2b) {
+ // TODO: check fp_mode.must_flush_denorms16_64
+ Temp tmp = bld.tmp(v1);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, tmp, true);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
- Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
} else {
- bld.vop3(aco_opcode::v_max_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
}
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_fmin: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
+ if (dst.regClass() == v2b) {
+ // TODO: check fp_mode.must_flush_denorms16_64
+ Temp tmp = bld.tmp(v1);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, tmp, true);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
- Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
} else {
- bld.vop3(aco_opcode::v_min_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
+ bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
}
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_frsq: {
- if (dst.size() == 1) {
- emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
- } else if (dst.size() == 2) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop1(aco_opcode::v_rsq_f16, bld.def(v1), src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
+ emit_rsq(ctx, bld, Definition(dst), src);
+ } else if (dst.regClass() == v2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
}
case nir_op_fneg: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x8000u), as_vgpr(ctx, src));
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
if (ctx->block->fp_mode.must_flush_denorms32)
src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
if (ctx->block->fp_mode.must_flush_denorms16_64)
src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
}
case nir_op_fabs: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFu), as_vgpr(ctx, src));
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
if (ctx->block->fp_mode.must_flush_denorms32)
src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
if (ctx->block->fp_mode.must_flush_denorms16_64)
src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
}
case nir_op_fsat: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ Temp one = bld.copy(bld.def(s1), Operand(0x3c00u));
+ Temp tmp = bld.vop3(aco_opcode::v_med3_f16, bld.def(v1), Operand(0u), one, src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
/* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
// TODO: confirm that this holds under any circumstances
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
vop3->clamp = true;
break;
}
case nir_op_flog2: {
- if (dst.size() == 1) {
- emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop1(aco_opcode::v_log_f16, bld.def(v1), src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
+ emit_log2(ctx, bld, Definition(dst), src);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_frcp: {
- if (dst.size() == 1) {
- emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
- } else if (dst.size() == 2) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop1(aco_opcode::v_rcp_f16, bld.def(v1), src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
+ emit_rcp(ctx, bld, Definition(dst), src);
+ } else if (dst.regClass() == v2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_fexp2: {
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ Temp tmp = bld.vop1(aco_opcode::v_exp_f16, bld.def(v1), src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_fsqrt: {
- if (dst.size() == 1) {
- emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
- } else if (dst.size() == 2) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop1(aco_opcode::v_sqrt_f16, bld.def(v1), src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
+ emit_sqrt(ctx, bld, Definition(dst), src);
+ } else if (dst.regClass() == v2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_ffract: {
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ Temp tmp = bld.vop1(aco_opcode::v_fract_f16, bld.def(v1), src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_ffloor: {
- if (dst.size() == 1) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop1(aco_opcode::v_floor_f16, bld.def(v1), src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
- } else if (dst.size() == 2) {
- emit_floor_f64(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
+ } else if (dst.regClass() == v2) {
+ emit_floor_f64(ctx, bld, Definition(dst), src);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_fceil: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop1(aco_opcode::v_ceil_f16, bld.def(v1), src0);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
if (ctx->options->chip_class >= GFX7) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
} else {
/* GFX6 doesn't support V_CEIL_F64, lower it. */
- Temp src0 = get_alu_src(ctx, instr->src[0]);
-
/* trunc = trunc(src0)
* if (src0 > 0.0 && src0 != trunc)
* trunc += 1.0
break;
}
case nir_op_ftrunc: {
- if (dst.size() == 1) {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop1(aco_opcode::v_trunc_f16, bld.def(v1), src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
- } else if (dst.size() == 2) {
- emit_trunc_f64(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
+ } else if (dst.regClass() == v2) {
+ emit_trunc_f64(ctx, bld, Definition(dst), src);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_fround_even: {
- if (dst.size() == 1) {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop1(aco_opcode::v_rndne_f16, bld.def(v1), src0);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
if (ctx->options->chip_class >= GFX7) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
} else {
/* GFX6 doesn't support V_RNDNE_F64, lower it. */
- Temp src0 = get_alu_src(ctx, instr->src[0]);
-
Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
}
case nir_op_fsin:
case nir_op_fcos: {
- Temp src = get_alu_src(ctx, instr->src[0]);
+ Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
aco_ptr<Instruction> norm;
- if (dst.size() == 1) {
- Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
- Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, as_vgpr(ctx, src));
+ Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
+ aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
+ tmp = bld.vop1(opcode, bld.def(v1), tmp);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
+ Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
/* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
if (ctx->options->chip_class < GFX9)
break;
}
case nir_op_ldexp: {
- if (dst.size() == 1) {
- bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
- get_alu_src(ctx, instr->src[1]));
- } else if (dst.size() == 2) {
- bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
- as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
- get_alu_src(ctx, instr->src[1]));
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = get_alu_src(ctx, instr->src[1]);
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.tmp(v1);
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, tmp, false);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
+ bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
+ } else if (dst.regClass() == v2) {
+ bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_frexp_sig: {
- if (dst.size() == 1) {
- bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
- get_alu_src(ctx, instr->src[0]));
- } else if (dst.size() == 2) {
- bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]));
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (dst.regClass() == v2b) {
+ Temp tmp = bld.vop1(aco_opcode::v_frexp_mant_f16, bld.def(v1), src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
+ bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
+ } else if (dst.regClass() == v2) {
+ bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
break;
}
case nir_op_frexp_exp: {
- if (instr->src[0].src.ssa->bit_size == 32) {
- bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
- get_alu_src(ctx, instr->src[0]));
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 16) {
+ Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
+ bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand(0u));
+ } else if (instr->src[0].src.ssa->bit_size == 32) {
+ bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
} else if (instr->src[0].src.ssa->bit_size == 64) {
- bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
- get_alu_src(ctx, instr->src[0]));
+ bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
}
case nir_op_fsign: {
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
- if (dst.size() == 1) {
+ if (dst.regClass() == v2b) {
+ Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
+ Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u));
+ Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
+ src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
+ cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
+ Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), minus_one, src, cond);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+ } else if (dst.regClass() == v1) {
Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
- } else if (dst.size() == 2) {
+ } else if (dst.regClass() == v2) {
Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
}
break;
}
+ case nir_op_f2f16:
+ case nir_op_f2f16_rtne: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
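+ /* 64-bit sources are first converted to f32, then f32 is converted to f16 */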
+ if (instr->src[0].src.ssa->bit_size == 64)
+ src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
+ src = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
+ break;
+ }
+ case nir_op_f2f16_rtz: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 64)
+ src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
+ src = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), src, Operand(0u));
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
+ break;
+ }
case nir_op_f2f32: {
- if (instr->src[0].src.ssa->bit_size == 64) {
+ if (instr->src[0].src.ssa->bit_size == 16) {
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
+ } else if (instr->src[0].src.ssa->bit_size == 64) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
break;
}
case nir_op_f2f64: {
- if (instr->src[0].src.ssa->bit_size == 32) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
- } else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
- }
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 16)
+ src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
+ bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
break;
}
case nir_op_i2f32: {
}
break;
}
+ case nir_op_f2i16: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 16)
+ src = bld.vop1(aco_opcode::v_cvt_i16_f16, bld.def(v1), src);
+ else if (instr->src[0].src.ssa->bit_size == 32)
+ src = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src);
+ else
+ src = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src);
+
+ if (dst.type() == RegType::vgpr)
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
+ else
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+ break;
+ }
+ case nir_op_f2u16: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 16)
+ src = bld.vop1(aco_opcode::v_cvt_u16_f16, bld.def(v1), src);
+ else if (instr->src[0].src.ssa->bit_size == 32)
+ src = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src);
+ else
+ src = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src);
+
+ if (dst.type() == RegType::vgpr)
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
+ else
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+ break;
+ }
case nir_op_f2i32: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (instr->src[0].src.ssa->bit_size == 32) {
+ if (instr->src[0].src.ssa->bit_size == 16) {
+ Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
+ if (dst.type() == RegType::vgpr) {
+ bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
+ } else {
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+ bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
+ }
+ } else if (instr->src[0].src.ssa->bit_size == 32) {
if (dst.type() == RegType::vgpr)
bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
else
}
case nir_op_f2u32: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (instr->src[0].src.ssa->bit_size == 32) {
+ if (instr->src[0].src.ssa->bit_size == 16) {
+ Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
+ if (dst.type() == RegType::vgpr) {
+ bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
+ } else {
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+ bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
+ }
+ } else if (instr->src[0].src.ssa->bit_size == 32) {
if (dst.type() == RegType::vgpr)
bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
else
}
break;
}
+ case nir_op_i2i8:
+ case nir_op_u2u8: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ /* we can actually just say dst = src */
+ if (src.regClass() == s1)
+ bld.copy(Definition(dst), src);
+ else
+ emit_extract_vector(ctx, src, 0, dst);
+ break;
+ }
+ case nir_op_i2i16: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 8) {
+ if (dst.regClass() == s1) {
+ bld.sop1(aco_opcode::s_sext_i32_i8, Definition(dst), Operand(src));
+ } else {
+ assert(src.regClass() == v1b);
+ aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
+ sdwa->operands[0] = Operand(src);
+ sdwa->definitions[0] = Definition(dst);
+ sdwa->sel[0] = sdwa_sbyte;
+ sdwa->dst_sel = sdwa_sword;
+ ctx->block->instructions.emplace_back(std::move(sdwa));
+ }
+ } else {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ /* we can actually just say dst = src */
+ if (src.regClass() == s1)
+ bld.copy(Definition(dst), src);
+ else
+ emit_extract_vector(ctx, src, 0, dst);
+ }
+ break;
+ }
+ case nir_op_u2u16: {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ if (instr->src[0].src.ssa->bit_size == 8) {
+ if (dst.regClass() == s1)
+ bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFu), src);
+ else {
+ assert(src.regClass() == v1b);
+ aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
+ sdwa->operands[0] = Operand(src);
+ sdwa->definitions[0] = Definition(dst);
+ sdwa->sel[0] = sdwa_ubyte;
+ sdwa->dst_sel = sdwa_uword;
+ ctx->block->instructions.emplace_back(std::move(sdwa));
+ }
+ } else {
+ Temp src = get_alu_src(ctx, instr->src[0]);
+ /* we can actually just say dst = src */
+ if (src.regClass() == s1)
+ bld.copy(Definition(dst), src);
+ else
+ emit_extract_vector(ctx, src, 0, dst);
+ }
+ break;
+ }
case nir_op_i2i32: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (instr->src[0].src.ssa->bit_size == 64) {
+ if (instr->src[0].src.ssa->bit_size == 8) {
+ if (dst.regClass() == s1) {
+ bld.sop1(aco_opcode::s_sext_i32_i8, Definition(dst), Operand(src));
+ } else {
+ assert(src.regClass() == v1b);
+ aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
+ sdwa->operands[0] = Operand(src);
+ sdwa->definitions[0] = Definition(dst);
+ sdwa->sel[0] = sdwa_sbyte;
+ sdwa->dst_sel = sdwa_sdword;
+ ctx->block->instructions.emplace_back(std::move(sdwa));
+ }
+ } else if (instr->src[0].src.ssa->bit_size == 16) {
+ if (dst.regClass() == s1) {
+ bld.sop1(aco_opcode::s_sext_i32_i16, Definition(dst), Operand(src));
+ } else {
+ assert(src.regClass() == v2b);
+ aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
+ sdwa->operands[0] = Operand(src);
+ sdwa->definitions[0] = Definition(dst);
+ sdwa->sel[0] = sdwa_sword;
+ sdwa->dst_sel = sdwa_udword;
+ ctx->block->instructions.emplace_back(std::move(sdwa));
+ }
+ } else if (instr->src[0].src.ssa->bit_size == 64) {
/* we can actually just say dst = src, as it would map the lower register */
emit_extract_vector(ctx, src, 0, dst);
} else {
}
case nir_op_u2u32: {
Temp src = get_alu_src(ctx, instr->src[0]);
- if (instr->src[0].src.ssa->bit_size == 16) {
+ if (instr->src[0].src.ssa->bit_size == 8) {
+ if (dst.regClass() == s1)
+ bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFu), src);
+ else {
+ assert(src.regClass() == v1b);
+ aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
+ sdwa->operands[0] = Operand(src);
+ sdwa->definitions[0] = Definition(dst);
+ sdwa->sel[0] = sdwa_ubyte;
+ sdwa->dst_sel = sdwa_udword;
+ ctx->block->instructions.emplace_back(std::move(sdwa));
+ }
+ } else if (instr->src[0].src.ssa->bit_size == 16) {
if (dst.regClass() == s1) {
bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
} else {
- // TODO: do better with SDWA
- bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
+ assert(src.regClass() == v2b);
+ aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
+ sdwa->operands[0] = Operand(src);
+ sdwa->definitions[0] = Definition(dst);
+ sdwa->sel[0] = sdwa_uword;
+ sdwa->dst_sel = sdwa_udword;
+ ctx->block->instructions.emplace_back(std::move(sdwa));
}
} else if (instr->src[0].src.ssa->bit_size == 64) {
/* we can actually just say dst = src, as it would map the lower register */
}
break;
}
+ case nir_op_b2b32:
case nir_op_b2i32: {
Temp src = get_alu_src(ctx, instr->src[0]);
assert(src.regClass() == bld.lm);
}
break;
}
+ case nir_op_b2b1:
case nir_op_i2b1: {
Temp src = get_alu_src(ctx, instr->src[0]);
assert(dst.regClass() == bld.lm);
case nir_op_unpack_64_2x32_split_y:
bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
break;
+ case nir_op_unpack_32_2x16_split_x:
+ if (dst.type() == RegType::vgpr) {
+ bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
+ } else {
+ bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
+ }
+ break;
+ case nir_op_unpack_32_2x16_split_y:
+ if (dst.type() == RegType::vgpr) {
+ bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
+ } else {
+ bld.sop2(aco_opcode::s_bfe_u32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]), Operand(uint32_t(16 << 16 | 16)));
+ }
+ break;
+ case nir_op_pack_32_2x16_split: {
+ Temp src0 = get_alu_src(ctx, instr->src[0]);
+ Temp src1 = get_alu_src(ctx, instr->src[1]);
+ if (dst.regClass() == v1) {
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
+ } else {
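+ /* SGPR path: mask the low half and shift the high half into place */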
+ src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu));
+ src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, Operand(16u));
+ bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
+ }
+ break;
+ }
case nir_op_pack_half_2x16: {
Temp src = get_alu_src(ctx, instr->src[0], 2);
}
-Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned split_cnt = 0u, Temp dst = Temp())
+Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
+ unsigned split_cnt = 0u, Temp dst = Temp())
{
Builder bld(ctx->program, ctx->block);
+ unsigned dword_size = elem_size_bytes / 4;
if (!dst.id())
- dst = bld.tmp(RegClass(reg_type, cnt * arr[0].size()));
+ dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
aco_ptr<Pseudo_instruction> instr {create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
instr->definitions[0] = Definition(dst);
for (unsigned i = 0; i < cnt; ++i) {
- assert(arr[i].size() == arr[0].size());
- allocated_vec[i] = arr[i];
- instr->operands[i] = Operand(arr[i]);
+ if (arr[i].id()) {
+ assert(arr[i].size() == dword_size);
+ allocated_vec[i] = arr[i];
+ instr->operands[i] = Operand(arr[i]);
+ } else {
+ Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2));
+ allocated_vec[i] = zero;
+ instr->operands[i] = Operand(zero);
+ }
}
bld.insert(std::move(instr));
elems[i] = emit_single_mubuf_load(ctx, descriptor, voffset, soffset, const_offset, load_size, allow_reorder);
}
- create_vec_from_array(ctx, elems.data(), num_loads, RegType::vgpr, split_cnt, dst);
+ create_vec_from_array(ctx, elems.data(), num_loads, RegType::vgpr, load_size * 4u, split_cnt, dst);
}
std::pair<Temp, unsigned> offset_add_from_nir(isel_context *ctx, const std::pair<Temp, unsigned> &base_offset, nir_src *off_src, unsigned stride = 1u)
return false;
}
+bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ unsigned write_mask = nir_intrinsic_write_mask(instr);
+ unsigned component = nir_intrinsic_component(instr);
+ unsigned idx = nir_intrinsic_base(instr) + component;
+
+ nir_instr *off_instr = instr->src[1].ssa->parent_instr;
+ if (off_instr->type != nir_instr_type_load_const)
+ return false;
+
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+ idx += nir_src_as_uint(instr->src[1]) * 4u;
+
+ if (instr->src[0].ssa->bit_size == 64)
+ write_mask = widen_mask(write_mask, 2);
+
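+ /* write each enabled component into the output temporaries */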
+ for (unsigned i = 0; i < 8; ++i) {
+ if (write_mask & (1 << i)) {
+ ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
+ ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1);
+ }
+ idx++;
+ }
+
+ return true;
+}
+
+bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst)
+{
+ /* Only TCS per-vertex inputs are supported by this function.
+ * Per-vertex inputs only match the VS output of the same invocation id when the number of VS and TCS invocations is the same.
+ */
+ if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
+ return false;
+
+ nir_src *off_src = nir_get_io_offset_src(instr);
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
+ bool can_use_temps = nir_src_is_const(*off_src) &&
+ vertex_index_instr->type == nir_instr_type_intrinsic &&
+ nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
+
+ if (!can_use_temps)
+ return false;
+
+ unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
+ Temp *src = &ctx->inputs.temps[idx];
+ Temp vec = create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u);
+ assert(vec.size() == dst.size());
+
+ Builder bld(ctx->program, ctx->block);
+ bld.copy(Definition(dst), vec);
+ return true;
+}
+
void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
{
Builder bld(ctx->program, ctx->block);
unsigned write_mask = nir_intrinsic_write_mask(instr);
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
+ if (ctx->tcs_in_out_eq && store_output_to_temps(ctx, instr)) {
+ /* If the TCS only reads this output directly, and only for the vertices matching its invocation id, it is unnecessary to store the VS output to LDS. */
+ bool indirect_write;
+ bool temp_only_input = tcs_driver_location_matches_api_mask(ctx, instr, true, ctx->tcs_temp_only_inputs, &indirect_write);
+ if (temp_only_input && !indirect_write)
+ return;
+ }
+
if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) {
/* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
if (ctx->stage == vertex_vs ||
ctx->stage == tess_eval_vs ||
ctx->stage == fragment_fs ||
+ ctx->stage == ngg_vertex_gs ||
+ ctx->stage == ngg_tess_eval_gs ||
ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
- unsigned write_mask = nir_intrinsic_write_mask(instr);
- unsigned component = nir_intrinsic_component(instr);
- Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- unsigned idx = nir_intrinsic_base(instr) + component;
-
- nir_instr *off_instr = instr->src[1].ssa->parent_instr;
- if (off_instr->type != nir_instr_type_load_const) {
- fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
- nir_print_instr(off_instr, stderr);
+ bool stored_to_temps = store_output_to_temps(ctx, instr);
+ if (!stored_to_temps) {
+ fprintf(stderr, "Unimplemented output offset instruction:\n");
+ nir_print_instr(instr->src[1].ssa->parent_instr, stderr);
fprintf(stderr, "\n");
- }
- idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
-
- if (instr->src[0].ssa->bit_size == 64)
- write_mask = widen_mask(write_mask, 2);
-
- for (unsigned i = 0; i < 8; ++i) {
- if (write_mask & (1 << i)) {
- ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
- ctx->outputs.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
- }
- idx++;
+ abort();
}
} else if (ctx->stage == vertex_es ||
ctx->stage == vertex_ls ||
Builder bld(ctx->program, ctx->block);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+ if (load_input_from_temps(ctx, instr, dst))
+ return;
+
std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr);
unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
}
-void load_buffer(isel_context *ctx, unsigned num_components, Temp dst,
- Temp rsrc, Temp offset, bool glc=false, bool readonly=true)
+void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
+ Temp dst, Temp rsrc, Temp offset, int byte_align,
+ bool glc=false, bool readonly=true)
{
Builder bld(ctx->program, ctx->block);
-
- unsigned num_bytes = dst.size() * 4;
bool dlc = glc && ctx->options->chip_class >= GFX10;
+ unsigned num_bytes = num_components * component_size;
aco_opcode op;
- if (dst.type() == RegType::vgpr || (ctx->options->chip_class < GFX8 && !readonly)) {
+ if (dst.type() == RegType::vgpr || ((ctx->options->chip_class < GFX8 || component_size < 4) && !readonly)) {
Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
unsigned const_offset = 0;
+ /* for small bit sizes, load extra bytes so unaligned data can be shifted into place */
+ if (byte_align) {
+ if (num_bytes > 2)
+ num_bytes += byte_align == -1 ? 4 - component_size : byte_align;
+ else
+ byte_align = 0;
+ }
+
Temp lower = Temp();
if (num_bytes > 16) {
assert(num_components == 3 || num_components == 4);
}
switch (num_bytes) {
+ case 1:
+ op = aco_opcode::buffer_load_ubyte;
+ break;
+ case 2:
+ op = aco_opcode::buffer_load_ushort;
+ break;
+ case 3:
case 4:
op = aco_opcode::buffer_load_dword;
break;
+ case 5:
+ case 6:
+ case 7:
case 8:
op = aco_opcode::buffer_load_dwordx2;
break;
+ case 10:
case 12:
assert(ctx->options->chip_class > GFX6);
op = aco_opcode::buffer_load_dwordx3;
mubuf->offset = const_offset;
aco_ptr<Instruction> instr = std::move(mubuf);
- if (dst.size() > 4) {
+ if (component_size < 4) {
+ Temp vec = num_bytes <= 4 ? bld.tmp(v1) : num_bytes <= 8 ? bld.tmp(v2) : bld.tmp(v3);
+ instr->definitions[0] = Definition(vec);
+ bld.insert(std::move(instr));
+
+ if (byte_align == -1 || (byte_align && dst.type() == RegType::sgpr)) {
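+ /* the byte offset is not known at compile time, or dst is an SGPR: use v_alignbyte_b32 to shift the loaded bytes down */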
+ Operand align = byte_align == -1 ? Operand(offset) : Operand((uint32_t)byte_align);
+ Temp tmp[3] = {vec, vec, vec};
+
+ if (vec.size() == 3) {
+ tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
+ } else if (vec.size() == 2) {
+ tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
+ bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
+ }
+ for (unsigned i = 0; i < dst.size(); i++)
+ tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], align);
+
+ vec = tmp[0];
+ if (dst.size() == 2)
+ vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
+
+ byte_align = 0;
+ }
+
+ if (dst.type() == RegType::vgpr && num_components == 1) {
+ bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), vec, Operand(byte_align / component_size));
+ } else {
+ trim_subdword_vector(ctx, vec, dst, 4 * vec.size() / component_size, ((1 << num_components) - 1) << byte_align / component_size);
+ }
+
+ return;
+
+ } else if (dst.size() > 4) {
assert(lower != Temp());
Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
instr->definitions[0] = Definition(upper);
emit_split_vector(ctx, dst, num_components);
}
} else {
+ /* for small bit sizes, load extra bytes so unaligned data can be shifted into place */
+ if (byte_align)
+ num_bytes += byte_align == -1 ? 4 - component_size : byte_align;
+
switch (num_bytes) {
+ case 1:
+ case 2:
+ case 3:
case 4:
op = aco_opcode::s_buffer_load_dword;
break;
+ case 5:
+ case 6:
+ case 7:
case 8:
op = aco_opcode::s_buffer_load_dwordx2;
break;
+ case 10:
case 12:
case 16:
op = aco_opcode::s_buffer_load_dwordx4;
default:
unreachable("Load SSBO not implemented for this size.");
}
+ offset = bld.as_uniform(offset);
aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
load->operands[0] = Operand(rsrc);
- load->operands[1] = Operand(bld.as_uniform(offset));
+ load->operands[1] = Operand(offset);
assert(load->operands[1].getTemp().type() == RegType::sgpr);
load->definitions[0] = Definition(dst);
load->glc = glc;
load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
assert(ctx->options->chip_class >= GFX8 || !glc);
+ /* adjust misaligned small bit size loads */
+ if (byte_align) {
+ Temp vec = num_bytes <= 4 ? bld.tmp(s1) : num_bytes <= 8 ? bld.tmp(s2) : bld.tmp(s4);
+ load->definitions[0] = Definition(vec);
+ bld.insert(std::move(load));
+ Operand byte_offset = byte_align > 0 ? Operand(uint32_t(byte_align)) : Operand(offset);
+ byte_align_scalar(ctx, vec, byte_offset, dst);
+
/* trim vector */
- if (dst.size() == 3) {
+ } else if (dst.size() == 3) {
Temp vec = bld.tmp(s4);
load->definitions[0] = Definition(vec);
bld.insert(std::move(load));
rsrc = convert_pointer_to_64_bit(ctx, rsrc);
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
}
-
- load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
+ unsigned size = instr->dest.ssa.bit_size / 8;
+ int byte_align = 0;
+ if (size < 4) {
+ unsigned align_mul = nir_intrinsic_align_mul(instr);
+ unsigned align_offset = nir_intrinsic_align_offset(instr);
+ byte_align = align_mul % 4 == 0 ? align_offset : -1;
+ }
+ load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), byte_align);
}
void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
{
Builder bld(ctx->program, ctx->block);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-
unsigned offset = nir_intrinsic_base(instr);
+ unsigned count = instr->dest.ssa.num_components;
nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
- if (index_cv && instr->dest.ssa.bit_size == 32) {
- unsigned count = instr->dest.ssa.num_components;
+ if (index_cv && instr->dest.ssa.bit_size == 32) {
unsigned start = (offset + index_cv->u32) / 4u;
start -= ctx->args->ac.base_inline_push_consts;
if (start + count <= ctx->args->ac.num_inline_push_consts) {
Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
Temp vec = dst;
bool trim = false;
+ bool aligned = true;
+
+ if (instr->dest.ssa.bit_size == 8) {
+ aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
+ bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
+ if (!aligned)
+ vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
+ } else if (instr->dest.ssa.bit_size == 16) {
+ aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
+ if (!aligned)
+ vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
+ }
+
aco_opcode op;
- switch (dst.size()) {
+ switch (vec.size()) {
case 1:
op = aco_opcode::s_load_dword;
break;
bld.smem(op, Definition(vec), ptr, index);
+ if (!aligned) {
+ Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
+ byte_align_scalar(ctx, vec, byte_offset, dst);
+ return;
+ }
+
if (trim) {
emit_split_vector(ctx, vec, 4);
RegClass rc = dst.size() == 3 ? s1 : s2;
bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
Operand(MIN2(base + range, ctx->shader->constant_data_size)),
Operand(desc_type));
-
- load_buffer(ctx, instr->num_components, dst, rsrc, offset);
+ unsigned size = instr->dest.ssa.bit_size / 8;
+ // TODO: get alignment information for subdword constants
+ unsigned byte_align = size < 4 ? -1 : 0;
+ load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, byte_align);
}
void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
- load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc, false);
+ unsigned size = instr->dest.ssa.bit_size / 8;
+ int byte_align = 0;
+ if (size < 4) {
+ unsigned align_mul = nir_intrinsic_align_mul(instr);
+ unsigned align_offset = nir_intrinsic_align_offset(instr);
+ byte_align = align_mul % 4 == 0 ? align_offset : -1;
+ }
+ load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), byte_align, glc, false);
}
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
- ctx->options->chip_class >= GFX8;
+ ctx->options->chip_class >= GFX8 &&
+ elem_size_bytes >= 4;
if (smem)
offset = bld.as_uniform(offset);
bool smem_nonfs = smem && ctx->stage != fragment_fs;
}
int num_bytes = count * elem_size_bytes;
+ /* dword or larger stores have to be dword-aligned */
+ if (elem_size_bytes < 4 && num_bytes > 2) {
+ // TODO: improve alignment check of sub-dword stores
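+ // only the first two bytes are stored now; the remaining components are added back to the writemask for a later iteration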
+ unsigned count_new = 2 / elem_size_bytes;
+ writemask |= ((1 << (count - count_new)) - 1) << (start + count_new);
+ count = count_new;
+ num_bytes = 2;
+ }
+
if (num_bytes > 16) {
assert(elem_size_bytes == 8);
writemask |= (((count - 2) << 1) - 1) << (start + 2);
num_bytes = 16;
}
- // TODO: check alignment of sub-dword stores
- // TODO: split 3 bytes. there is no store instruction for that
-
Temp write_data;
- if (count != instr->num_components) {
- emit_split_vector(ctx, data, instr->num_components);
+ if (elem_size_bytes < 4) {
+ if (data.type() == RegType::sgpr) {
+ data = as_vgpr(ctx, data);
+ emit_split_vector(ctx, data, 4 * data.size() / elem_size_bytes);
+ }
+ RegClass rc = RegClass(RegType::vgpr, elem_size_bytes).as_subdword();
+ aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
+ for (int i = 0; i < count; i++)
+ vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, rc));
+ write_data = bld.tmp(RegClass(RegType::vgpr, num_bytes).as_subdword());
+ vec->definitions[0] = Definition(write_data);
+ bld.insert(std::move(vec));
+ } else if (count != instr->num_components) {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
for (int i = 0; i < count; i++) {
Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
write_data = data;
}
- aco_opcode vmem_op, smem_op;
+ aco_opcode vmem_op, smem_op = aco_opcode::last_opcode;
switch (num_bytes) {
+ case 1:
+ vmem_op = aco_opcode::buffer_store_byte;
+ break;
+ case 2:
+ vmem_op = aco_opcode::buffer_store_short;
+ break;
case 4:
vmem_op = aco_opcode::buffer_store_dword;
smem_op = aco_opcode::s_buffer_store_dword;
break;
case 12:
vmem_op = aco_opcode::buffer_store_dwordx3;
- smem_op = aco_opcode::last_opcode;
assert(!smem && ctx->options->chip_class > GFX6);
break;
case 16:
mtbuf->operands[0] = Operand(gsvs_ring);
mtbuf->operands[1] = vaddr_offset;
mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset));
- mtbuf->operands[3] = Operand(ctx->outputs.outputs[i][j]);
+ mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
mtbuf->offen = !vaddr_offset.isUndefined();
mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
break;
}
- if (ctx->shader->info.stage == MESA_SHADER_COMPUTE) {
- unsigned* bsize = ctx->program->info->cs.block_size;
- unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
- if (workgroup_size > ctx->program->wave_size)
- bld.sopp(aco_opcode::s_barrier);
- } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
- /* For each patch provided during rendering, n​ TCS shader invocations will be processed,
- * where n​ is the number of vertices in the output patch.
- */
- unsigned workgroup_size = ctx->tcs_num_patches * ctx->shader->info.tess.tcs_vertices_out;
- if (workgroup_size > ctx->program->wave_size)
- bld.sopp(aco_opcode::s_barrier);
- } else {
- /* We don't know the workgroup size, so always emit the s_barrier. */
+ if (ctx->program->workgroup_size > ctx->program->wave_size)
bld.sopp(aco_opcode::s_barrier);
- }
break;
}
}
}
+static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond)
+{
+ assert(cond.regClass() == s1);
+
+ append_logical_end(ctx->block);
+ ctx->block->kind |= block_kind_uniform;
+
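+ /* p_cbranch_z tests SCC: execution falls through into the then block when cond is non-zero and skips it when cond is zero */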
+ aco_ptr<Pseudo_branch_instruction> branch;
+ aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
+ branch.reset(create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 0));
+ branch->operands[0] = Operand(cond);
+ branch->operands[0].setFixed(scc);
+ ctx->block->instructions.emplace_back(std::move(branch));
+
+ ic->BB_if_idx = ctx->block->index;
+ ic->BB_endif = Block();
+ ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
+ ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
+
+ ctx->cf_info.has_branch = false;
+ ctx->cf_info.parent_loop.has_divergent_branch = false;
+
+ /** emit then block */
+ Block* BB_then = ctx->program->create_and_insert_block();
+ BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
+ add_edge(ic->BB_if_idx, BB_then);
+ append_logical_start(BB_then);
+ ctx->block = BB_then;
+}
+
+static void begin_uniform_if_else(isel_context *ctx, if_context *ic)
+{
+ Block *BB_then = ctx->block;
+
+ ic->uniform_has_then_branch = ctx->cf_info.has_branch;
+ ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
+
+ if (!ic->uniform_has_then_branch) {
+ append_logical_end(BB_then);
+ /* branch from then block to endif block */
+ aco_ptr<Pseudo_branch_instruction> branch;
+ branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
+ BB_then->instructions.emplace_back(std::move(branch));
+ add_linear_edge(BB_then->index, &ic->BB_endif);
+ if (!ic->then_branch_divergent)
+ add_logical_edge(BB_then->index, &ic->BB_endif);
+ BB_then->kind |= block_kind_uniform;
+ }
+
+ ctx->cf_info.has_branch = false;
+ ctx->cf_info.parent_loop.has_divergent_branch = false;
+
+ /** emit else block */
+ Block* BB_else = ctx->program->create_and_insert_block();
+ BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
+ add_edge(ic->BB_if_idx, BB_else);
+ append_logical_start(BB_else);
+ ctx->block = BB_else;
+}
+
+static void end_uniform_if(isel_context *ctx, if_context *ic)
+{
+ Block *BB_else = ctx->block;
+
+ if (!ctx->cf_info.has_branch) {
+ append_logical_end(BB_else);
+ /* branch from else block to endif block */
+ aco_ptr<Pseudo_branch_instruction> branch;
+ branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
+ BB_else->instructions.emplace_back(std::move(branch));
+ add_linear_edge(BB_else->index, &ic->BB_endif);
+ if (!ctx->cf_info.parent_loop.has_divergent_branch)
+ add_logical_edge(BB_else->index, &ic->BB_endif);
+ BB_else->kind |= block_kind_uniform;
+ }
+
+ ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
+ ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
+
+ /** emit endif merge block */
+ if (!ctx->cf_info.has_branch) {
+ ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
+ append_logical_start(ctx->block);
+ }
+}
+
static bool visit_if(isel_context *ctx, nir_if *if_stmt)
{
Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
Builder bld(ctx->program, ctx->block);
aco_ptr<Pseudo_branch_instruction> branch;
+ if_context ic;
if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
/**
* to the loop exit/entry block. Otherwise, it branches to the next
* merge block.
**/
- append_logical_end(ctx->block);
- ctx->block->kind |= block_kind_uniform;
- /* emit branch */
- assert(cond.regClass() == bld.lm);
// TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
+ assert(cond.regClass() == ctx->program->lane_mask);
cond = bool_to_scalar_condition(ctx, cond);
- branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
- branch->operands[0] = Operand(cond);
- branch->operands[0].setFixed(scc);
- ctx->block->instructions.emplace_back(std::move(branch));
-
- unsigned BB_if_idx = ctx->block->index;
- Block BB_endif = Block();
- BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
- BB_endif.kind |= ctx->block->kind & block_kind_top_level;
-
- /** emit then block */
- Block* BB_then = ctx->program->create_and_insert_block();
- BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
- add_edge(BB_if_idx, BB_then);
- append_logical_start(BB_then);
- ctx->block = BB_then;
+ begin_uniform_if_then(ctx, &ic, cond);
visit_cf_list(ctx, &if_stmt->then_list);
- BB_then = ctx->block;
- bool then_branch = ctx->cf_info.has_branch;
- bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
-
- if (!then_branch) {
- append_logical_end(BB_then);
- /* branch from then block to endif block */
- branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
- BB_then->instructions.emplace_back(std::move(branch));
- add_linear_edge(BB_then->index, &BB_endif);
- if (!then_branch_divergent)
- add_logical_edge(BB_then->index, &BB_endif);
- BB_then->kind |= block_kind_uniform;
- }
-
- ctx->cf_info.has_branch = false;
- ctx->cf_info.parent_loop.has_divergent_branch = false;
- /** emit else block */
- Block* BB_else = ctx->program->create_and_insert_block();
- BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
- add_edge(BB_if_idx, BB_else);
- append_logical_start(BB_else);
- ctx->block = BB_else;
+ begin_uniform_if_else(ctx, &ic);
visit_cf_list(ctx, &if_stmt->else_list);
- BB_else = ctx->block;
-
- if (!ctx->cf_info.has_branch) {
- append_logical_end(BB_else);
- /* branch from then block to endif block */
- branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
- BB_else->instructions.emplace_back(std::move(branch));
- add_linear_edge(BB_else->index, &BB_endif);
- if (!ctx->cf_info.parent_loop.has_divergent_branch)
- add_logical_edge(BB_else->index, &BB_endif);
- BB_else->kind |= block_kind_uniform;
- }
- ctx->cf_info.has_branch &= then_branch;
- ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
+ end_uniform_if(ctx, &ic);
- /** emit endif merge block */
- if (!ctx->cf_info.has_branch) {
- ctx->block = ctx->program->insert_block(std::move(BB_endif));
- append_logical_start(ctx->block);
- }
return !ctx->cf_info.has_branch;
} else { /* non-uniform condition */
/**
* *) Exceptions may be due to break and continue statements within loops
**/
- if_context ic;
-
begin_divergent_if_then(ctx, &ic, cond);
visit_cf_list(ctx, &if_stmt->then_list);
{
assert(ctx->stage == vertex_vs ||
ctx->stage == tess_eval_vs ||
- ctx->stage == gs_copy_vs);
+ ctx->stage == gs_copy_vs ||
+ ctx->stage == ngg_vertex_gs ||
+ ctx->stage == ngg_tess_eval_gs);
- int offset = ctx->stage == tess_eval_vs
+ int offset = (ctx->stage & sw_tes)
? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
: ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
uint64_t mask = ctx->outputs.mask[slot];
exp->enabled_mask = mask;
for (unsigned i = 0; i < 4; ++i) {
if (mask & (1 << i))
- exp->operands[i] = Operand(ctx->outputs.outputs[slot][i]);
+ exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
else
exp->operands[i] = Operand(v1);
}
for (unsigned i = 0; i < 4; ++i)
exp->operands[i] = Operand(v1);
if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
- exp->operands[0] = Operand(ctx->outputs.outputs[VARYING_SLOT_PSIZ][0]);
+ exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
exp->enabled_mask |= 0x1;
}
if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
- exp->operands[2] = Operand(ctx->outputs.outputs[VARYING_SLOT_LAYER][0]);
+ exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
exp->enabled_mask |= 0x4;
}
if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
if (ctx->options->chip_class < GFX9) {
- exp->operands[3] = Operand(ctx->outputs.outputs[VARYING_SLOT_VIEWPORT][0]);
+ exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
exp->enabled_mask |= 0x8;
} else {
Builder bld(ctx->program, ctx->block);
Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
- Operand(ctx->outputs.outputs[VARYING_SLOT_VIEWPORT][0]));
+ Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
if (exp->operands[2].isTemp())
out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
ctx->block->instructions.emplace_back(std::move(exp));
}
+static void create_export_phis(isel_context *ctx)
+{
+ /* Used when exports are needed, but the output temps are defined in a preceding block.
+ * This function sets up phis so that the outputs can be accessed in the current block.
+ */
+
+ assert(ctx->block->instructions.back()->opcode == aco_opcode::p_logical_start);
+ aco_ptr<Instruction> logical_start = aco_ptr<Instruction>(ctx->block->instructions.back().release());
+ ctx->block->instructions.pop_back();
+
+ Builder bld(ctx->program, ctx->block);
+
+ for (unsigned slot = 0; slot <= VARYING_SLOT_VAR31; ++slot) {
+ uint64_t mask = ctx->outputs.mask[slot];
+ for (unsigned i = 0; i < 4; ++i) {
+ if (!(mask & (1 << i)))
+ continue;
+
+ Temp old = ctx->outputs.temps[slot * 4 + i];
+ Temp phi = bld.pseudo(aco_opcode::p_phi, bld.def(v1), old, Operand(v1));
+ ctx->outputs.temps[slot * 4 + i] = phi;
+ }
+ }
+
+ bld.insert(std::move(logical_start));
+}
+
static void create_vs_exports(isel_context *ctx)
{
assert(ctx->stage == vertex_vs ||
ctx->stage == tess_eval_vs ||
- ctx->stage == gs_copy_vs);
+ ctx->stage == gs_copy_vs ||
+ ctx->stage == ngg_vertex_gs ||
+ ctx->stage == ngg_tess_eval_gs);
- radv_vs_output_info *outinfo = ctx->stage == tess_eval_vs
+ radv_vs_output_info *outinfo = (ctx->stage & sw_tes)
? &ctx->program->info->tes.outinfo
: &ctx->program->info->vs.outinfo;
- if (outinfo->export_prim_id) {
+ if (outinfo->export_prim_id && !(ctx->stage & hw_ngg_gs)) {
ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
- ctx->outputs.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id);
+ ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->vs_prim_id);
}
if (ctx->options->key.has_multiview_view_index) {
ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
- ctx->outputs.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
+ ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
}
/* the order these position exports are created is important */
}
for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
- if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
+ if (i < VARYING_SLOT_VAR0 &&
+ i != VARYING_SLOT_LAYER &&
i != VARYING_SLOT_PRIMITIVE_ID)
continue;
if (ctx->program->info->ps.writes_stencil) {
/* Stencil should be in X[23:16]. */
- values[0] = Operand(ctx->outputs.outputs[FRAG_RESULT_STENCIL][0]);
+ values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]);
enabled_channels |= 0x3;
}
if (ctx->program->info->ps.writes_sample_mask) {
/* SampleMask should be in Y[15:0]. */
- values[1] = Operand(ctx->outputs.outputs[FRAG_RESULT_SAMPLE_MASK][0]);
+ values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
enabled_channels |= 0xc;
}
} else {
if (ctx->program->info->ps.writes_z) {
- values[0] = Operand(ctx->outputs.outputs[FRAG_RESULT_DEPTH][0]);
+ values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
enabled_channels |= 0x1;
}
if (ctx->program->info->ps.writes_stencil) {
- values[1] = Operand(ctx->outputs.outputs[FRAG_RESULT_STENCIL][0]);
+ values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
enabled_channels |= 0x2;
}
if (ctx->program->info->ps.writes_sample_mask) {
- values[2] = Operand(ctx->outputs.outputs[FRAG_RESULT_SAMPLE_MASK][0]);
+ values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
enabled_channels |= 0x4;
}
}
for (unsigned i = 0; i < 4; ++i) {
if (write_mask & (1 << i)) {
- values[i] = Operand(ctx->outputs.outputs[slot][i]);
+ values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
} else {
values[i] = Operand(v1);
}
Builder bld(ctx->program, ctx->block);
bld.barrier(aco_opcode::p_memory_barrier_shared);
- unsigned workgroup_size = ctx->tcs_num_patches * ctx->shader->info.tess.tcs_vertices_out;
- if (unlikely(ctx->program->chip_class != GFX6 && workgroup_size > ctx->program->wave_size))
+ if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
bld.sopp(aco_opcode::s_barrier);
Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
}
assert(stride == 2 || stride == 4 || stride == 6);
- Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr);
+ Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
/* Store to offchip for TES to read - only if TES reads them */
bool all_undef = true;
assert(ctx->stage == vertex_vs || ctx->stage == gs_copy_vs);
for (unsigned i = 0; i < num_comps; i++) {
- out[i] = ctx->outputs.outputs[loc][start + i];
+ out[i] = ctx->outputs.temps[loc * 4 + start + i];
all_undef = all_undef && !out[i].id();
}
if (all_undef)
}
}
+Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ /* The s_bfm only cares about s0.u[5:0], so we need neither s_bfe nor s_and here */
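+ /* merged_wave_info packs the live-lane count of merged shader part i into byte i, so shift it down into the low byte first */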
+ Temp count = i == 0
+ ? get_arg(ctx, ctx->args->merged_wave_info)
+ : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
+ get_arg(ctx, ctx->args->merged_wave_info), Operand(i * 8u));
+
+ Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand(0u));
+ Temp cond;
+
+ if (ctx->program->wave_size == 64) {
+ /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
+ Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */));
+ cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64));
+ } else {
+ /* We use s_bfm_b64 (not _b32) so that a count of 32 works correctly; we then only need the lower half of the result */
+ cond = emit_extract_vector(ctx, mask, 0, bld.lm);
+ }
+
+ return cond;
+}
+
+bool ngg_early_prim_export(isel_context *ctx)
+{
+ /* TODO: Check edge flags, and if they are written, return false. (Needed for OpenGL, not for Vulkan.) */
+ return true;
+}
+
+void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ /* It is recommended to do the GS_ALLOC_REQ as soon and as quickly as possible, so we set the maximum priority (3). */
+ bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
+
+ /* Get the id of the current wave within the threadgroup (workgroup) */
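+ /* The second s_bfe operand encodes the field offset in its low bits and the field width in bits [22:16]:
+ * here we extract the 4-bit wave id that starts at bit 24. */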
+ Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+ get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
+
+ /* Execute the following code only on the first wave (wave id 0);
+ * use the SCC def to tell whether the wave id is zero or not.
+ */
+ Temp cond = wave_id_in_tg.def(1).getTemp();
+ if_context ic;
+ begin_uniform_if_then(ctx, &ic, cond);
+ begin_uniform_if_else(ctx, &ic);
+ bld.reset(ctx->block);
+
+ /* Number of vertices output by VS/TES */
+ Temp vtx_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+ get_arg(ctx, ctx->args->gs_tg_info), Operand(12u | (9u << 16u)));
+ /* Number of primitives output by VS/TES */
+ Temp prm_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+ get_arg(ctx, ctx->args->gs_tg_info), Operand(22u | (9u << 16u)));
+
+ /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
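+ /* m0 layout for GS_ALLOC_REQ: vertex count in bits [11:0], primitive count starting at bit 12 */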
+ Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u));
+ tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
+
+ /* Request the SPI to allocate space for the primitives and vertices that will be exported by the threadgroup. */
+ bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
+
+ /* After the GS_ALLOC_REQ is done, reset priority to default (0). */
+ bld.sopp(aco_opcode::s_setprio, -1u, 0x0u);
+
+ end_uniform_if(ctx, &ic);
+}
+
+Temp ngg_get_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[])
+{
+ Builder bld(ctx->program, ctx->block);
+
+ if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
+ return get_arg(ctx, ctx->args->gs_vtx_offset[0]);
+ }
+
+ Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
+ Temp tmp;
+
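+ /* Pack one 10-bit field per vertex into the export argument: the vertex index in bits [8:0] of field i
+ * (starting at bit 10*i) and its edge flag in bit 9 of that field. */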
+ for (unsigned i = 0; i < num_vertices; ++i) {
+ assert(vtxindex[i].id());
+
+ if (i)
+ tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp);
+ else
+ tmp = vtxindex[i];
+
+ /* The initial edge flag is always false in tess eval shaders. */
+ if (ctx->stage == ngg_vertex_gs) {
+ Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8 + i), Operand(1u));
+ tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp);
+ }
+ }
+
+ /* TODO: Set isnull field in case of merged NGG VS+GS. */
+
+ return tmp;
+}
+
+void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[])
+{
+ Builder bld(ctx->program, ctx->block);
+ Temp prim_exp_arg = ngg_get_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex);
+
+ bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
+ 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */,
+ false /* compressed */, true/* done */, false /* valid mask */);
+}
+
+void ngg_emit_nogs_gsthreads(isel_context *ctx)
+{
+ /* Emit the things that NGG GS threads need to do, for shaders that don't have SW GS.
+ * These must always come before VS exports.
+ *
+ * It is recommended to do these as early as possible. They can be at the beginning when
+ * there is no SW GS and the shader doesn't write edge flags.
+ */
+
+ if_context ic;
+ Temp is_gs_thread = merged_wave_info_to_mask(ctx, 1);
+ begin_divergent_if_then(ctx, &ic, is_gs_thread);
+
+ Builder bld(ctx->program, ctx->block);
+ constexpr unsigned max_vertices_per_primitive = 3;
+ unsigned num_vertices_per_primitive = max_vertices_per_primitive;
+
+ if (ctx->stage == ngg_vertex_gs) {
+ /* TODO: optimize for points & lines */
+ } else if (ctx->stage == ngg_tess_eval_gs) {
+ if (ctx->shader->info.tess.point_mode)
+ num_vertices_per_primitive = 1;
+ else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
+ num_vertices_per_primitive = 2;
+ } else {
+ unreachable("Unsupported NGG shader stage");
+ }
+
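+ /* The vertex indices arrive packed in 16-bit halves: vertices 0 and 1 in gs_vtx_offset[0],
+ * vertex 2 in the low half of gs_vtx_offset[2]. */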
+ Temp vtxindex[max_vertices_per_primitive];
+ vtxindex[0] = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
+ get_arg(ctx, ctx->args->gs_vtx_offset[0]));
+ vtxindex[1] = num_vertices_per_primitive < 2 ? Temp(0, v1) :
+ bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
+ get_arg(ctx, ctx->args->gs_vtx_offset[0]), Operand(16u), Operand(16u));
+ vtxindex[2] = num_vertices_per_primitive < 3 ? Temp(0, v1) :
+ bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
+ get_arg(ctx, ctx->args->gs_vtx_offset[2]));
+
+ /* Export primitive data to the index buffer. */
+ ngg_emit_prim_export(ctx, num_vertices_per_primitive, vtxindex);
+
+ /* Export primitive ID. */
+ if (ctx->stage == ngg_vertex_gs && ctx->args->options->key.vs_common_out.export_prim_id) {
+ /* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */
+ Temp prim_id = get_arg(ctx, ctx->args->ac.gs_prim_id);
+ Temp provoking_vtx_index = vtxindex[0];
+ Temp addr = bld.v_mul_imm(bld.def(v1), provoking_vtx_index, 4u);
+
+ store_lds(ctx, 4, prim_id, 0x1u, addr, 0u, 4u);
+ }
+
+ begin_divergent_if_else(ctx, &ic);
+ end_divergent_if(ctx, &ic);
+}
+
+void ngg_emit_nogs_output(isel_context *ctx)
+{
+ /* Emits NGG GS output, for stages that don't have SW GS. */
+
+ if_context ic;
+ Builder bld(ctx->program, ctx->block);
+ bool late_prim_export = !ngg_early_prim_export(ctx);
+
+ /* NGG streamout is currently disabled by default. */
+ assert(!ctx->args->shader_info->so.num_outputs);
+
+ if (late_prim_export) {
+ /* The VS output temps were written in a predecessor block. Emit phis to make them accessible in this block. */
+ create_export_phis(ctx);
+ /* Do what we need to do in the GS threads. */
+ ngg_emit_nogs_gsthreads(ctx);
+
+ /* What comes next should be executed on ES threads. */
+ Temp is_es_thread = merged_wave_info_to_mask(ctx, 0);
+ begin_divergent_if_then(ctx, &ic, is_es_thread);
+ bld.reset(ctx->block);
+ }
+
+ /* Export VS outputs */
+ ctx->block->kind |= block_kind_export_end;
+ create_vs_exports(ctx);
+
+ /* Export primitive ID */
+ if (ctx->args->options->key.vs_common_out.export_prim_id) {
+ Temp prim_id;
+
+ if (ctx->stage == ngg_vertex_gs) {
+ /* Wait for GS threads to store primitive ID in LDS. */
+ bld.barrier(aco_opcode::p_memory_barrier_shared);
+ bld.sopp(aco_opcode::s_barrier);
+
+ /* Calculate LDS address where the GS threads stored the primitive ID. */
+ Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+ get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
+ Temp thread_id_in_wave = emit_mbcnt(ctx, bld.def(v1));
+ Temp wave_id_mul = bld.v_mul_imm(bld.def(v1), as_vgpr(ctx, wave_id_in_tg), ctx->program->wave_size);
+ Temp thread_id_in_tg = bld.vadd32(bld.def(v1), Operand(wave_id_mul), Operand(thread_id_in_wave));
+ Temp addr = bld.v_mul_imm(bld.def(v1), thread_id_in_tg, 4u);
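+ /* This mirrors the store in ngg_emit_nogs_gsthreads: the GS threads wrote each primitive ID at
+ * provoking_vtx_index * 4, and that index is the provoking vertex's ES thread id within the threadgroup. */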
+
+ /* Load primitive ID from LDS. */
+ prim_id = load_lds(ctx, 4, bld.tmp(v1), addr, 0u, 4u);
+ } else if (ctx->stage == ngg_tess_eval_gs) {
+ /* TES: Just use the patch ID as the primitive ID. */
+ prim_id = get_arg(ctx, ctx->args->ac.tes_patch_id);
+ } else {
+ unreachable("unsupported NGG shader stage.");
+ }
+
+ ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
+ ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = prim_id;
+
+ export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, nullptr);
+ }
+
+ if (late_prim_export) {
+ begin_divergent_if_else(ctx, &ic);
+ end_divergent_if(ctx, &ic);
+ bld.reset(ctx->block);
+ }
+}
+
void select_program(Program *program,
unsigned shader_count,
struct nir_shader *const *shaders,
{
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
if_context ic_merged_wave_info;
+ bool ngg_no_gs = ctx.stage == ngg_vertex_gs || ctx.stage == ngg_tess_eval_gs;
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];
split_arguments(&ctx, startpgm);
}
+ if (ngg_no_gs) {
+ ngg_emit_sendmsg_gs_alloc_req(&ctx);
+
+ if (ngg_early_prim_export(&ctx))
+ ngg_emit_nogs_gsthreads(&ctx);
+ }
+
/* In a merged VS+TCS HS, the VS implementation can be completely empty. */
nir_function_impl *func = nir_shader_get_entrypoint(nir);
bool empty_shader = nir_cf_list_is_empty_block(&func->body) &&
(nir->info.stage == MESA_SHADER_TESS_EVAL &&
ctx.stage == tess_eval_geometry_gs));
- bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader);
+ bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : ((shader_count >= 2 && !empty_shader) || ngg_no_gs);
bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
if (check_merged_wave_info) {
- Builder bld(ctx.program, ctx.block);
-
- /* The s_bfm only cares about s0.u[5:0] so we don't need either s_bfe nor s_and here */
- Temp count = i == 0 ? get_arg(&ctx, args->merged_wave_info)
- : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
- get_arg(&ctx, args->merged_wave_info), Operand(i * 8u));
-
- Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand(0u));
- Temp cond;
-
- if (ctx.program->wave_size == 64) {
- /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
- Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */));
- cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64));
- } else {
- /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of the register */
- cond = emit_extract_vector(&ctx, mask, 0, bld.lm);
- }
-
+ Temp cond = merged_wave_info_to_mask(&ctx, i);
begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
}
visit_cf_list(&ctx, &func->body);
- if (ctx.program->info->so.num_outputs && (ctx.stage == vertex_vs || ctx.stage == tess_eval_vs))
+ if (ctx.program->info->so.num_outputs && (ctx.stage & hw_vs))
emit_streamout(&ctx, 0);
- if (ctx.stage == vertex_vs || ctx.stage == tess_eval_vs) {
+ if (ctx.stage & hw_vs) {
create_vs_exports(&ctx);
+ ctx.block->kind |= block_kind_export_end;
+ } else if (ngg_no_gs && ngg_early_prim_export(&ctx)) {
+ ngg_emit_nogs_output(&ctx);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
Builder bld(ctx.program, ctx.block);
bld.barrier(aco_opcode::p_memory_barrier_gs_data);
write_tcs_tess_factors(&ctx);
}
- if (ctx.stage == fragment_fs)
+ if (ctx.stage == fragment_fs) {
create_fs_exports(&ctx);
+ ctx.block->kind |= block_kind_export_end;
+ }
if (endif_merged_wave_info) {
begin_divergent_if_else(&ctx, &ic_merged_wave_info);
end_divergent_if(&ctx, &ic_merged_wave_info);
}
+ if (ngg_no_gs && !ngg_early_prim_export(&ctx))
+ ngg_emit_nogs_output(&ctx);
+
ralloc_free(ctx.divergent_vals);
+
+ if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
+ /* Outputs of the previous stage are inputs to the next stage */
+ ctx.inputs = ctx.outputs;
+ ctx.outputs = shader_io_state();
+ }
}
program->config->float_mode = program->blocks[0].fp_mode.val;
append_logical_end(ctx.block);
- ctx.block->kind |= block_kind_uniform | block_kind_export_end;
+ ctx.block->kind |= block_kind_uniform;
Builder bld(ctx.program, ctx.block);
if (ctx.program->wb_smem_l1_on_end)
bld.smem(aco_opcode::s_dcache_wb, false);
mubuf->can_reorder = true;
ctx.outputs.mask[i] |= 1 << j;
- ctx.outputs.outputs[i][j] = mubuf->definitions[0].getTemp();
+ ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
bld.insert(std::move(mubuf));