return insert(std::move(sub));
}
+ /* Builds a v_readlane instruction with operands (vsrc, lane) writing dst.
+  * The opcode is picked by chip class: the VOP2 opcode below GFX8,
+  * the VOP3 (_e64) opcode on GFX8 and newer. */
+ Result readlane(Definition dst, Op vsrc, Op lane)
+ {
+ if (program->chip_class < GFX8)
+ return vop2(aco_opcode::v_readlane_b32, dst, vsrc, lane);
+ return vop3(aco_opcode::v_readlane_b32_e64, dst, vsrc, lane);
+ }
+ /* Builds a v_writelane instruction with operands (val, lane, vsrc) writing dst.
+  * vsrc is passed as the third operand in both encodings.
+  * The opcode is picked by chip class: the VOP2 opcode below GFX8,
+  * the VOP3 (_e64) opcode on GFX8 and newer. */
+ Result writelane(Definition dst, Op val, Op lane, Op vsrc)
+ {
+ if (program->chip_class < GFX8)
+ return vop2(aco_opcode::v_writelane_b32, dst, val, lane, vsrc);
+ return vop3(aco_opcode::v_writelane_b32_e64, dst, val, lane, vsrc);
+ }
<%
import itertools
formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]),
return true;
if (instr->isVOP3() && instr->definitions.size() == 2)
return true;
- if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32)
+ if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
+ instr->opcode == aco_opcode::v_readlane_b32 ||
+ instr->opcode == aco_opcode::v_readlane_b32_e64)
return true;
return false;
}
switch (instr->opcode) {
case aco_opcode::v_readlane_b32:
- case aco_opcode::v_writelane_b32: {
+ case aco_opcode::v_readlane_b32_e64:
+ case aco_opcode::v_writelane_b32:
+ case aco_opcode::v_writelane_b32_e64: {
if (ctx.VALU_wrsgpr + 4 < new_idx)
break;
PhysReg reg = instr->operands[1].physReg();
}
if (instr->opcode == aco_opcode::v_readlane_b32 ||
- instr->opcode == aco_opcode::v_writelane_b32)
+ instr->opcode == aco_opcode::v_readlane_b32_e64 ||
+ instr->opcode == aco_opcode::v_writelane_b32 ||
+ instr->opcode == aco_opcode::v_writelane_b32_e64)
return false;
return true;
static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
{
if (index.regClass() == s1)
- return bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), data, index);
+ return bld.readlane(bld.def(s1), data, index);
Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
if (dst.regClass() == v1) {
/* src2 is ignored for writelane. RA assigns the same reg for dst */
- emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
+ emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
} else if (dst.regClass() == v2) {
Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
- Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_hi));
- Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
+ /* The writelane definition gets the register of its last operand, so each
+  * half must be paired with the matching half of src (lo with src_lo, hi with src_hi). */
+ Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
+ Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
} else {
if (cluster_size == 64) {
for (unsigned i = 0; i < src.size(); i++)
- bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+ bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
}
} else if (cluster_size == 32) {
if (ctx->program->wave_size == 64) {
/* fill in the gap in row 2 */
for (unsigned i = 0; i < src.size(); i++) {
- bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
- bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
+ bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+ bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
}
}
std::swap(tmp, vtmp);
if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take case of this overwise */
if (ctx->program->chip_class < GFX10)
assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
- bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
- identity[i], Operand(0u));
+ bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1));
}
}
/* fall through */
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
for (unsigned i = 0; i < src.size(); i++)
- bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+ bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
}
} else {
if (op == aco_opcode::p_reduce && dst.regClass().type() == RegType::sgpr) {
for (unsigned k = 0; k < src.size(); k++) {
- bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{dst.physReg() + k}, s1),
- Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
+ bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1),
+ Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
}
} else if (!(dst.physReg() == tmp) && !dst_written) {
for (unsigned k = 0; k < src.size(); k++) {
case aco_opcode::p_spill:
{
assert(instr->operands[0].regClass() == v1.as_linear());
- for (unsigned i = 0; i < instr->operands[2].size(); i++) {
- bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1, instr->operands[0].physReg()),
- Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
- Operand(instr->operands[1].constantValue() + i));
- }
+ for (unsigned i = 0; i < instr->operands[2].size(); i++)
+ bld.writelane(bld.def(v1, instr->operands[0].physReg()),
+ Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
+ Operand(instr->operands[1].constantValue() + i),
+ instr->operands[0]);
break;
}
case aco_opcode::p_reload:
{
assert(instr->operands[0].regClass() == v1.as_linear());
- for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
- bld.vop3(aco_opcode::v_readlane_b32,
- bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
- instr->operands[0], Operand(instr->operands[1].constantValue() + i));
- }
+ for (unsigned i = 0; i < instr->definitions[0].size(); i++)
+ bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
+ instr->operands[0],
+ Operand(instr->operands[1].constantValue() + i));
break;
}
case aco_opcode::p_as_uniform:
VOP2 = {
# GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers
(0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32", False),
+ (0x01, 0x01, -1, -1, -1, "v_readlane_b32", False),
+ (0x02, 0x02, -1, -1, -1, "v_writelane_b32", False),
(0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True),
(0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True),
(0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True),
( -1, -1, 0x276, 0x276, -1, "v_interp_p2_legacy_f16", True, True),
( -1, -1, -1, 0x277, 0x35a, "v_interp_p2_f16", True, True),
(0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True),
- (0x101, 0x101, 0x289, 0x289, 0x360, "v_readlane_b32", False, False),
- (0x102, 0x102, 0x28a, 0x28a, 0x361, "v_writelane_b32", False, False),
+ ( -1, -1, 0x289, 0x289, 0x360, "v_readlane_b32_e64", False, False),
+ ( -1, -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False),
(0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False),
(0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False),
(0x124, 0x124, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32", False, False),
return instr->opcode != aco_opcode::v_madmk_f32 &&
instr->opcode != aco_opcode::v_madak_f32 &&
instr->opcode != aco_opcode::v_madmk_f16 &&
- instr->opcode != aco_opcode::v_madak_f16;
+ instr->opcode != aco_opcode::v_madak_f16 &&
+ instr->opcode != aco_opcode::v_readlane_b32 &&
+ instr->opcode != aco_opcode::v_writelane_b32 &&
+ instr->opcode != aco_opcode::v_readfirstlane_b32;
}
bool can_apply_sgprs(aco_ptr<Instruction>& instr)
{
return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
instr->opcode != aco_opcode::v_readlane_b32 &&
- instr->opcode != aco_opcode::v_writelane_b32;
+ instr->opcode != aco_opcode::v_readlane_b32_e64 &&
+ instr->opcode != aco_opcode::v_writelane_b32 &&
+ instr->opcode != aco_opcode::v_writelane_b32_e64; /* returns false for readfirstlane and for readlane/writelane in either encoding (plain or _e64), true for every other opcode */
}
void to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
case aco_opcode::v_interp_p2_f32:
case aco_opcode::v_mac_f32:
case aco_opcode::v_writelane_b32:
+ case aco_opcode::v_writelane_b32_e64:
case aco_opcode::v_cndmask_b32:
return operand != 2;
case aco_opcode::s_addk_i32:
case aco_opcode::p_extract_vector:
case aco_opcode::p_split_vector:
case aco_opcode::v_readlane_b32:
+ case aco_opcode::v_readlane_b32_e64:
case aco_opcode::v_readfirstlane_b32:
return operand != 0;
default:
bool valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
{
- if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32)
+ if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 ||
+ instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) /* readlane/writelane, either encoding: every operand index except 1 is reported as accepting a VGPR */
return operand != 1;
return true;
}
}
if (info.is_constant() && can_accept_constant(instr, i)) {
perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get());
- if (i == 0) {
+ if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) {
instr->operands[i] = Operand(info.val);
continue;
} else if (!instr->isVOP3() && can_swap_operands(instr)) {
/* handle definitions which must have the same register as an operand */
if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
instr->opcode == aco_opcode::v_mac_f32 ||
- instr->opcode == aco_opcode::v_writelane_b32) {
+ instr->opcode == aco_opcode::v_writelane_b32 ||
+ instr->opcode == aco_opcode::v_writelane_b32_e64) {
instr->definitions[0].setFixed(instr->operands[2].physReg());
} else if (instr->opcode == aco_opcode::s_addk_i32 ||
instr->opcode == aco_opcode::s_mulk_i32) {
check(instr->definitions[0].getTemp().type() == RegType::vgpr ||
(int) instr->format & (int) Format::VOPC ||
instr->opcode == aco_opcode::v_readfirstlane_b32 ||
- instr->opcode == aco_opcode::v_readlane_b32,
+ instr->opcode == aco_opcode::v_readlane_b32 ||
+ instr->opcode == aco_opcode::v_readlane_b32_e64,
"Wrong Definition type for VALU instruction", instr.get());
unsigned num_sgpr = 0;
unsigned sgpr_idx = instr->operands.size();
- for (unsigned i = 0; i < instr->operands.size(); i++)
- {
+ for (unsigned i = 0; i < instr->operands.size(); i++) {
+ const bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
+ instr->opcode == aco_opcode::v_writelane_b32_e64;
+ if (is_writelane ||
+ instr->opcode == aco_opcode::v_readfirstlane_b32 ||
+ instr->opcode == aco_opcode::v_readlane_b32 ||
+ instr->opcode == aco_opcode::v_readlane_b32_e64) {
+ check(!instr->operands[i].isLiteral(), "No literal allowed on VALU instruction", instr.get());
+ /* readfirstlane/readlane: every operand except the lane select (index 1) must be a v1.
+  * writelane: operand 0 is the SGPR/constant value to write and operand 1 the lane
+  * select, so only the trailing operand (index 2) is required to be a v1. */
+ const bool must_be_vgpr = is_writelane ? i == 2 : i != 1;
+ check(!must_be_vgpr || (instr->operands[i].isTemp() && instr->operands[i].regClass() == v1), "Wrong Operand type for VALU instruction", instr.get());
+ continue;
+ }
if (instr->operands[i].isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) {
check(i != 1 || (int) instr->format & (int) Format::VOP3A, "Wrong source position for SGPR argument", instr.get());