SMEM does the addition with 64-bits, not 32. So if the original code
relied on wrapping around (for example, for subtraction), it would break.
Apparently swizzled MUBUF accesses also have issues with combining
additions that could overflow. Normal MUBUF accesses seem fine.
fossil-db (Navi):
Totals from 27219 (20.02% of 135946) affected shaders:
CodeSize:
128303256 ->
131062756 (+2.15%); split: -0.00%, +2.15%
Instrs:
24818911 ->
25280558 (+1.86%); split: -0.01%, +1.87%
VMEM:
162311926 ->
177226874 (+9.19%); split: +9.36%, -0.17%
SMEM:
18182559 ->
20218734 (+11.20%); split: +11.53%, -0.34%
VClause: 423635 -> 424398 (+0.18%); split: -0.02%, +0.20%
SClause: 865384 ->
1104986 (+27.69%); split: -0.00%, +27.69%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/2748
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/2720>
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
-
dEQP-VK.rasterization.flatshading.line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_lines_wide
dEQP-VK.rasterization.flatshading.line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_lines_wide
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
-
dEQP-VK.rasterization.flatshading.line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_lines_wide
dEQP-VK.rasterization.flatshading.line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_lines_wide
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
-
dEQP-VK.rasterization.flatshading.line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_lines_wide
dEQP-VK.rasterization.flatshading.line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_lines_wide
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
-
dEQP-VK.rasterization.flatshading.line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_lines_wide
dEQP-VK.rasterization.flatshading.line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
dEQP-VK.rasterization.flatshading.non_strict_lines_wide
("vopc_e64", [Format.VOPC, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2])),
("flat", [Format.FLAT], 'FLAT_instruction', [(0, 3), (1, 2)]),
("global", [Format.GLOBAL], 'FLAT_instruction', [(0, 3), (1, 2)])]
("vopc_e64", [Format.VOPC, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2])),
("flat", [Format.FLAT], 'FLAT_instruction', [(0, 3), (1, 2)]),
("global", [Format.GLOBAL], 'FLAT_instruction', [(0, 3), (1, 2)])]
+formats = [(f if len(f) == 5 else f + ('',)) for f in formats]
-% for name, formats, struct, shapes in formats:
+% for name, formats, struct, shapes, extra_field_setup in formats:
% for num_definitions, num_operands in shapes:
<%
args = ['aco_opcode opcode']
% for num_definitions, num_operands in shapes:
<%
args = ['aco_opcode opcode']
% endfor
${f.get_builder_initialization(num_operands)}
% endfor
% endfor
${f.get_builder_initialization(num_operands)}
% endfor
mubuf->barrier = info->barrier;
mubuf->can_reorder = info->can_reorder;
mubuf->offset = const_offset;
mubuf->barrier = info->barrier;
mubuf->can_reorder = info->can_reorder;
mubuf->offset = const_offset;
+ mubuf->swizzled = info->swizzle_component_size != 0;
RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
mubuf->definitions[0] = Definition(val);
RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
mubuf->definitions[0] = Definition(val);
}
void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
}
void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
- unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
+ unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false,
+ bool swizzled = false)
{
assert(vdata.id());
assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
{
assert(vdata.id());
assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
- /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
- /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
+ /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
+ /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
+ /* dlc*/ false, /* slc */ slc);
static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
}
static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
}
for (unsigned i = 0; i < write_count; i++) {
unsigned const_offset = offsets[i] + base_const_offset;
for (unsigned i = 0; i < write_count; i++) {
unsigned const_offset = offsets[i] + base_const_offset;
- emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
+ emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc, !allow_combining);
if (use_mubuf) {
Instruction *mubuf = bld.mubuf(opcode,
Definition(fetch_dst), list, fetch_index, soffset,
if (use_mubuf) {
Instruction *mubuf = bld.mubuf(opcode,
Definition(fetch_dst), list, fetch_index, soffset,
- fetch_offset, false, true).instr;
+ fetch_offset, false, false, true).instr;
static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
} else {
Instruction *mtbuf = bld.mtbuf(opcode,
static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
} else {
Instruction *mtbuf = bld.mtbuf(opcode,
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
- bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
+ bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
bld.mubuf(aco_opcode::buffer_store_dword,
/* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
bld.mubuf(aco_opcode::buffer_store_dword,
/* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
- /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
- /* disable_wqm */ false, /* glc */ true);
+ /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen*/ false,
+ /* addr64 */ false, /* disable_wqm */ false, /* glc */ true);
tf_const_offset += 4;
begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
tf_const_offset += 4;
begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
bool lds : 1; /* Return read-data to LDS instead of VGPRs */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
bool can_reorder : 1;
bool lds : 1; /* Return read-data to LDS instead of VGPRs */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
bool can_reorder : 1;
+ bool swizzled:1;
+ uint8_t padding : 1;
barrier_interaction barrier;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
barrier_interaction barrier;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
elif self == Format.MUBUF:
return [('unsigned', 'offset', None),
('bool', 'offen', None),
elif self == Format.MUBUF:
return [('unsigned', 'offset', None),
('bool', 'offen', None),
+ ('bool', 'swizzled', 'false'),
('bool', 'idxen', 'false'),
('bool', 'addr64', 'false'),
('bool', 'disable_wqm', 'false'),
('bool', 'idxen', 'false'),
('bool', 'addr64', 'false'),
('bool', 'disable_wqm', 'false'),
-bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset)
+bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow)
+ if (prevent_overflow)
+ return false; //TODO
+
Operand op = instr->operands[op_index];
if (!op.isTemp())
Operand op = instr->operands[op_index];
if (!op.isTemp())
continue;
uint32_t offset2 = 0;
continue;
uint32_t offset2 = 0;
- if (parse_base_offset(ctx, add_instr, !i, base, &offset2)) {
+ if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {
*offset += offset2;
} else {
*base = add_instr->operands[!i].getTemp();
*offset += offset2;
} else {
*base = add_instr->operands[!i].getTemp();
while (info.is_temp())
info = ctx.info[info.temp.id()];
while (info.is_temp())
info = ctx.info[info.temp.id()];
+ /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
+ * overflow for scratch accesses works only on GFX9+ and saddr overflow
+ * never works. Since swizzling is the only thing that separates
+ * scratch accesses and other accesses and swizzling changing how
+ * addressing works significantly, this probably applies to swizzled
+ * MUBUF accesses. */
+ bool vaddr_prevent_overflow = mubuf->swizzled && ctx.program->chip_class < GFX9;
+ bool saddr_prevent_overflow = mubuf->swizzled;
+
if (mubuf->offen && i == 1 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) {
assert(!mubuf->idxen);
instr->operands[1] = Operand(v1);
if (mubuf->offen && i == 1 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) {
assert(!mubuf->idxen);
instr->operands[1] = Operand(v1);
instr->operands[2] = Operand((uint32_t) 0);
mubuf->offset += info.val;
continue;
instr->operands[2] = Operand((uint32_t) 0);
mubuf->offset += info.val;
continue;
- } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) {
+ } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, vaddr_prevent_overflow) &&
+ base.regClass() == v1 && mubuf->offset + offset < 4096) {
assert(!mubuf->idxen);
instr->operands[1].setTemp(base);
mubuf->offset += offset;
continue;
assert(!mubuf->idxen);
instr->operands[1].setTemp(base);
mubuf->offset += offset;
continue;
- } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) {
+ } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, saddr_prevent_overflow) &&
+ base.regClass() == s1 && mubuf->offset + offset < 4096) {
instr->operands[i].setTemp(base);
mubuf->offset += offset;
continue;
instr->operands[i].setTemp(base);
mubuf->offset += offset;
continue;
uint32_t offset;
bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;
if (has_usable_ds_offset &&
uint32_t offset;
bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;
if (has_usable_ds_offset &&
- i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) &&
+ i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
base.regClass() == instr->operands[i].regClass() &&
instr->opcode != aco_opcode::ds_swizzle_b32) {
if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 ||
base.regClass() == instr->operands[i].regClass() &&
instr->opcode != aco_opcode::ds_swizzle_b32) {
if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 ||
(ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {
instr->operands[i] = Operand(info.val);
continue;
(ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {
instr->operands[i] = Operand(info.val);
continue;
- } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
+ } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
if (soe &&
(!ctx.info[smem->operands.back().tempId()].is_constant_or_literal(32) ||
bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
if (soe &&
(!ctx.info[smem->operands.back().tempId()].is_constant_or_literal(32) ||
split->definitions[i] = bld.def(v1);
bld.insert(split);
for (unsigned i = 0; i < temp.size(); i++)
split->definitions[i] = bld.def(v1);
bld.insert(split);
for (unsigned i = 0; i < temp.size(); i++)
- bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false);
+ bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true);
- bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false);
+ bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true);
}
} else {
ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
}
} else {
ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
for (unsigned i = 0; i < def.size(); i++) {
Temp tmp = bld.tmp(v1);
vec->operands[i] = Operand(tmp);
for (unsigned i = 0; i < def.size(); i++) {
Temp tmp = bld.tmp(v1);
vec->operands[i] = Operand(tmp);
- bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false);
+ bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true);
}
bld.insert(vec);
} else {
}
bld.insert(vec);
} else {
- bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false);
+ bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true);
}
} else {
uint32_t spill_slot = slots[spill_id];
}
} else {
uint32_t spill_slot = slots[spill_id];