offset = Operand(0u);
}
- unsigned num_components = dst.bytes() / component_size;
+ unsigned num_components = vec.bytes() / component_size;
if (vec.regClass() == dst.regClass()) {
assert(offset.constantValue() == 0);
bld.copy(Definition(dst), vec);
return;
}
- emit_split_vector(ctx, vec, vec.bytes() / component_size);
+ emit_split_vector(ctx, vec, num_components);
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
assert(offset.constantValue() % component_size == 0);
unsigned skip = offset.constantValue() / component_size;
- for (unsigned i = 0; i < num_components; i++)
- elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);
+ for (unsigned i = skip; i < num_components; i++)
+ elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
/* if dst is a VGPR, split the src and create a shrunk version according to the mask. */
if (dst.type() == RegType::vgpr) {
+ num_components = dst.bytes() / component_size;
aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
for (unsigned i = 0; i < num_components; i++)
create_vec->operands[i] = Operand(elems[i]);
sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
sop2->definitions[0] = Definition(dst);
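+ /* propagate NIR's no-unsigned-wrap flag so later passes may assume the result doesn't wrap */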
+ if (instr->no_unsigned_wrap)
+ sop2->definitions[0].setNUW(true);
if (writes_scc)
sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
ctx->block->instructions.emplace_back(std::move(sop2));
bool glc = false;
unsigned swizzle_component_size = 0;
- barrier_interaction barrier = barrier_none;
- bool can_reorder = true;
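+ /* describes how this access synchronizes with other memory accesses */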
+ memory_sync_info sync;
Temp soffset = Temp(0, s1);
};
/* align offset down if needed */
Operand aligned_offset = offset;
+ unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
if (need_to_align_offset) {
+ align = 4;
Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
if (offset.isConstant()) {
aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
bld.copy(bld.def(s1), aligned_offset);
- unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
reduced_const_offset, byte_align ? Temp() : info->dst);
if (num_tmps > 1) {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
- for (unsigned i = 0; i < num_vals; i++)
+ for (unsigned i = 0; i < num_tmps; i++)
vec->operands[i] = Operand(tmp[i]);
tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
vec->definitions[0] = Definition(tmp[0]);
RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
+ Instruction *instr;
if (read2)
- bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
+ instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
else
- bld.ds(op, Definition(val), offset, m, const_offset);
+ instr = bld.ds(op, Definition(val), offset, m, const_offset);
+ static_cast<DS_instruction *>(instr)->sync = info->sync;
if (size < 4)
val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
load->definitions[0] = Definition(val);
load->glc = info->glc;
load->dlc = info->glc && bld.program->chip_class >= GFX10;
- load->barrier = info->barrier;
- load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
+ load->sync = info->sync;
bld.insert(std::move(load));
return val;
}
unsigned bytes_size = 0;
aco_opcode op;
- if (bytes_needed == 1) {
+ if (bytes_needed == 1 || align_ % 2) {
bytes_size = 1;
op = aco_opcode::buffer_load_ubyte;
- } else if (bytes_needed == 2) {
+ } else if (bytes_needed == 2 || align_ % 4) {
bytes_size = 2;
op = aco_opcode::buffer_load_ushort;
} else if (bytes_needed <= 4) {
mubuf->offen = (offset.type() == RegType::vgpr);
mubuf->glc = info->glc;
mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
- mubuf->barrier = info->barrier;
- mubuf->can_reorder = info->can_reorder;
+ mubuf->sync = info->sync;
mubuf->offset = const_offset;
- RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+ mubuf->swizzled = info->swizzle_component_size != 0;
+ RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
mubuf->definitions[0] = Definition(val);
bld.insert(std::move(mubuf));
}
static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
+static auto emit_scratch_load = emit_load<mubuf_load_callback, false, true, 4096>;
Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
{
mubuf->offset = 0;
mubuf->addr64 = offset.type() == RegType::vgpr;
mubuf->disable_wqm = false;
- mubuf->barrier = info->barrier;
+ mubuf->sync = info->sync;
mubuf->definitions[0] = Definition(val);
bld.insert(std::move(mubuf));
} else {
flat->operands[1] = Operand(s1);
flat->glc = info->glc;
flat->dlc = info->glc && bld.program->chip_class >= GFX10;
- flat->barrier = info->barrier;
+ flat->sync = info->sync;
flat->offset = 0u;
flat->definitions[0] = Definition(val);
bld.insert(std::move(flat));
LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
info.align_mul = align;
info.align_offset = 0;
- info.barrier = barrier_shared;
- info.can_reorder = false;
+ info.sync = memory_sync_info(storage_shared);
info.const_offset = base_offset;
emit_lds_load(ctx, bld, &info);
/* use allocated_vec if possible */
auto it = ctx->allocated_vec.find(src.id());
if (it != ctx->allocated_vec.end()) {
- unsigned total_size = 0;
- for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++)
- total_size += it->second[i].bytes();
- if (total_size != src.bytes())
+ if (!it->second[0].id())
goto split;
-
unsigned elem_size = it->second[0].bytes();
+ assert(src.bytes() % elem_size == 0);
+
+ for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
+ if (!it->second[i].id())
+ goto split;
+ }
for (unsigned i = 0; i < count; i++) {
if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
}
}
+ split:
+
if (dst_type == RegType::sgpr)
src = bld.as_uniform(src);
- split:
/* just split it */
aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
split->operands[0] = Operand(src);
}
assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
+ Instruction *instr;
if (write2) {
Temp second_data = write_datas[second];
inline_offset /= data.bytes();
- bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
+ instr = bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
} else {
- bld.ds(op, address_offset, data, m, inline_offset);
+ instr = bld.ds(op, address_offset, data, m, inline_offset);
}
+ static_cast<DS_instruction *>(instr)->sync =
+ memory_sync_info(storage_shared);
}
}
}
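+/* Emit a single MUBUF store. 'sync' tells the scheduler how the store
+ * synchronizes with other memory accesses; 'swizzled' marks stores into
+ * swizzled buffers, which must not be combined with neighbouring stores. */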
void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
- unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
+ unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(),
+ bool slc = false, bool swizzled = false)
{
assert(vdata.id());
assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
- /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
- /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
+ /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
+ /* idxen */ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
+ /* dlc */ false, /* slc */ slc);
- static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
+ static_cast<MUBUF_instruction *>(r.instr)->sync = sync;
}
void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
- bool allow_combining = true, bool reorder = true, bool slc = false)
+ bool allow_combining = true, memory_sync_info sync = memory_sync_info(), bool slc = false)
{
Builder bld(ctx->program, ctx->block);
assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
for (unsigned i = 0; i < write_count; i++) {
unsigned const_offset = offsets[i] + base_const_offset;
- emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
+ emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync, slc, !allow_combining);
}
}
/* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
- store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
+ store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, memory_sync_info(), true);
} else {
Temp lds_base;
Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
- store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false);
+ store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, memory_sync_info(storage_vmem_output));
}
if (write_to_lds) {
}
if (use_mubuf) {
- Instruction *mubuf = bld.mubuf(opcode,
- Definition(fetch_dst), list, fetch_index, soffset,
- fetch_offset, false, true).instr;
- static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
+ bld.mubuf(opcode,
+ Definition(fetch_dst), list, fetch_index, soffset,
+ fetch_offset, false, false, true);
} else {
- Instruction *mtbuf = bld.mtbuf(opcode,
- Definition(fetch_dst), list, fetch_index, soffset,
- fetch_dfmt, nfmt, fetch_offset, false, true).instr;
- static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
+ bld.mtbuf(opcode,
+ Definition(fetch_dst), list, fetch_index, soffset,
+ fetch_dfmt, nfmt, fetch_offset, false, true);
}
emit_split_vector(ctx, fetch_dst, fetch_dst.size());
void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
- bool glc=false, bool readonly=true, bool allow_smem=true)
+ bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info())
{
Builder bld(ctx->program, ctx->block);
LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
info.glc = glc;
- info.barrier = readonly ? barrier_none : barrier_buffer;
- info.can_reorder = readonly;
+ info.sync = sync;
info.align_mul = align_mul;
info.align_offset = align_offset;
if (use_smem)
Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
if (offset != 0) // TODO check if index != 0 as well
- index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
+ index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
Temp vec = dst;
bool trim = false;
unreachable("unimplemented or forbidden load_push_constant.");
}
- bld.smem(op, Definition(vec), ptr, index);
+ static_cast<SMEM_instruction*>(bld.smem(op, Definition(vec), ptr, index).instr)->prevent_overflow = true;
if (!aligned) {
Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
if (base && offset.type() == RegType::sgpr)
- offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
+ offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
else if (base && offset.type() == RegType::vgpr)
offset = bld.vadd32(bld.def(v1), Operand(base), offset);
load->unrm = true;
load->da = da;
load->dim = dim;
- load->can_reorder = true; /* fmask images shouldn't be modified */
ctx->block->instructions.emplace_back(std::move(load));
Operand sample_index4;
}
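+/* Derive a memory_sync_info from a NIR intrinsic's access flags: volatile
+ * accesses become semantic_volatile, and accesses NIR already proved safe to
+ * reorder become semantic_can_reorder | semantic_private. */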
+memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics)
+{
+ /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
+ if (semantics & semantic_atomicrmw)
+ return memory_sync_info(storage, semantics);
+
+ unsigned access = nir_intrinsic_access(instr);
+
+ if (access & ACCESS_VOLATILE)
+ semantics |= semantic_volatile;
+ if (access & ACCESS_CAN_REORDER)
+ semantics |= semantic_can_reorder | semantic_private;
+
+ return memory_sync_info(storage, semantics);
+}
+
void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
{
Builder bld(ctx->program, ctx->block);
bool is_array = glsl_sampler_type_is_array(type);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
+ unsigned access = var->data.access | nir_intrinsic_access(instr);
+
if (dim == GLSL_SAMPLER_DIM_BUF) {
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
unsigned num_channels = util_last_bit(mask);
tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
load->definitions[0] = Definition(tmp);
load->idxen = true;
- load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
+ load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
- load->barrier = barrier_image;
+ load->sync = sync;
ctx->block->instructions.emplace_back(std::move(load));
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
load->operands[1] = Operand(s4); /* no sampler */
load->operands[2] = Operand(coords);
load->definitions[0] = Definition(tmp);
- load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
+ load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
load->dmask = dmask;
load->unrm = true;
load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
- load->barrier = barrier_image;
+ load->sync = sync;
ctx->block->instructions.emplace_back(std::move(load));
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
bool is_array = glsl_sampler_type_is_array(type);
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
- bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
+ unsigned access = var->data.access | nir_intrinsic_access(instr);
+ bool glc = ctx->options->chip_class == GFX6 || access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_image;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
return;
store->unrm = true;
store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
store->disable_wqm = true;
- store->barrier = barrier_image;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
return;
}
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
mubuf->glc = return_previous;
mubuf->dlc = false; /* Not needed for atomics */
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_image;
+ mubuf->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
return;
mimg->unrm = true;
mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
mimg->disable_wqm = true;
- mimg->barrier = barrier_image;
+ mimg->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mimg));
return;
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
mimg->da = glsl_sampler_type_is_array(type);
- mimg->can_reorder = true;
Definition& def = mimg->definitions[0];
ctx->block->instructions.emplace_back(std::move(mimg));
allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
- nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
+ nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
+ get_memory_sync_info(instr, storage_buffer, 0));
}
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
+ memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
/* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
store->operands[0] = Operand(rsrc);
if (offsets[i]) {
- Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
- offset, Operand(offsets[i]));
+ Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+ offset, Operand(offsets[i]));
store->operands[1] = Operand(off);
} else {
store->operands[1] = Operand(offset);
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_buffer;
+ store->sync = sync;
ctx->block->instructions.emplace_back(std::move(store));
ctx->program->wb_smem_l1_on_end = true;
if (op == aco_opcode::p_fs_buffer_store_smem) {
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_buffer;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
}
mubuf->glc = return_previous;
mubuf->dlc = false; /* Not needed for atomics */
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
- info.barrier = barrier_buffer;
- info.can_reorder = false;
+ info.sync = get_memory_sync_info(instr, storage_buffer, 0);
/* VMEM stores don't update the SMEM cache and it's difficult to prove that
* it's safe to use SMEM */
bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
if (ctx->options->chip_class >= GFX7)
flat->dlc = false;
flat->offset = offset;
flat->disable_wqm = true;
- flat->barrier = barrier_buffer;
+ flat->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(flat));
} else {
mubuf->offset = offsets[i];
mubuf->addr64 = addr.type() == RegType::vgpr;
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
flat->dlc = false; /* Not needed for atomics */
flat->offset = 0;
flat->disable_wqm = true;
- flat->barrier = barrier_buffer;
+ flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(flat));
} else {
mubuf->offset = 0;
mubuf->addr64 = addr.type() == RegType::vgpr;
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
}
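+/* map NIR's nir_scope values onto the corresponding ACO sync_scope */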
+sync_scope translate_nir_scope(nir_scope scope)
+{
+ switch (scope) {
+ case NIR_SCOPE_NONE:
+ case NIR_SCOPE_INVOCATION:
+ return scope_invocation;
+ case NIR_SCOPE_SUBGROUP:
+ return scope_subgroup;
+ case NIR_SCOPE_WORKGROUP:
+ return scope_workgroup;
+ case NIR_SCOPE_QUEUE_FAMILY:
+ return scope_queuefamily;
+ case NIR_SCOPE_DEVICE:
+ return scope_device;
+ }
+ unreachable("invalid scope");
+}
+
void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
Builder bld(ctx->program, ctx->block);
+ storage_class all_mem = (storage_class)(storage_buffer | storage_image | storage_atomic_counter | storage_shared);
switch(instr->intrinsic) {
case nir_intrinsic_group_memory_barrier:
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(all_mem, semantic_acqrel, scope_workgroup));
+ break;
case nir_intrinsic_memory_barrier:
- bld.barrier(aco_opcode::p_memory_barrier_common);
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(all_mem, semantic_acqrel, scope_device));
break;
case nir_intrinsic_memory_barrier_buffer:
- bld.barrier(aco_opcode::p_memory_barrier_buffer);
- break;
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info((storage_class)storage_buffer, semantic_acqrel, scope_device));
+ break;
case nir_intrinsic_memory_barrier_image:
- bld.barrier(aco_opcode::p_memory_barrier_image);
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info((storage_class)storage_image, semantic_acqrel, scope_device));
break;
case nir_intrinsic_memory_barrier_tcs_patch:
case nir_intrinsic_memory_barrier_shared:
- bld.barrier(aco_opcode::p_memory_barrier_shared);
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup));
+ break;
+ case nir_intrinsic_scoped_barrier: {
+ unsigned semantics = 0;
+ unsigned storage = 0;
+ sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
+ sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
+
+ unsigned nir_storage = nir_intrinsic_memory_modes(instr);
+ if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))
+ storage |= storage_buffer | storage_image; //TODO: split this when NIR gets nir_var_mem_image
+ if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && (nir_storage & nir_var_mem_shared))
+ storage |= storage_shared;
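+ /* TCS outputs are (at least partially) kept in LDS, so shader_out barriers must also cover shared storage */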
+ if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL && (nir_storage & nir_var_shader_out))
+ storage |= storage_shared;
+
+ unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
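+ /* conservatively promote both acquire and release to full acqrel semantics */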
+ if (nir_semantics & NIR_MEMORY_ACQUIRE)
+ semantics |= semantic_acquire | semantic_release;
+ if (nir_semantics & NIR_MEMORY_RELEASE)
+ semantics |= semantic_acquire | semantic_release;
+
+ assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
+
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
+ exec_scope);
break;
+ }
default:
unreachable("Unimplemented memory barrier intrinsic");
break;
op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
num_operands = 4;
break;
+ case nir_intrinsic_shared_atomic_fadd:
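+ /* only a 32-bit DS float-add is wired up here; the 64-bit opcodes stay num_opcodes */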
+ op32 = aco_opcode::ds_add_f32;
+ op32_rtn = aco_opcode::ds_add_rtn_f32;
+ op64 = aco_opcode::num_opcodes;
+ op64_rtn = aco_opcode::num_opcodes;
+ break;
default:
unreachable("Unhandled shared atomic intrinsic");
}
ds->offset0 = offset;
if (return_previous)
ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
+ ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
ctx->block->instructions.emplace_back(std::move(ds));
}
scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
- S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
+ S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
if (ctx->program->chip_class >= GFX10) {
rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
}
- /* older generations need element size = 16 bytes. element size removed in GFX9 */
+ /* older generations need element size = 4 bytes. element size removed in GFX9 */
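+ /* (ELEMENT_SIZE(n) is assumed to encode an element of 2<<n bytes: 1 -> 4, 3 -> 16) */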
if (ctx->program->chip_class <= GFX8)
- rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
+ rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
}
instr->dest.ssa.bit_size / 8u, rsrc};
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
- info.swizzle_component_size = 16;
- info.can_reorder = false;
+ info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
+ info.sync = memory_sync_info(storage_scratch, semantic_private);
info.soffset = ctx->program->scratch_offset;
- emit_mubuf_load(ctx, bld, &info);
+ emit_scratch_load(ctx, bld, &info);
}
void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
+ unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
- 16, &write_count, write_datas, offsets);
+ swizzle_component_size, &write_count, write_datas, offsets);
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
- bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
+ Instruction *mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
+ static_cast<MUBUF_instruction *>(mubuf)->sync = memory_sync_info(storage_scratch, semantic_private);
}
}
mtbuf->offset = const_offset;
mtbuf->glc = true;
mtbuf->slc = true;
- mtbuf->barrier = barrier_gs_data;
- mtbuf->can_reorder = true;
+ mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
bld.insert(std::move(mtbuf));
}
Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
Temp private_segment_buffer = ctx->program->private_segment_buffer;
+ //TODO: bounds checking?
if (addr.type() == RegType::sgpr) {
Operand offset;
if (const_addr) {
load->glc = false;
load->dlc = false;
load->disable_wqm = false;
- load->barrier = barrier_none;
- load->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(load));
}
case nir_intrinsic_shared_atomic_xor:
case nir_intrinsic_shared_atomic_exchange:
case nir_intrinsic_shared_atomic_comp_swap:
+ case nir_intrinsic_shared_atomic_fadd:
visit_shared_atomic(ctx, instr);
break;
case nir_intrinsic_image_deref_load:
visit_get_buffer_size(ctx, instr);
break;
case nir_intrinsic_control_barrier: {
- if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
- /* GFX6 only (thanks to a hw bug workaround):
- * The real barrier instruction isn’t needed, because an entire patch
- * always fits into a single wave.
- */
- break;
- }
-
- if (ctx->program->workgroup_size > ctx->program->wave_size)
- bld.sopp(aco_opcode::s_barrier);
-
+ bld.barrier(aco_opcode::p_barrier, memory_sync_info(0, 0, scope_invocation), scope_workgroup);
break;
}
case nir_intrinsic_memory_barrier_tcs_patch:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier_shared:
+ case nir_intrinsic_scoped_barrier:
emit_memory_barrier(ctx, instr);
break;
case nir_intrinsic_load_num_work_groups: {
aco_opcode opcode =
nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
aco_opcode::s_memrealtime : aco_opcode::s_memtime;
- bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
+ bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), memory_sync_info(0, semantic_volatile));
emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
break;
}
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
tex->dim = dim;
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (div_by_6) {
tex->da = da;
Temp size = bld.tmp(v2);
tex->definitions[0] = Definition(size);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
emit_split_vector(ctx, size, size.size());
mubuf->operands[2] = Operand((uint32_t) 0);
mubuf->definitions[0] = Definition(tmp_dst);
mubuf->idxen = true;
- mubuf->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
tex->unrm = true;
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (instr->op == nir_texop_samples_identical) {
tex->dmask = dmask;
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (tg4_integer_cube_workaround) {
create_null_export(ctx);
}
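+/* a control barrier combined with LDS acquire/release at workgroup scope */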
+static void create_workgroup_barrier(Builder& bld)
+{
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup),
+ scope_workgroup);
+}
+
static void write_tcs_tess_factors(isel_context *ctx)
{
unsigned outer_comps;
Builder bld(ctx->program, ctx->block);
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
bld.mubuf(aco_opcode::buffer_store_dword,
/* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
- /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
- /* disable_wqm */ false, /* glc */ true);
+ /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen */ false,
+ /* addr64 */ false, /* disable_wqm */ false, /* glc */ true);
tf_const_offset += 4;
begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
assert(stride == 2 || stride == 4 || stride == 6);
Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
- store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
+ store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, memory_sync_info());
/* Store to offchip for TES to read - only if TES reads them */
if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc);
- store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
+ store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, memory_sync_info(storage_vmem_output));
if (likely(inner_comps)) {
std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc);
- store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
+ store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, memory_sync_info(storage_vmem_output));
}
}
store->glc = true;
store->dlc = false;
store->slc = true;
- store->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(store));
}
}
if (ctx->stage == ngg_vertex_gs) {
/* Wait for GS threads to store primitive ID in LDS. */
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
/* Calculate LDS address where the GS threads stored the primitive ID. */
Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
if (i) {
Builder bld(ctx.program, ctx.block);
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
ngg_emit_nogs_output(&ctx);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
Builder bld(ctx.program, ctx.block);
- bld.barrier(aco_opcode::p_memory_barrier_gs_data);
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_vmem_output, semantic_release, scope_device));
bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
write_tcs_tess_factors(&ctx);
ctx.block->kind |= block_kind_uniform;
Builder bld(ctx.program, ctx.block);
if (ctx.program->wb_smem_l1_on_end)
- bld.smem(aco_opcode::s_dcache_wb, false);
+ bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
bld.sopp(aco_opcode::s_endpgm);
cleanup_cfg(program);
{
isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
- program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
- program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
- program->next_fp_mode.must_flush_denorms32 = false;
- program->next_fp_mode.must_flush_denorms16_64 = false;
- program->next_fp_mode.care_about_round32 = false;
- program->next_fp_mode.care_about_round16_64 = false;
- program->next_fp_mode.denorm16_64 = fp_denorm_keep;
- program->next_fp_mode.denorm32 = 0;
- program->next_fp_mode.round32 = fp_round_ne;
- program->next_fp_mode.round16_64 = fp_round_ne;
ctx.block->fp_mode = program->next_fp_mode;
add_startpgm(&ctx);
mubuf->glc = true;
mubuf->slc = true;
mubuf->dlc = args->options->chip_class >= GFX10;
- mubuf->barrier = barrier_none;
- mubuf->can_reorder = true;
ctx.outputs.mask[i] |= 1 << j;
ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();