}
void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
- unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false,
- bool swizzled = false)
+ unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(),
+ bool slc = false, bool swizzled = false)
{
assert(vdata.id());
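+ /* Hypothetical clarifying comment: GFX6 has no buffer_store_dwordx3, so 96-bit stores must be split by the caller. */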
assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
/* idxen */ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
/* dlc */ false, /* slc */ slc);
- if (!allow_reorder)
- static_cast<MUBUF_instruction *>(r.instr)->sync = memory_sync_info(storage_buffer, semantic_private);
+ static_cast<MUBUF_instruction *>(r.instr)->sync = sync;
}
void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
- bool allow_combining = true, bool reorder = true, bool slc = false)
+ bool allow_combining = true, memory_sync_info sync = memory_sync_info(), bool slc = false)
{
Builder bld(ctx->program, ctx->block);
assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
for (unsigned i = 0; i < write_count; i++) {
unsigned const_offset = offsets[i] + base_const_offset;
- emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc, !allow_combining);
+ emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync, slc, !allow_combining);
}
}
/* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
- store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
+ store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, memory_sync_info(), true);
} else {
Temp lds_base;
Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
- store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false);
+ store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, memory_sync_info(storage_vmem_output));
}
if (write_to_lds) {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
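+ /* Access qualifiers may be set on the dereferenced variable, on the intrinsic itself, or both; combine them. */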
+ unsigned access = var->data.access | nir_intrinsic_access(instr);
if (dim == GLSL_SAMPLER_DIM_BUF) {
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
load->definitions[0] = Definition(tmp);
load->idxen = true;
- load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
+ load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
load->sync = sync;
ctx->block->instructions.emplace_back(std::move(load));
load->operands[1] = Operand(s4); /* no sampler */
load->operands[2] = Operand(coords);
load->definitions[0] = Definition(tmp);
- load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
+ load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
load->dmask = dmask;
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
- bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
+ unsigned access = var->data.access | nir_intrinsic_access(instr);
+ bool glc = ctx->options->chip_class == GFX6 || (access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE));
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
}
}
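+/* Translate a NIR scope to the ACO sync_scope used by memory_sync_info. */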
+sync_scope translate_nir_scope(nir_scope scope)
+{
+ switch (scope) {
+ case NIR_SCOPE_NONE:
+ case NIR_SCOPE_INVOCATION:
+ return scope_invocation;
+ case NIR_SCOPE_SUBGROUP:
+ return scope_subgroup;
+ case NIR_SCOPE_WORKGROUP:
+ return scope_workgroup;
+ case NIR_SCOPE_QUEUE_FAMILY:
+ return scope_queuefamily;
+ case NIR_SCOPE_DEVICE:
+ return scope_device;
+ }
+ unreachable("invalid scope");
+}
+
void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
Builder bld(ctx->program, ctx->block);
storage_class all_mem = (storage_class)(storage_buffer | storage_image | storage_atomic_counter | storage_shared);
memory_sync_info(all_mem, semantic_acqrel, scope_device));
break;
case nir_intrinsic_memory_barrier_buffer:
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
+ break;
case nir_intrinsic_memory_barrier_image:
- /* since NIR splits barriers, we have to unify buffer and image barriers
- * for now so dEQP-VK.memory_model.message_passing.core11.u32.coherent.
- * fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
- * passes
- */
bld.barrier(aco_opcode::p_barrier,
- memory_sync_info((storage_class)(storage_buffer | storage_image), semantic_acqrel, scope_device));
+ memory_sync_info(storage_image, semantic_acqrel, scope_device));
break;
case nir_intrinsic_memory_barrier_tcs_patch:
case nir_intrinsic_memory_barrier_shared:
bld.barrier(aco_opcode::p_barrier,
memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup));
break;
+ case nir_intrinsic_scoped_barrier: {
+ unsigned semantics = 0;
+ unsigned storage = 0;
+ sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
+ sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
+
+ unsigned nir_storage = nir_intrinsic_memory_modes(instr);
+ if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))
+ storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image
+ if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && (nir_storage & nir_var_mem_shared))
+ storage |= storage_shared;
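+ /* TCS output barriers map to shared storage: outputs of a patch are read back by other invocations through LDS. */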
+ if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL && (nir_storage & nir_var_shader_out))
+ storage |= storage_shared;
+
+ unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
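+ /* For now, treat both acquire and release conservatively as acq+rel. */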
+ if (nir_semantics & NIR_MEMORY_ACQUIRE)
+ semantics |= semantic_acquire | semantic_release;
+ if (nir_semantics & NIR_MEMORY_RELEASE)
+ semantics |= semantic_acquire | semantic_release;
+
+ assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
+
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
+ exec_scope);
+ break;
+ }
default:
unreachable("Unimplemented memory barrier intrinsic");
break;
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
- info.sync = memory_sync_info(storage_buffer, semantic_private);
+ info.sync = memory_sync_info(storage_scratch, semantic_private);
info.soffset = ctx->program->scratch_offset;
emit_scratch_load(ctx, bld, &info);
}
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
- static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_buffer, semantic_private);
+ static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_scratch, semantic_private);
}
}
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier_shared:
+ case nir_intrinsic_scoped_barrier:
emit_memory_barrier(ctx, instr);
break;
case nir_intrinsic_load_num_work_groups: {
assert(stride == 2 || stride == 4 || stride == 6);
Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
- store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
+ store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, memory_sync_info());
/* Store to offchip for TES to read - only if TES reads them */
if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc);
- store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
+ store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, memory_sync_info(storage_vmem_output));
if (likely(inner_comps)) {
std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc);
- store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
+ store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, memory_sync_info(storage_vmem_output));
}
}