void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
- bool glc=false, bool readonly=true)
+ bool glc=false, bool readonly=true, bool allow_smem=true)
{
Builder bld(ctx->program, ctx->block);
- bool use_smem = dst.type() != RegType::vgpr && ((ctx->options->chip_class >= GFX8 && component_size >= 4) || readonly);
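+ /* SMEM results land in SGPRs, so SMEM is only viable for a uniform
+  * destination; writable buffers additionally need GFX8+ and dword-sized
+  * components, and must pass the allow_smem aliasing check. */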
+ bool use_smem = dst.type() != RegType::vgpr && ((ctx->options->chip_class >= GFX8 && component_size >= 4) || readonly) && allow_smem;
if (use_smem)
offset = bld.as_uniform(offset);
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
- bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
+ unsigned access = nir_intrinsic_access(instr);
+ bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
unsigned size = instr->dest.ssa.bit_size / 8;
+
+ uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access);
+ /* GLC bypasses the VMEM/SMEM caches, so GLC SMEM loads/stores are coherent
+  * with GLC VMEM loads/stores.
+  * TODO: this relaxation (the "0 &&" below) is disabled for now because we
+  * still need to ensure correct ordering.
+  */
+ bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store));
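+ /* A restrict buffer that is never written, or an access that may be freely
+  * reordered, cannot be affected by any aliasing VMEM store, so SMEM remains
+  * safe regardless of the flags above. */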
+ allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
+
load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
- nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false);
+ nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
}
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
+ bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+ uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
+ /* GLC bypasses the VMEM/SMEM caches, so GLC SMEM loads/stores are coherent
+  * with GLC VMEM loads/stores.
+  * TODO: this relaxation (the "0 &&" below) is disabled for now because we
+  * still need to ensure correct ordering.
+  */
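+ /* Unlike loads, SMEM stores must also stay ordered against aliasing VMEM
+  * loads, hence the wider has_vmem_loadstore mask. */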
+ bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
+
bool smem = !nir_src_is_divergent(instr->src[2]) &&
ctx->options->chip_class >= GFX8 &&
- elem_size_bytes >= 4;
+ elem_size_bytes >= 4 &&
+ allow_smem;
if (smem)
offset = bld.as_uniform(offset);
bool smem_nonfs = smem && ctx->stage != fragment_fs;
if (op != aco_opcode::p_fs_buffer_store_smem)
store->operands[1].setFixed(m0);
store->operands[2] = Operand(write_datas[i]);
- store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+ store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->barrier = barrier_buffer;
store->operands[3] = Operand(write_datas[i]);
store->offset = offsets[i];
store->offen = (offset.type() == RegType::vgpr);
- store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+ store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->barrier = barrier_buffer;
}
};
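+/* Per-binding summary of how a buffer resource is accessed across the shader,
+ * gathered by fill_desc_set_info() and used to decide whether SMEM access to
+ * the buffer is safe. */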
+enum resource_flags {
+ has_glc_vmem_load = 0x1,
+ has_nonglc_vmem_load = 0x2,
+ has_glc_vmem_store = 0x4,
+ has_nonglc_vmem_store = 0x8,
+
+ has_vmem_store = has_glc_vmem_store | has_nonglc_vmem_store,
+ has_vmem_loadstore = has_vmem_store | has_glc_vmem_load | has_nonglc_vmem_load,
+ has_nonglc_vmem_loadstore = has_nonglc_vmem_load | has_nonglc_vmem_store,
+
+ buffer_is_restrict = 0x10,
+};
+
struct isel_context {
const struct radv_nir_compiler_options *options;
struct radv_shader_args *args;
std::unique_ptr<unsigned[]> nir_to_aco; /* NIR block index to ACO block index */
} cf_info;
+ uint32_t resource_flag_offsets[MAX_SETS];
+ std::vector<uint8_t> buffer_resource_flags;
+
Temp arg_temps[AC_MAX_ARGS];
/* FS inputs */
return progress;
}
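+/* Returns the span of buffer_resource_flags entries the given resource may
+ * alias: the binding's own entry for restrict buffers, the shared flags[0]
+ * entry for non-restrict buffers, the whole array if the resource is unknown
+ * (e.g. global memory), or nothing for non-buffer images. */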
+void get_buffer_resource_flags(isel_context *ctx, nir_ssa_def *def, unsigned access,
+ uint8_t **flags, uint32_t *count)
+{
+ int desc_set = -1;
+ unsigned binding = 0;
+
+ if (!def) {
+ /* Global resources are conservatively assumed to alias all other buffers
+  * and buffer images. */
+ // TODO: only merge flags of resources which can really alias.
+ } else if (def->parent_instr->type == nir_instr_type_intrinsic) {
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
+ if (intrin->intrinsic == nir_intrinsic_vulkan_resource_index) {
+ desc_set = nir_intrinsic_desc_set(intrin);
+ binding = nir_intrinsic_binding(intrin);
+ }
+ } else if (def->parent_instr->type == nir_instr_type_deref) {
+ nir_deref_instr *deref = nir_instr_as_deref(def->parent_instr);
+ assert(deref->type->is_image());
+ if (deref->type->sampler_dimensionality != GLSL_SAMPLER_DIM_BUF) {
+ *flags = NULL;
+ *count = 0;
+ return;
+ }
+
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+ desc_set = var->data.descriptor_set;
+ binding = var->data.binding;
+ }
+
+ if (desc_set < 0) {
+ *flags = ctx->buffer_resource_flags.data();
+ *count = ctx->buffer_resource_flags.size();
+ return;
+ }
+
+ unsigned set_offset = ctx->resource_flag_offsets[desc_set];
+
+ if (!(ctx->buffer_resource_flags[set_offset + binding] & buffer_is_restrict)) {
+ /* Non-restrict buffers may alias any other non-restrict buffer, so they
+  * all share the reserved flags[0] entry. */
+ *flags = ctx->buffer_resource_flags.data();
+ *count = 1;
+ return;
+ }
+
+ *flags = ctx->buffer_resource_flags.data() + set_offset + binding;
+ *count = 1;
+}
+
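+/* ORs together the flags of every entry the resource might alias. */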
+uint8_t get_all_buffer_resource_flags(isel_context *ctx, nir_ssa_def *def, unsigned access)
+{
+ uint8_t *flags;
+ uint32_t count;
+ get_buffer_resource_flags(ctx, def, access, &flags, &count);
+
+ uint8_t res = 0;
+ for (unsigned i = 0; i < count; i++)
+ res |= flags[i];
+ return res;
+}
+
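+/* Pre-pass over the shader: reserve one flags entry per descriptor binding,
+ * mark bindings declared restrict, then record which kinds of VMEM accesses
+ * each resource may see. */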
+void fill_desc_set_info(isel_context *ctx, nir_function_impl *impl)
+{
+ radv_pipeline_layout *pipeline_layout = ctx->options->layout;
+
+ unsigned resource_flag_count = 1; /* start at 1: flags[0] is reserved for resources that may alias */
+ for (unsigned i = 0; i < pipeline_layout->num_sets; i++) {
+ radv_descriptor_set_layout *layout = pipeline_layout->set[i].layout;
+ ctx->resource_flag_offsets[i] = resource_flag_count;
+ resource_flag_count += layout->binding_count;
+ }
+ ctx->buffer_resource_flags = std::vector<uint8_t>(resource_flag_count);
+
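+ /* Mark bindings whose SSBO variable is declared restrict. */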
+ nir_foreach_variable(var, &impl->function->shader->uniforms) {
+ if (var->data.mode == nir_var_mem_ssbo && (var->data.access & ACCESS_RESTRICT)) {
+ uint32_t offset = ctx->resource_flag_offsets[var->data.descriptor_set];
+ ctx->buffer_resource_flags[offset + var->data.binding] |= buffer_is_restrict;
+ }
+ }
+
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (!(nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_ACCESS]))
+ continue;
+
+ nir_ssa_def *res = NULL;
+ unsigned access = nir_intrinsic_access(intrin);
+ unsigned flags = 0;
+ bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_load_ssbo: {
+ unsigned elem_size = intrin->dest.ssa.bit_size;
+ if (nir_dest_is_divergent(intrin->dest) || ctx->program->chip_class < GFX8 || elem_size < 32)
+ flags |= glc ? has_glc_vmem_load : has_nonglc_vmem_load;
+ res = intrin->src[0].ssa;
+ break;
+ }
+ case nir_intrinsic_ssbo_atomic_add:
+ case nir_intrinsic_ssbo_atomic_imin:
+ case nir_intrinsic_ssbo_atomic_umin:
+ case nir_intrinsic_ssbo_atomic_imax:
+ case nir_intrinsic_ssbo_atomic_umax:
+ case nir_intrinsic_ssbo_atomic_and:
+ case nir_intrinsic_ssbo_atomic_or:
+ case nir_intrinsic_ssbo_atomic_xor:
+ case nir_intrinsic_ssbo_atomic_exchange:
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ flags |= has_glc_vmem_load | has_glc_vmem_store;
+ res = intrin->src[0].ssa;
+ break;
+ case nir_intrinsic_store_ssbo: {
+ unsigned elem_size = intrin->src[0].ssa->bit_size;
+ if (nir_src_is_divergent(intrin->src[2]) || ctx->program->chip_class < GFX8 || elem_size < 32)
+ flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
+ res = intrin->src[1].ssa;
+ break;
+ }
+ case nir_intrinsic_load_global:
+ if (!(access & ACCESS_NON_WRITEABLE))
+ flags |= glc ? has_glc_vmem_load : has_nonglc_vmem_load;
+ break;
+ case nir_intrinsic_store_global:
+ flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
+ break;
+ case nir_intrinsic_global_atomic_add:
+ case nir_intrinsic_global_atomic_imin:
+ case nir_intrinsic_global_atomic_umin:
+ case nir_intrinsic_global_atomic_imax:
+ case nir_intrinsic_global_atomic_umax:
+ case nir_intrinsic_global_atomic_and:
+ case nir_intrinsic_global_atomic_or:
+ case nir_intrinsic_global_atomic_xor:
+ case nir_intrinsic_global_atomic_exchange:
+ case nir_intrinsic_global_atomic_comp_swap:
+ flags |= has_glc_vmem_load | has_glc_vmem_store;
+ break;
+ case nir_intrinsic_image_deref_load:
+ res = intrin->src[0].ssa;
+ flags |= glc ? has_glc_vmem_load : has_nonglc_vmem_load;
+ break;
+ case nir_intrinsic_image_deref_store:
+ res = intrin->src[0].ssa;
+ flags |= (glc || ctx->program->chip_class == GFX6) ? has_glc_vmem_store : has_nonglc_vmem_store;
+ break;
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_umin:
+ case nir_intrinsic_image_deref_atomic_imin:
+ case nir_intrinsic_image_deref_atomic_umax:
+ case nir_intrinsic_image_deref_atomic_imax:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ res = intrin->src[0].ssa;
+ flags |= has_glc_vmem_load | has_glc_vmem_store;
+ break;
+ default:
+ continue;
+ }
+
+ uint8_t *flags_ptr;
+ uint32_t count;
+ get_buffer_resource_flags(ctx, res, access, &flags_ptr, &count);
+
+ for (unsigned i = 0; i < count; i++)
+ flags_ptr[i] |= flags;
+ }
+ }
+}
+
RegClass get_reg_class(isel_context *ctx, RegType type, unsigned components, unsigned bitsize)
{
if (bitsize == 1)
ctx->shader = shader;
nir_divergence_analysis(shader, nir_divergence_view_index_uniform);
+ fill_desc_set_info(ctx, impl);
+
/* sanitize control flow */
nir_metadata_require(impl, nir_metadata_dominance);
sanitize_cf_list(impl, &impl->body);
std::unique_ptr<unsigned[]> nir_to_aco{new unsigned[impl->num_blocks]()};
+ /* TODO: make this recursive to improve compile times and merge with fill_desc_set_info() */
bool done = false;
while (!done) {
done = true;