From: Rhys Perry
Date: Fri, 22 May 2020 16:55:15 +0000 (+0100)
Subject: aco: only use SMEM if we can prove it's safe
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=c702f8ed1572ab32cc3904a6d92cc5126c7c6080;p=mesa.git

aco: only use SMEM if we can prove it's safe

Totals from 26 (0.02% of 127638) affected shaders:
SGPRs: 1680 -> 1664 (-0.95%)
VGPRs: 1492 -> 1504 (+0.80%)
CodeSize: 233140 -> 233016 (-0.05%); split: -0.09%, +0.04%
Instrs: 47121 -> 47114 (-0.01%); split: -0.08%, +0.06%
VMEM: 4930 -> 4655 (-5.58%); split: +0.12%, -5.70%
SMEM: 2030 -> 2001 (-1.43%); split: +3.79%, -5.22%
VClause: 891 -> 947 (+6.29%)
SClause: 876 -> 816 (-6.85%)
Copies: 4734 -> 4716 (-0.38%); split: -0.40%, +0.02%
Branches: 2048 -> 2047 (-0.05%)
PreSGPRs: 1400 -> 1396 (-0.29%)
PreVGPRs: 1440 -> 1443 (+0.21%)

Signed-off-by: Rhys Perry
Reviewed-by: Daniel Schürmann
Part-of:
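
Background for the change (an illustrative sketch, not part of the patch;
can_use_smem_load is a hypothetical helper, while the flag and access names
mirror the ones the patch introduces): SMEM loads go through the scalar
cache, which is not coherent with the vector caches that VMEM stores write
through, so a buffer may only be read through SMEM if no VMEM store in the
same shader can alias it, or if the access flags prove the load cannot
observe such a store.

   bool can_use_smem_load(uint32_t resource_flags, unsigned access)
   {
      /* no potentially-aliasing VMEM store was recorded for this buffer... */
      bool allow = !(resource_flags & has_vmem_store);
      /* ...or the access flags themselves rule out an aliasing write */
      allow |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) ||
               (access & ACCESS_CAN_REORDER);
      return allow;
   }
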
---

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 58e34688310..83f45d6ed8d 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -5141,11 +5141,11 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
 
 void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
                  Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
-                 bool glc=false, bool readonly=true)
+                 bool glc=false, bool readonly=true, bool allow_smem=true)
 {
    Builder bld(ctx->program, ctx->block);
 
-   bool use_smem = dst.type() != RegType::vgpr && ((ctx->options->chip_class >= GFX8 && component_size >= 4) || readonly);
+   bool use_smem = dst.type() != RegType::vgpr && ((ctx->options->chip_class >= GFX8 && component_size >= 4) || readonly) && allow_smem;
    if (use_smem)
       offset = bld.as_uniform(offset);
@@ -6140,10 +6140,19 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
 
-   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
+   unsigned access = nir_intrinsic_access(instr);
+   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
    unsigned size = instr->dest.ssa.bit_size / 8;
+
+   uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access);
+   /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
+    * TODO: this optimization is disabled for now because we still need to ensure correct ordering
+    */
+   bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store));
+   allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
+
    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
-               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false);
+               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
 }
 
 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
@@ -6157,9 +6166,17 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
 
+   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+   uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
+   /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
+    * TODO: this optimization is disabled for now because we still need to ensure correct ordering
+    */
+   bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
+
    bool smem = !nir_src_is_divergent(instr->src[2]) &&
                ctx->options->chip_class >= GFX8 &&
-               elem_size_bytes >= 4;
+               elem_size_bytes >= 4 &&
+               allow_smem;
    if (smem)
       offset = bld.as_uniform(offset);
    bool smem_nonfs = smem && ctx->stage != fragment_fs;
@@ -6188,7 +6205,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
          if (op != aco_opcode::p_fs_buffer_store_smem)
             store->operands[1].setFixed(m0);
          store->operands[2] = Operand(write_datas[i]);
-         store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+         store->glc = glc;
         store->dlc = false;
         store->disable_wqm = true;
         store->barrier = barrier_buffer;
@@ -6206,7 +6223,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
         store->operands[3] = Operand(write_datas[i]);
         store->offset = offsets[i];
         store->offen = (offset.type() == RegType::vgpr);
-        store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+        store->glc = glc;
         store->dlc = false;
         store->disable_wqm = true;
         store->barrier = barrier_buffer;
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 1dbf5b700b5..f82cf78fbcc 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -50,6 +50,19 @@ struct shader_io_state {
    }
 };
 
+enum resource_flags {
+   has_glc_vmem_load = 0x1,
+   has_nonglc_vmem_load = 0x2,
+   has_glc_vmem_store = 0x4,
+   has_nonglc_vmem_store = 0x8,
+
+   has_vmem_store = has_glc_vmem_store | has_nonglc_vmem_store,
+   has_vmem_loadstore = has_vmem_store | has_glc_vmem_load | has_nonglc_vmem_load,
+   has_nonglc_vmem_loadstore = has_nonglc_vmem_load | has_nonglc_vmem_store,
+
+   buffer_is_restrict = 0x10,
+};
+
 struct isel_context {
    const struct radv_nir_compiler_options *options;
    struct radv_shader_args *args;
@@ -82,6 +95,9 @@ struct isel_context {
       std::unique_ptr<unsigned[]> nir_to_aco; /* NIR block index to ACO block index */
    } cf_info;
 
+   uint32_t resource_flag_offsets[MAX_SETS];
+   std::vector<uint8_t> buffer_resource_flags;
+
    Temp arg_temps[AC_MAX_ARGS];
 
    /* FS inputs */
@@ -223,6 +239,181 @@ sanitize_cf_list(nir_function_impl *impl, struct exec_list *cf_list)
    return progress;
 }
 
+void get_buffer_resource_flags(isel_context *ctx, nir_ssa_def *def, unsigned access,
+                               uint8_t **flags, uint32_t *count)
+{
+   int desc_set = -1;
+   unsigned binding = 0;
+
+   if (!def) {
+      /* global resources are considered aliasing with all other buffers and
+       * buffer images */
+      // TODO: only merge flags of resources which can really alias.
+   } else if (def->parent_instr->type == nir_instr_type_intrinsic) {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
+      if (intrin->intrinsic == nir_intrinsic_vulkan_resource_index) {
+         desc_set = nir_intrinsic_desc_set(intrin);
+         binding = nir_intrinsic_binding(intrin);
+      }
+   } else if (def->parent_instr->type == nir_instr_type_deref) {
+      nir_deref_instr *deref = nir_instr_as_deref(def->parent_instr);
+      assert(deref->type->is_image());
+      if (deref->type->sampler_dimensionality != GLSL_SAMPLER_DIM_BUF) {
+         *flags = NULL;
+         *count = 0;
+         return;
+      }
+
+      nir_variable *var = nir_deref_instr_get_variable(deref);
+      desc_set = var->data.descriptor_set;
+      binding = var->data.binding;
+   }
+
+   if (desc_set < 0) {
+      *flags = ctx->buffer_resource_flags.data();
+      *count = ctx->buffer_resource_flags.size();
+      return;
+   }
+
+   unsigned set_offset = ctx->resource_flag_offsets[desc_set];
+
+   if (!(ctx->buffer_resource_flags[set_offset + binding] & buffer_is_restrict)) {
+      /* Non-restrict buffers alias only with other non-restrict buffers.
+       * We reserve flags[0] for these. */
+      *flags = ctx->buffer_resource_flags.data();
+      *count = 1;
+      return;
+   }
+
+   *flags = ctx->buffer_resource_flags.data() + set_offset + binding;
+   *count = 1;
+}
+
+uint8_t get_all_buffer_resource_flags(isel_context *ctx, nir_ssa_def *def, unsigned access)
+{
+   uint8_t *flags;
+   uint32_t count;
+   get_buffer_resource_flags(ctx, def, access, &flags, &count);
+
+   uint8_t res = 0;
+   for (unsigned i = 0; i < count; i++)
+      res |= flags[i];
+   return res;
+}
+
+void fill_desc_set_info(isel_context *ctx, nir_function_impl *impl)
+{
+   radv_pipeline_layout *pipeline_layout = ctx->options->layout;
+
+   unsigned resource_flag_count = 1; /* +1 to reserve flags[0] for aliased resources */
+   for (unsigned i = 0; i < pipeline_layout->num_sets; i++) {
+      radv_descriptor_set_layout *layout = pipeline_layout->set[i].layout;
+      ctx->resource_flag_offsets[i] = resource_flag_count;
+      resource_flag_count += layout->binding_count;
+   }
+   ctx->buffer_resource_flags = std::vector<uint8_t>(resource_flag_count);
+
+   nir_foreach_variable(var, &impl->function->shader->uniforms) {
+      if (var->data.mode == nir_var_mem_ssbo && (var->data.access & ACCESS_RESTRICT)) {
+         uint32_t offset = ctx->resource_flag_offsets[var->data.descriptor_set];
+         ctx->buffer_resource_flags[offset + var->data.binding] |= buffer_is_restrict;
+      }
+   }
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         if (!(nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_ACCESS]))
+            continue;
+
+         nir_ssa_def *res = NULL;
+         unsigned access = nir_intrinsic_access(intrin);
+         unsigned flags = 0;
+         bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_load_ssbo: {
+            unsigned elem_size = intrin->dest.ssa.bit_size;
+            if (nir_dest_is_divergent(intrin->dest) || ctx->program->chip_class < GFX8 || elem_size < 32)
+               flags |= glc ? has_glc_vmem_load : has_nonglc_vmem_load;
+            res = intrin->src[0].ssa;
+            break;
+         }
+         case nir_intrinsic_ssbo_atomic_add:
+         case nir_intrinsic_ssbo_atomic_imin:
+         case nir_intrinsic_ssbo_atomic_umin:
+         case nir_intrinsic_ssbo_atomic_imax:
+         case nir_intrinsic_ssbo_atomic_umax:
+         case nir_intrinsic_ssbo_atomic_and:
+         case nir_intrinsic_ssbo_atomic_or:
+         case nir_intrinsic_ssbo_atomic_xor:
+         case nir_intrinsic_ssbo_atomic_exchange:
+         case nir_intrinsic_ssbo_atomic_comp_swap:
+            flags |= has_glc_vmem_load | has_glc_vmem_store;
+            res = intrin->src[0].ssa;
+            break;
+         case nir_intrinsic_store_ssbo: {
+            unsigned elem_size = intrin->src[0].ssa->bit_size;
+            if (nir_src_is_divergent(intrin->src[2]) || ctx->program->chip_class < GFX8 || elem_size < 32)
+               flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
+            res = intrin->src[1].ssa;
+            break;
+         }
+         case nir_intrinsic_load_global:
+            if (!(access & ACCESS_NON_WRITEABLE))
+               flags |= glc ? has_glc_vmem_load : has_nonglc_vmem_load;
+            break;
+         case nir_intrinsic_store_global:
+            flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
+            break;
+         case nir_intrinsic_global_atomic_add:
+         case nir_intrinsic_global_atomic_imin:
+         case nir_intrinsic_global_atomic_umin:
+         case nir_intrinsic_global_atomic_imax:
+         case nir_intrinsic_global_atomic_umax:
+         case nir_intrinsic_global_atomic_and:
+         case nir_intrinsic_global_atomic_or:
+         case nir_intrinsic_global_atomic_xor:
+         case nir_intrinsic_global_atomic_exchange:
+         case nir_intrinsic_global_atomic_comp_swap:
+            flags |= has_glc_vmem_load | has_glc_vmem_store;
+            break;
+         case nir_intrinsic_image_deref_load:
+            res = intrin->src[0].ssa;
+            flags |= glc ? has_glc_vmem_load : has_nonglc_vmem_load;
+            break;
+         case nir_intrinsic_image_deref_store:
+            res = intrin->src[0].ssa;
+            flags |= (glc || ctx->program->chip_class == GFX6) ? has_glc_vmem_store : has_nonglc_vmem_store;
+            break;
+         case nir_intrinsic_image_deref_atomic_add:
+         case nir_intrinsic_image_deref_atomic_umin:
+         case nir_intrinsic_image_deref_atomic_imin:
+         case nir_intrinsic_image_deref_atomic_umax:
+         case nir_intrinsic_image_deref_atomic_imax:
+         case nir_intrinsic_image_deref_atomic_and:
+         case nir_intrinsic_image_deref_atomic_or:
+         case nir_intrinsic_image_deref_atomic_xor:
+         case nir_intrinsic_image_deref_atomic_exchange:
+         case nir_intrinsic_image_deref_atomic_comp_swap:
+            res = intrin->src[0].ssa;
+            flags |= has_glc_vmem_load | has_glc_vmem_store;
+            break;
+         default:
+            continue;
+         }
+
+         uint8_t *flags_ptr;
+         uint32_t count;
+         get_buffer_resource_flags(ctx, res, access, &flags_ptr, &count);
+
+         for (unsigned i = 0; i < count; i++)
+            flags_ptr[i] |= flags;
+      }
+   }
+}
+
 RegClass get_reg_class(isel_context *ctx, RegType type, unsigned components, unsigned bitsize)
 {
    if (bitsize == 1)
@@ -239,6 +430,8 @@ void init_context(isel_context *ctx, nir_shader *shader)
    ctx->shader = shader;
    nir_divergence_analysis(shader, nir_divergence_view_index_uniform);
 
+   fill_desc_set_info(ctx, impl);
+
    /* sanitize control flow */
    nir_metadata_require(impl, nir_metadata_dominance);
    sanitize_cf_list(impl, &impl->body);
@@ -258,6 +451,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
 
    std::unique_ptr<unsigned[]> nir_to_aco{new unsigned[impl->num_blocks]()};
 
+   /* TODO: make this recursive to improve compile times and merge with fill_desc_set_info() */
    bool done = false;
    while (!done) {
       done = true;
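
For reference, a self-contained toy model (illustrative only; alias_model and
its members are invented names, not driver code) of the aliasing scheme used
by get_buffer_resource_flags(): slot 0 stands in for all non-restrict buffers,
which may alias one another, while each restrict binding owns a private slot,
so a VMEM store through a restrict buffer does not revoke SMEM eligibility for
unrelated buffers.

   #include <cstdint>
   #include <vector>

   struct alias_model {
      /* glc + non-glc store bits, as in resource_flags above */
      static constexpr uint8_t has_vmem_store = 0x4 | 0x8;

      /* slot 0 is shared by every non-restrict buffer */
      std::vector<uint8_t> slots{0};

      /* give a restrict binding a private slot and return its index */
      unsigned add_restrict_binding() { slots.push_back(0); return slots.size() - 1; }

      /* record a VMEM store; non-restrict buffers all map to slot 0 */
      void note_vmem_store(unsigned slot) { slots[slot] |= has_vmem_store; }

      /* an SMEM load is safe only if no aliasing VMEM store was seen */
      bool smem_allowed(unsigned slot) const { return !(slots[slot] & has_vmem_store); }
   };

In this model, note_vmem_store(0) poisons every non-restrict buffer at once,
which matches how the patch falls back to the shared flags[0] slot whenever a
resource is not provably restrict.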