X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fvulkan%2Fanv_nir_apply_pipeline_layout.c;h=2777762a322c9b6a746131bc0203eef7b9f53ad8;hb=8413fd136c5f82cf8742ea306e139ac5d7bc18f3;hp=978a8a57c701565a107b0abb02b2a44bb890fb3c;hpb=1bce04deb8e62e7fc1c19b1ecc7b6484e33ecf20;p=mesa.git diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index 978a8a57c70..2777762a322 100644 --- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -24,19 +24,39 @@ #include "anv_nir.h" #include "program/prog_parameter.h" #include "nir/nir_builder.h" +#include "compiler/brw_nir.h" +#include "util/set.h" + +/* Sampler tables don't actually have a maximum size but we pick one just so + * that we don't end up emitting too much state on-the-fly. + */ +#define MAX_SAMPLER_TABLE_SIZE 128 +#define BINDLESS_OFFSET 255 struct apply_pipeline_layout_state { + const struct anv_physical_device *pdevice; + nir_shader *shader; nir_builder builder; struct anv_pipeline_layout *layout; bool add_bounds_checks; + nir_address_format ssbo_addr_format; + + /* Place to flag lowered instructions so we don't lower them twice */ + struct set *lowered_instrs; + + int dynamic_offset_uniform_start; + bool uses_constants; + uint8_t constants_offset; struct { - BITSET_WORD *used; + bool desc_buffer_used; + uint8_t desc_offset; + + uint8_t *use_count; uint8_t *surface_offsets; uint8_t *sampler_offsets; - uint8_t *image_offsets; } set[MAX_SETS]; }; @@ -44,15 +64,39 @@ static void add_binding(struct apply_pipeline_layout_state *state, uint32_t set, uint32_t binding) { - BITSET_SET(state->set[set].used, binding); + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + if (state->set[set].use_count[binding] < UINT8_MAX) + state->set[set].use_count[binding]++; + + /* Only flag the descriptor buffer as used if there's actually data for + * this binding. This lets us be lazy and call this function constantly + * without worrying about unnecessarily enabling the buffer. 
+ */ + if (anv_descriptor_size(bind_layout)) + state->set[set].desc_buffer_used = true; } static void -add_var_binding(struct apply_pipeline_layout_state *state, nir_variable *var) +add_deref_src_binding(struct apply_pipeline_layout_state *state, nir_src src) { + nir_deref_instr *deref = nir_src_as_deref(src); + nir_variable *var = nir_deref_instr_get_variable(deref); add_binding(state, var->data.descriptor_set, var->data.binding); } +static void +add_tex_src_binding(struct apply_pipeline_layout_state *state, + nir_tex_instr *tex, nir_tex_src_type deref_src_type) +{ + int deref_src_idx = nir_tex_instr_src_index(tex, deref_src_type); + if (deref_src_idx < 0) + return; + + add_deref_src_binding(state, tex->src[deref_src_idx].src); +} + static void get_used_bindings_block(nir_block *block, struct apply_pipeline_layout_state *state) @@ -67,19 +111,26 @@ get_used_bindings_block(nir_block *block, nir_intrinsic_binding(intrin)); break; - case nir_intrinsic_image_load: - case nir_intrinsic_image_store: - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_min: - case nir_intrinsic_image_atomic_max: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_image_atomic_comp_swap: - case nir_intrinsic_image_size: - case nir_intrinsic_image_samples: - add_var_binding(state, intrin->variables[0]->var); + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_image_deref_load_param_intel: + case nir_intrinsic_image_deref_load_raw_intel: + case nir_intrinsic_image_deref_store_raw_intel: + add_deref_src_binding(state, intrin->src[0]); + break; + + case nir_intrinsic_load_constant: + state->uses_constants = true; break; default: @@ -89,10 +140,8 @@ get_used_bindings_block(nir_block *block, } case nir_instr_type_tex: { nir_tex_instr *tex = nir_instr_as_tex(instr); - assert(tex->texture); - add_var_binding(state, tex->texture->var); - if (tex->sampler) - add_var_binding(state, tex->sampler->var); + add_tex_src_binding(state, tex, nir_tex_src_texture_deref); + add_tex_src_binding(state, tex, nir_tex_src_sampler_deref); break; } default: @@ -101,6 +150,204 @@ get_used_bindings_block(nir_block *block, } } +static bool +find_descriptor_for_index_src(nir_src src, + struct apply_pipeline_layout_state *state) +{ + nir_intrinsic_instr *intrin = nir_src_as_intrinsic(src); + + while (intrin && intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) + intrin = nir_src_as_intrinsic(intrin->src[0]); + + if (!intrin || intrin->intrinsic != nir_intrinsic_vulkan_resource_index) + return false; + + uint32_t set = nir_intrinsic_desc_set(intrin); + uint32_t binding = nir_intrinsic_binding(intrin); + uint32_t surface_index = state->set[set].surface_offsets[binding]; + + /* Only lower to a BTI message if we have a valid binding table index. 
*/ + return surface_index < MAX_BINDING_TABLE_SIZE; +} + +static bool +nir_deref_find_descriptor(nir_deref_instr *deref, + struct apply_pipeline_layout_state *state) +{ + while (1) { + /* Nothing we will use this on has a variable */ + assert(deref->deref_type != nir_deref_type_var); + + nir_deref_instr *parent = nir_src_as_deref(deref->parent); + if (!parent) + break; + + deref = parent; + } + assert(deref->deref_type == nir_deref_type_cast); + + nir_intrinsic_instr *intrin = nir_src_as_intrinsic(deref->parent); + if (!intrin || intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor) + return false; + + return find_descriptor_for_index_src(intrin->src[0], state); +} + +static nir_ssa_def * +build_index_for_res_reindex(nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + nir_builder *b = &state->builder; + + if (intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) { + nir_ssa_def *bti = + build_index_for_res_reindex(nir_src_as_intrinsic(intrin->src[0]), state); + + b->cursor = nir_before_instr(&intrin->instr); + return nir_iadd(b, bti, nir_ssa_for_src(b, intrin->src[1], 1)); + } + + assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index); + + uint32_t set = nir_intrinsic_desc_set(intrin); + uint32_t binding = nir_intrinsic_binding(intrin); + + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + uint32_t surface_index = state->set[set].surface_offsets[binding]; + uint32_t array_size = bind_layout->array_size; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_ssa_def *array_index = nir_ssa_for_src(b, intrin->src[0], 1); + if (nir_src_is_const(intrin->src[0]) || state->add_bounds_checks) + array_index = nir_umin(b, array_index, nir_imm_int(b, array_size - 1)); + + return nir_iadd_imm(b, array_index, surface_index); +} + +static nir_ssa_def * +build_index_offset_for_deref(nir_deref_instr *deref, + struct apply_pipeline_layout_state *state) +{ + nir_builder *b = &state->builder; + + nir_deref_instr *parent = nir_deref_instr_parent(deref); + if (parent) { + nir_ssa_def *addr = build_index_offset_for_deref(parent, state); + + b->cursor = nir_before_instr(&deref->instr); + return nir_explicit_io_address_from_deref(b, deref, addr, + nir_address_format_32bit_index_offset); + } + + nir_intrinsic_instr *load_desc = nir_src_as_intrinsic(deref->parent); + assert(load_desc->intrinsic == nir_intrinsic_load_vulkan_descriptor); + + nir_ssa_def *index = + build_index_for_res_reindex(nir_src_as_intrinsic(load_desc->src[0]), state); + + /* Return a 0 offset which will get picked up by the recursion */ + b->cursor = nir_before_instr(&deref->instr); + return nir_vec2(b, index, nir_imm_int(b, 0)); +} + +static bool +try_lower_direct_buffer_intrinsic(nir_intrinsic_instr *intrin, bool is_atomic, + struct apply_pipeline_layout_state *state) +{ + nir_builder *b = &state->builder; + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + if (deref->mode != nir_var_mem_ssbo) + return false; + + /* 64-bit atomics only support A64 messages so we can't lower them to the + * index+offset model. + */ + if (is_atomic && nir_dest_bit_size(intrin->dest) == 64) + return false; + + /* Normal binding table-based messages can't handle non-uniform access so + * we have to fall back to A64. 
+ */ + if (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM) + return false; + + if (!nir_deref_find_descriptor(deref, state)) + return false; + + nir_ssa_def *addr = build_index_offset_for_deref(deref, state); + + b->cursor = nir_before_instr(&intrin->instr); + nir_lower_explicit_io_instr(b, intrin, addr, + nir_address_format_32bit_index_offset); + return true; +} + +static void +lower_direct_buffer_access(nir_function_impl *impl, + struct apply_pipeline_layout_state *state) +{ + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_deref: + case nir_intrinsic_store_deref: + try_lower_direct_buffer_intrinsic(intrin, false, state); + break; + case nir_intrinsic_deref_atomic_add: + case nir_intrinsic_deref_atomic_imin: + case nir_intrinsic_deref_atomic_umin: + case nir_intrinsic_deref_atomic_imax: + case nir_intrinsic_deref_atomic_umax: + case nir_intrinsic_deref_atomic_and: + case nir_intrinsic_deref_atomic_or: + case nir_intrinsic_deref_atomic_xor: + case nir_intrinsic_deref_atomic_exchange: + case nir_intrinsic_deref_atomic_comp_swap: + case nir_intrinsic_deref_atomic_fmin: + case nir_intrinsic_deref_atomic_fmax: + case nir_intrinsic_deref_atomic_fcomp_swap: + try_lower_direct_buffer_intrinsic(intrin, true, state); + break; + + case nir_intrinsic_get_buffer_size: { + /* The get_buffer_size intrinsic always just takes a + * index/reindex intrinsic. + */ + if (!find_descriptor_for_index_src(intrin->src[0], state)) + break; + + nir_ssa_def *index = + build_index_for_res_reindex(nir_src_as_intrinsic(intrin->src[0]), + state); + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(index)); + _mesa_set_add(state->lowered_instrs, intrin); + } + + default: + break; + } + } + } +} + +static nir_address_format +desc_addr_format(VkDescriptorType desc_type, + struct apply_pipeline_layout_state *state) +{ + return (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER || + desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) ? 
+ state->ssbo_addr_format : nir_address_format_32bit_index_offset; +} + static void lower_res_index_intrinsic(nir_intrinsic_instr *intrin, struct apply_pipeline_layout_state *state) @@ -111,29 +358,78 @@ lower_res_index_intrinsic(nir_intrinsic_instr *intrin, uint32_t set = nir_intrinsic_desc_set(intrin); uint32_t binding = nir_intrinsic_binding(intrin); + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); - uint32_t surface_index = state->set[set].surface_offsets[binding]; - uint32_t array_size = - state->layout->set[set].layout->binding[binding].array_size; + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; - nir_const_value *const_array_index = nir_src_as_const_value(intrin->src[0]); + uint32_t surface_index = state->set[set].surface_offsets[binding]; + uint32_t array_size = bind_layout->array_size; + + nir_ssa_def *array_index = nir_ssa_for_src(b, intrin->src[0], 1); + if (nir_src_is_const(intrin->src[0]) || state->add_bounds_checks) + array_index = nir_umin(b, array_index, nir_imm_int(b, array_size - 1)); + + nir_ssa_def *index; + if (state->pdevice->has_a64_buffer_access && + (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER || + desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)) { + /* We store the descriptor offset as 16.8.8 where the top 16 bits are + * the offset into the descriptor set, the next 8 are the binding table + * index of the descriptor buffer, and the bottom 8 bits are the offset + * (in bytes) into the dynamic offset table. + */ + assert(bind_layout->dynamic_offset_index < MAX_DYNAMIC_BUFFERS); + uint32_t dynamic_offset_index = 0xff; /* No dynamic offset */ + if (bind_layout->dynamic_offset_index >= 0) { + dynamic_offset_index = + state->layout->set[set].dynamic_offset_start + + bind_layout->dynamic_offset_index; + } - nir_ssa_def *block_index; - if (const_array_index) { - unsigned array_index = const_array_index->u32[0]; - array_index = MIN2(array_index, array_size - 1); - block_index = nir_imm_int(b, surface_index + array_index); + const uint32_t desc_offset = + bind_layout->descriptor_offset << 16 | + (uint32_t)state->set[set].desc_offset << 8 | + dynamic_offset_index; + + if (state->add_bounds_checks) { + assert(desc_addr_format(desc_type, state) == + nir_address_format_64bit_bounded_global); + assert(intrin->dest.ssa.num_components == 4); + assert(intrin->dest.ssa.bit_size == 32); + index = nir_vec4(b, nir_imm_int(b, desc_offset), + nir_ssa_for_src(b, intrin->src[0], 1), + nir_imm_int(b, array_size - 1), + nir_ssa_undef(b, 1, 32)); + } else { + assert(desc_addr_format(desc_type, state) == + nir_address_format_64bit_global); + assert(intrin->dest.ssa.num_components == 1); + assert(intrin->dest.ssa.bit_size == 64); + index = nir_pack_64_2x32_split(b, nir_imm_int(b, desc_offset), + nir_ssa_for_src(b, intrin->src[0], 1)); + } + } else if (bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) { + /* This is an inline uniform block. Just reference the descriptor set + * and use the descriptor offset as the base. 
+ */ + assert(desc_addr_format(desc_type, state) == + nir_address_format_32bit_index_offset); + assert(intrin->dest.ssa.num_components == 2); + assert(intrin->dest.ssa.bit_size == 32); + index = nir_imm_ivec2(b, state->set[set].desc_offset, + bind_layout->descriptor_offset); } else { - block_index = nir_ssa_for_src(b, intrin->src[0], 1); - - if (state->add_bounds_checks) - block_index = nir_umin(b, block_index, nir_imm_int(b, array_size - 1)); - - block_index = nir_iadd(b, nir_imm_int(b, surface_index), block_index); + assert(desc_addr_format(desc_type, state) == + nir_address_format_32bit_index_offset); + assert(intrin->dest.ssa.num_components == 2); + assert(intrin->dest.ssa.bit_size == 32); + index = nir_vec2(b, nir_iadd_imm(b, array_index, surface_index), + nir_imm_int(b, 0)); } assert(intrin->dest.is_ssa); - nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(block_index)); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(index)); nir_instr_remove(&intrin->instr); } @@ -143,146 +439,569 @@ lower_res_reindex_intrinsic(nir_intrinsic_instr *intrin, { nir_builder *b = &state->builder; + b->cursor = nir_before_instr(&intrin->instr); + + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + /* For us, the resource indices are just indices into the binding table and * array elements are sequential. A resource_reindex just turns into an * add of the two indices. */ - assert(intrin->src[0].is_ssa && intrin->src[0].is_ssa); - nir_ssa_def *new_index = nir_iadd(b, intrin->src[0].ssa, - intrin->src[1].ssa); + assert(intrin->src[0].is_ssa && intrin->src[1].is_ssa); + nir_ssa_def *old_index = intrin->src[0].ssa; + nir_ssa_def *offset = intrin->src[1].ssa; + + nir_ssa_def *new_index; + switch (desc_addr_format(desc_type, state)) { + case nir_address_format_64bit_bounded_global: + /* See also lower_res_index_intrinsic() */ + assert(intrin->dest.ssa.num_components == 4); + assert(intrin->dest.ssa.bit_size == 32); + new_index = nir_vec4(b, nir_channel(b, old_index, 0), + nir_iadd(b, nir_channel(b, old_index, 1), + offset), + nir_channel(b, old_index, 2), + nir_ssa_undef(b, 1, 32)); + break; + + case nir_address_format_64bit_global: { + /* See also lower_res_index_intrinsic() */ + assert(intrin->dest.ssa.num_components == 1); + assert(intrin->dest.ssa.bit_size == 64); + nir_ssa_def *base = nir_unpack_64_2x32_split_x(b, old_index); + nir_ssa_def *arr_idx = nir_unpack_64_2x32_split_y(b, old_index); + new_index = nir_pack_64_2x32_split(b, base, nir_iadd(b, arr_idx, offset)); + break; + } + + case nir_address_format_32bit_index_offset: + assert(intrin->dest.ssa.num_components == 2); + assert(intrin->dest.ssa.bit_size == 32); + new_index = nir_vec2(b, nir_iadd(b, nir_channel(b, old_index, 0), offset), + nir_channel(b, old_index, 1)); + break; + + default: + unreachable("Uhandled address format"); + } assert(intrin->dest.is_ssa); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(new_index)); nir_instr_remove(&intrin->instr); } +static nir_ssa_def * +build_ssbo_descriptor_load(const VkDescriptorType desc_type, + nir_ssa_def *index, + struct apply_pipeline_layout_state *state) +{ + nir_builder *b = &state->builder; + + nir_ssa_def *desc_offset, *array_index; + switch (state->ssbo_addr_format) { + case nir_address_format_64bit_bounded_global: + /* See also lower_res_index_intrinsic() */ + desc_offset = nir_channel(b, index, 0); + array_index = nir_umin(b, nir_channel(b, index, 1), + nir_channel(b, index, 2)); + break; + + case nir_address_format_64bit_global: + /* 
See also lower_res_index_intrinsic() */ + desc_offset = nir_unpack_64_2x32_split_x(b, index); + array_index = nir_unpack_64_2x32_split_y(b, index); + break; + + default: + unreachable("Unhandled address format for SSBO"); + } + + /* The desc_offset is actually 16.8.8 */ + nir_ssa_def *desc_buffer_index = + nir_extract_u8(b, desc_offset, nir_imm_int(b, 1)); + nir_ssa_def *desc_offset_base = + nir_extract_u16(b, desc_offset, nir_imm_int(b, 1)); + + /* Compute the actual descriptor offset */ + const unsigned descriptor_size = + anv_descriptor_type_size(state->pdevice, desc_type); + desc_offset = nir_iadd(b, desc_offset_base, + nir_imul_imm(b, array_index, descriptor_size)); + + nir_intrinsic_instr *desc_load = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo); + desc_load->src[0] = nir_src_for_ssa(desc_buffer_index); + desc_load->src[1] = nir_src_for_ssa(desc_offset); + desc_load->num_components = 4; + nir_ssa_dest_init(&desc_load->instr, &desc_load->dest, 4, 32, NULL); + nir_builder_instr_insert(b, &desc_load->instr); + + return &desc_load->dest.ssa; +} + static void -lower_tex_deref(nir_tex_instr *tex, nir_deref_var *deref, - unsigned *const_index, unsigned array_size, - nir_tex_src_type src_type, bool allow_indirect, - struct apply_pipeline_layout_state *state) +lower_load_vulkan_descriptor(nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) { nir_builder *b = &state->builder; - if (deref->deref.child) { - assert(deref->deref.child->deref_type == nir_deref_type_array); - nir_deref_array *deref_array = nir_deref_as_array(deref->deref.child); - - if (deref_array->deref_array_type == nir_deref_array_type_indirect) { - /* From VK_KHR_sampler_ycbcr_conversion: - * - * If sampler Y’CBCR conversion is enabled, the combined image - * sampler must be indexed only by constant integral expressions when - * aggregated into arrays in shader code, irrespective of the - * shaderSampledImageArrayDynamicIndexing feature. + b->cursor = nir_before_instr(&intrin->instr); + + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + + assert(intrin->src[0].is_ssa); + nir_ssa_def *index = intrin->src[0].ssa; + + nir_ssa_def *desc; + if (state->pdevice->has_a64_buffer_access && + (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER || + desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)) { + desc = build_ssbo_descriptor_load(desc_type, index, state); + + /* We want nir_address_format_64bit_global */ + if (!state->add_bounds_checks) + desc = nir_pack_64_2x32(b, nir_channels(b, desc, 0x3)); + + if (state->dynamic_offset_uniform_start >= 0) { + /* This shader has dynamic offsets and we have no way of knowing + * (save from the dynamic offset base index) if this buffer has a + * dynamic offset. 
*/ - assert(allow_indirect); + nir_ssa_def *desc_offset, *array_index; + switch (state->ssbo_addr_format) { + case nir_address_format_64bit_bounded_global: + /* See also lower_res_index_intrinsic() */ + desc_offset = nir_channel(b, index, 0); + array_index = nir_umin(b, nir_channel(b, index, 1), + nir_channel(b, index, 2)); + break; - nir_ssa_def *index = - nir_iadd(b, nir_imm_int(b, deref_array->base_offset), - nir_ssa_for_src(b, deref_array->indirect, 1)); + case nir_address_format_64bit_global: + /* See also lower_res_index_intrinsic() */ + desc_offset = nir_unpack_64_2x32_split_x(b, index); + array_index = nir_unpack_64_2x32_split_y(b, index); + break; - if (state->add_bounds_checks) - index = nir_umin(b, index, nir_imm_int(b, array_size - 1)); + default: + unreachable("Unhandled address format for SSBO"); + } - nir_tex_instr_add_src(tex, src_type, nir_src_for_ssa(index)); - } else { - *const_index += MIN2(deref_array->base_offset, array_size - 1); + nir_ssa_def *dyn_offset_base = + nir_extract_u8(b, desc_offset, nir_imm_int(b, 0)); + nir_ssa_def *dyn_offset_idx = + nir_iadd(b, dyn_offset_base, array_index); + if (state->add_bounds_checks) { + dyn_offset_idx = nir_umin(b, dyn_offset_idx, + nir_imm_int(b, MAX_DYNAMIC_BUFFERS)); + } + + nir_intrinsic_instr *dyn_load = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform); + nir_intrinsic_set_base(dyn_load, state->dynamic_offset_uniform_start); + nir_intrinsic_set_range(dyn_load, MAX_DYNAMIC_BUFFERS * 4); + dyn_load->src[0] = nir_src_for_ssa(nir_imul_imm(b, dyn_offset_idx, 4)); + dyn_load->num_components = 1; + nir_ssa_dest_init(&dyn_load->instr, &dyn_load->dest, 1, 32, NULL); + nir_builder_instr_insert(b, &dyn_load->instr); + + nir_ssa_def *dynamic_offset = + nir_bcsel(b, nir_ieq(b, dyn_offset_base, nir_imm_int(b, 0xff)), + nir_imm_int(b, 0), &dyn_load->dest.ssa); + + switch (state->ssbo_addr_format) { + case nir_address_format_64bit_bounded_global: { + /* The dynamic offset gets added to the base pointer so that we + * have a sliding window range. 
+ */ + nir_ssa_def *base_ptr = + nir_pack_64_2x32(b, nir_channels(b, desc, 0x3)); + base_ptr = nir_iadd(b, base_ptr, nir_u2u64(b, dynamic_offset)); + desc = nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_ptr), + nir_unpack_64_2x32_split_y(b, base_ptr), + nir_channel(b, desc, 2), + nir_channel(b, desc, 3)); + break; + } + + case nir_address_format_64bit_global: + desc = nir_iadd(b, desc, nir_u2u64(b, dynamic_offset)); + break; + + default: + unreachable("Unhandled address format for SSBO"); + } } + } else { + /* We follow the nir_address_format_32bit_index_offset model */ + desc = index; } + + assert(intrin->dest.is_ssa); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(desc)); + nir_instr_remove(&intrin->instr); } static void -cleanup_tex_deref(nir_tex_instr *tex, nir_deref_var *deref) +lower_get_buffer_size(nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) { - if (deref->deref.child == NULL) + if (_mesa_set_search(state->lowered_instrs, intrin)) return; - nir_deref_array *deref_array = nir_deref_as_array(deref->deref.child); + nir_builder *b = &state->builder; + + b->cursor = nir_before_instr(&intrin->instr); - if (deref_array->deref_array_type != nir_deref_array_type_indirect) - return; + const VkDescriptorType desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - nir_instr_rewrite_src(&tex->instr, &deref_array->indirect, NIR_SRC_INIT); + assert(intrin->src[0].is_ssa); + nir_ssa_def *index = intrin->src[0].ssa; + + if (state->pdevice->has_a64_buffer_access) { + nir_ssa_def *desc = build_ssbo_descriptor_load(desc_type, index, state); + nir_ssa_def *size = nir_channel(b, desc, 2); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(size)); + nir_instr_remove(&intrin->instr); + } else { + /* We're following the nir_address_format_32bit_index_offset model so + * the binding table index is the first component of the address. The + * back-end wants a scalar binding table index source. 
+ */ + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(nir_channel(b, index, 0))); + } } -static bool -has_tex_src_plane(nir_tex_instr *tex) +static nir_ssa_def * +build_descriptor_load(nir_deref_instr *deref, unsigned offset, + unsigned num_components, unsigned bit_size, + struct apply_pipeline_layout_state *state) { - for (unsigned i = 0; i < tex->num_srcs; i++) { - if (tex->src[i].src_type == nir_tex_src_plane) - return true; + nir_variable *var = nir_deref_instr_get_variable(deref); + + unsigned set = var->data.descriptor_set; + unsigned binding = var->data.binding; + unsigned array_size = + state->layout->set[set].layout->binding[binding].array_size; + + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + nir_builder *b = &state->builder; + + nir_ssa_def *desc_buffer_index = + nir_imm_int(b, state->set[set].desc_offset); + + nir_ssa_def *desc_offset = + nir_imm_int(b, bind_layout->descriptor_offset + offset); + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + + const unsigned descriptor_size = anv_descriptor_size(bind_layout); + nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1); + if (state->add_bounds_checks) + arr_index = nir_umin(b, arr_index, nir_imm_int(b, array_size - 1)); + + desc_offset = nir_iadd(b, desc_offset, + nir_imul_imm(b, arr_index, descriptor_size)); } - return false; + nir_intrinsic_instr *desc_load = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo); + desc_load->src[0] = nir_src_for_ssa(desc_buffer_index); + desc_load->src[1] = nir_src_for_ssa(desc_offset); + desc_load->num_components = num_components; + nir_ssa_dest_init(&desc_load->instr, &desc_load->dest, + num_components, bit_size, NULL); + nir_builder_instr_insert(b, &desc_load->instr); + + return &desc_load->dest.ssa; } -static uint32_t -extract_tex_src_plane(nir_tex_instr *tex) +static void +lower_image_intrinsic(nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) { - unsigned plane = 0; - - int plane_src_idx = -1; - for (unsigned i = 0; i < tex->num_srcs; i++) { - if (tex->src[i].src_type == nir_tex_src_plane) { - nir_const_value *const_plane = - nir_src_as_const_value(tex->src[i].src); - - /* Our color conversion lowering pass should only ever insert - * constants. 
*/ - assert(const_plane); - plane = const_plane->u32[0]; - plane_src_idx = i; + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + + unsigned set = var->data.descriptor_set; + unsigned binding = var->data.binding; + unsigned binding_offset = state->set[set].surface_offsets[binding]; + + nir_builder *b = &state->builder; + b->cursor = nir_before_instr(&intrin->instr); + + const bool use_bindless = state->pdevice->has_bindless_images; + + if (intrin->intrinsic == nir_intrinsic_image_deref_load_param_intel) { + b->cursor = nir_instr_remove(&intrin->instr); + + assert(!use_bindless); /* Otherwise our offsets would be wrong */ + const unsigned param = nir_intrinsic_base(intrin); + + nir_ssa_def *desc = + build_descriptor_load(deref, param * 16, + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size, state); + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(desc)); + } else if (binding_offset > MAX_BINDING_TABLE_SIZE) { + const bool write_only = + (var->data.image.access & ACCESS_NON_READABLE) != 0; + nir_ssa_def *desc = + build_descriptor_load(deref, 0, 2, 32, state); + nir_ssa_def *handle = nir_channel(b, desc, write_only ? 1 : 0); + nir_rewrite_image_intrinsic(intrin, handle, true); + } else { + unsigned array_size = + state->layout->set[set].layout->binding[binding].array_size; + + nir_ssa_def *index = NULL; + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + index = nir_ssa_for_src(b, deref->arr.index, 1); + if (state->add_bounds_checks) + index = nir_umin(b, index, nir_imm_int(b, array_size - 1)); + } else { + index = nir_imm_int(b, 0); } + + index = nir_iadd_imm(b, index, binding_offset); + nir_rewrite_image_intrinsic(intrin, index, false); } +} - assert(plane_src_idx >= 0); - nir_tex_instr_remove_src(tex, plane_src_idx); +static void +lower_load_constant(nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + nir_builder *b = &state->builder; - return plane; + b->cursor = nir_before_instr(&intrin->instr); + + nir_ssa_def *index = nir_imm_int(b, state->constants_offset); + nir_ssa_def *offset = nir_iadd(b, nir_ssa_for_src(b, intrin->src[0], 1), + nir_imm_int(b, nir_intrinsic_base(intrin))); + + nir_intrinsic_instr *load_ubo = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo); + load_ubo->num_components = intrin->num_components; + load_ubo->src[0] = nir_src_for_ssa(index); + load_ubo->src[1] = nir_src_for_ssa(offset); + nir_ssa_dest_init(&load_ubo->instr, &load_ubo->dest, + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size, NULL); + nir_builder_instr_insert(b, &load_ubo->instr); + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + nir_src_for_ssa(&load_ubo->dest.ssa)); + nir_instr_remove(&intrin->instr); } static void -lower_tex(nir_tex_instr *tex, struct apply_pipeline_layout_state *state) +lower_tex_deref(nir_tex_instr *tex, nir_tex_src_type deref_src_type, + unsigned *base_index, unsigned plane, + struct apply_pipeline_layout_state *state) { - /* No one should have come by and lowered it already */ - assert(tex->texture); + int deref_src_idx = nir_tex_instr_src_index(tex, deref_src_type); + if (deref_src_idx < 0) + return; - state->builder.cursor = nir_before_instr(&tex->instr); + nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src); + nir_variable *var = nir_deref_instr_get_variable(deref); - unsigned set = tex->texture->var->data.descriptor_set; - unsigned binding = 
tex->texture->var->data.binding; + unsigned set = var->data.descriptor_set; + unsigned binding = var->data.binding; unsigned array_size = state->layout->set[set].layout->binding[binding].array_size; - bool has_plane = has_tex_src_plane(tex); - unsigned plane = has_plane ? extract_tex_src_plane(tex) : 0; - tex->texture_index = state->set[set].surface_offsets[binding]; - lower_tex_deref(tex, tex->texture, &tex->texture_index, array_size, - nir_tex_src_texture_offset, !has_plane, state); - tex->texture_index += plane; + unsigned binding_offset; + if (deref_src_type == nir_tex_src_texture_deref) { + binding_offset = state->set[set].surface_offsets[binding]; + } else { + assert(deref_src_type == nir_tex_src_sampler_deref); + binding_offset = state->set[set].sampler_offsets[binding]; + } - if (tex->sampler) { - unsigned set = tex->sampler->var->data.descriptor_set; - unsigned binding = tex->sampler->var->data.binding; - unsigned array_size = - state->layout->set[set].layout->binding[binding].array_size; - tex->sampler_index = state->set[set].sampler_offsets[binding]; - lower_tex_deref(tex, tex->sampler, &tex->sampler_index, array_size, - nir_tex_src_sampler_offset, !has_plane, state); - tex->sampler_index += plane; + nir_builder *b = &state->builder; + + nir_tex_src_type offset_src_type; + nir_ssa_def *index = NULL; + if (binding_offset > MAX_BINDING_TABLE_SIZE) { + const unsigned plane_offset = + plane * sizeof(struct anv_sampled_image_descriptor); + + nir_ssa_def *desc = + build_descriptor_load(deref, plane_offset, 2, 32, state); + + if (deref_src_type == nir_tex_src_texture_deref) { + offset_src_type = nir_tex_src_texture_handle; + index = nir_channel(b, desc, 0); + } else { + assert(deref_src_type == nir_tex_src_sampler_deref); + offset_src_type = nir_tex_src_sampler_handle; + index = nir_channel(b, desc, 1); + } + } else { + if (deref_src_type == nir_tex_src_texture_deref) { + offset_src_type = nir_tex_src_texture_offset; + } else { + assert(deref_src_type == nir_tex_src_sampler_deref); + offset_src_type = nir_tex_src_sampler_offset; + } + + *base_index = binding_offset + plane; + + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + + if (nir_src_is_const(deref->arr.index)) { + unsigned arr_index = nir_src_as_uint(deref->arr.index); + *base_index += MIN2(arr_index, array_size - 1); + } else { + /* From VK_KHR_sampler_ycbcr_conversion: + * + * If sampler Y’CBCR conversion is enabled, the combined image + * sampler must be indexed only by constant integral expressions + * when aggregated into arrays in shader code, irrespective of + * the shaderSampledImageArrayDynamicIndexing feature. 
+ */ + assert(nir_tex_instr_src_index(tex, nir_tex_src_plane) == -1); + + index = nir_ssa_for_src(b, deref->arr.index, 1); + + if (state->add_bounds_checks) + index = nir_umin(b, index, nir_imm_int(b, array_size - 1)); + } + } } + if (index) { + nir_instr_rewrite_src(&tex->instr, &tex->src[deref_src_idx].src, + nir_src_for_ssa(index)); + tex->src[deref_src_idx].src_type = offset_src_type; + } else { + nir_tex_instr_remove_src(tex, deref_src_idx); + } +} + +static uint32_t +tex_instr_get_and_remove_plane_src(nir_tex_instr *tex) +{ + int plane_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_plane); + if (plane_src_idx < 0) + return 0; + + unsigned plane = nir_src_as_uint(tex->src[plane_src_idx].src); + + nir_tex_instr_remove_src(tex, plane_src_idx); + + return plane; +} + +static nir_ssa_def * +build_def_array_select(nir_builder *b, nir_ssa_def **srcs, nir_ssa_def *idx, + unsigned start, unsigned end) +{ + if (start == end - 1) { + return srcs[start]; + } else { + unsigned mid = start + (end - start) / 2; + return nir_bcsel(b, nir_ilt(b, idx, nir_imm_int(b, mid)), + build_def_array_select(b, srcs, idx, start, mid), + build_def_array_select(b, srcs, idx, mid, end)); + } +} + +static void +lower_gen7_tex_swizzle(nir_tex_instr *tex, unsigned plane, + struct apply_pipeline_layout_state *state) +{ + assert(state->pdevice->info.gen == 7 && !state->pdevice->info.is_haswell); + if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF || + nir_tex_instr_is_query(tex) || + tex->op == nir_texop_tg4 || /* We can't swizzle TG4 */ + (tex->is_shadow && tex->is_new_style_shadow)) + return; + + int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref); + assert(deref_src_idx >= 0); + + nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src); + UNUSED nir_variable *var = nir_deref_instr_get_variable(deref); + + UNUSED unsigned set = var->data.descriptor_set; + UNUSED unsigned binding = var->data.binding; + UNUSED const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + assert(bind_layout->data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE); + + nir_builder *b = &state->builder; + b->cursor = nir_before_instr(&tex->instr); + + const unsigned plane_offset = + plane * sizeof(struct anv_texture_swizzle_descriptor); + nir_ssa_def *swiz = + build_descriptor_load(deref, plane_offset, 1, 32, state); + + b->cursor = nir_after_instr(&tex->instr); + + assert(tex->dest.ssa.bit_size == 32); + assert(tex->dest.ssa.num_components == 4); + + /* Initializing to undef is ok; nir_opt_undef will clean it up. 
*/ + nir_ssa_def *undef = nir_ssa_undef(b, 1, 32); + nir_ssa_def *comps[8]; + for (unsigned i = 0; i < ARRAY_SIZE(comps); i++) + comps[i] = undef; + + comps[ISL_CHANNEL_SELECT_ZERO] = nir_imm_int(b, 0); + if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float) + comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_float(b, 1); + else + comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_int(b, 1); + comps[ISL_CHANNEL_SELECT_RED] = nir_channel(b, &tex->dest.ssa, 0); + comps[ISL_CHANNEL_SELECT_GREEN] = nir_channel(b, &tex->dest.ssa, 1); + comps[ISL_CHANNEL_SELECT_BLUE] = nir_channel(b, &tex->dest.ssa, 2); + comps[ISL_CHANNEL_SELECT_ALPHA] = nir_channel(b, &tex->dest.ssa, 3); + + nir_ssa_def *swiz_comps[4]; + for (unsigned i = 0; i < 4; i++) { + nir_ssa_def *comp_swiz = nir_extract_u8(b, swiz, nir_imm_int(b, i)); + swiz_comps[i] = build_def_array_select(b, comps, comp_swiz, 0, 8); + } + nir_ssa_def *swiz_tex_res = nir_vec(b, swiz_comps, 4); + + /* Rewrite uses before we insert so we don't rewrite this use */ + nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, + nir_src_for_ssa(swiz_tex_res), + swiz_tex_res->parent_instr); +} + +static void +lower_tex(nir_tex_instr *tex, struct apply_pipeline_layout_state *state) +{ + unsigned plane = tex_instr_get_and_remove_plane_src(tex); + + /* On Ivy Bridge and Bay Trail, we have to swizzle in the shader. Do this + * before we lower the derefs away so we can still find the descriptor. + */ + if (state->pdevice->info.gen == 7 && !state->pdevice->info.is_haswell) + lower_gen7_tex_swizzle(tex, plane, state); + + state->builder.cursor = nir_before_instr(&tex->instr); + + lower_tex_deref(tex, nir_tex_src_texture_deref, + &tex->texture_index, plane, state); + + lower_tex_deref(tex, nir_tex_src_sampler_deref, + &tex->sampler_index, plane, state); + /* The backend only ever uses this to mark used surfaces. We don't care * about that little optimization so it just needs to be non-zero. 
*/ tex->texture_array_size = 1; - - cleanup_tex_deref(tex, tex->texture); - if (tex->sampler) - cleanup_tex_deref(tex, tex->sampler); - tex->texture = NULL; - tex->sampler = NULL; } static void @@ -300,6 +1019,32 @@ apply_pipeline_layout_block(nir_block *block, case nir_intrinsic_vulkan_resource_reindex: lower_res_reindex_intrinsic(intrin, state); break; + case nir_intrinsic_load_vulkan_descriptor: + lower_load_vulkan_descriptor(intrin, state); + break; + case nir_intrinsic_get_buffer_size: + lower_get_buffer_size(intrin, state); + break; + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_image_deref_load_param_intel: + case nir_intrinsic_image_deref_load_raw_intel: + case nir_intrinsic_image_deref_store_raw_intel: + lower_image_intrinsic(intrin, state); + break; + case nir_intrinsic_load_constant: + lower_load_constant(intrin, state); + break; default: break; } @@ -314,40 +1059,50 @@ apply_pipeline_layout_block(nir_block *block, } } -static void -setup_vec4_uniform_value(uint32_t *params, uint32_t offset, unsigned n) +struct binding_info { + uint32_t binding; + uint8_t set; + uint16_t score; +}; + +static int +compare_binding_infos(const void *_a, const void *_b) { - for (unsigned i = 0; i < n; ++i) - params[i] = ANV_PARAM_PUSH(offset + i * sizeof(uint32_t)); + const struct binding_info *a = _a, *b = _b; + if (a->score != b->score) + return b->score - a->score; - for (unsigned i = n; i < 4; ++i) - params[i] = BRW_PARAM_BUILTIN_ZERO; + if (a->set != b->set) + return a->set - b->set; + + return a->binding - b->binding; } void -anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline, +anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, + bool robust_buffer_access, + struct anv_pipeline_layout *layout, nir_shader *shader, struct brw_stage_prog_data *prog_data, struct anv_pipeline_bind_map *map) { - struct anv_pipeline_layout *layout = pipeline->layout; - gl_shader_stage stage = shader->info.stage; + void *mem_ctx = ralloc_context(NULL); struct apply_pipeline_layout_state state = { + .pdevice = pdevice, .shader = shader, .layout = layout, - .add_bounds_checks = pipeline->device->robust_buffer_access, + .add_bounds_checks = robust_buffer_access, + .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_buffer_access), + .lowered_instrs = _mesa_pointer_set_create(mem_ctx), + .dynamic_offset_uniform_start = -1, }; - void *mem_ctx = ralloc_context(NULL); - for (unsigned s = 0; s < layout->num_sets; s++) { const unsigned count = layout->set[s].layout->binding_count; - const unsigned words = BITSET_WORDS(count); - state.set[s].used = rzalloc_array(mem_ctx, BITSET_WORD, words); + state.set[s].use_count = rzalloc_array(mem_ctx, uint8_t, count); state.set[s].surface_offsets = rzalloc_array(mem_ctx, uint8_t, count); state.set[s].sampler_offsets = rzalloc_array(mem_ctx, uint8_t, count); - state.set[s].image_offsets = rzalloc_array(mem_ctx, uint8_t, count); } nir_foreach_function(function, shader) { @@ -358,86 +1113,175 @@ anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline, 
get_used_bindings_block(block, &state); } + for (unsigned s = 0; s < layout->num_sets; s++) { + if (state.set[s].desc_buffer_used) { + map->surface_to_descriptor[map->surface_count] = + (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_DESCRIPTORS, + .binding = s, + }; + state.set[s].desc_offset = map->surface_count; + map->surface_count++; + } + } + + if (state.uses_constants) { + state.constants_offset = map->surface_count; + map->surface_to_descriptor[map->surface_count].set = + ANV_DESCRIPTOR_SET_SHADER_CONSTANTS; + map->surface_count++; + } + + unsigned used_binding_count = 0; for (uint32_t set = 0; set < layout->num_sets; set++) { struct anv_descriptor_set_layout *set_layout = layout->set[set].layout; + for (unsigned b = 0; b < set_layout->binding_count; b++) { + if (state.set[set].use_count[b] == 0) + continue; - BITSET_WORD b, _tmp; - BITSET_FOREACH_SET(b, _tmp, state.set[set].used, - set_layout->binding_count) { - if (set_layout->binding[b].stage[stage].surface_index >= 0) { - map->surface_count += - anv_descriptor_set_binding_layout_get_hw_size(&set_layout->binding[b]); - } - if (set_layout->binding[b].stage[stage].sampler_index >= 0) { - map->sampler_count += - anv_descriptor_set_binding_layout_get_hw_size(&set_layout->binding[b]); - } - if (set_layout->binding[b].stage[stage].image_index >= 0) - map->image_count += set_layout->binding[b].array_size; + used_binding_count++; } } - unsigned surface = 0; - unsigned sampler = 0; - unsigned image = 0; + struct binding_info *infos = + rzalloc_array(mem_ctx, struct binding_info, used_binding_count); + used_binding_count = 0; for (uint32_t set = 0; set < layout->num_sets; set++) { struct anv_descriptor_set_layout *set_layout = layout->set[set].layout; + for (unsigned b = 0; b < set_layout->binding_count; b++) { + if (state.set[set].use_count[b] == 0) + continue; - BITSET_WORD b, _tmp; - BITSET_FOREACH_SET(b, _tmp, state.set[set].used, - set_layout->binding_count) { struct anv_descriptor_set_binding_layout *binding = - &set_layout->binding[b]; + &layout->set[set].layout->binding[b]; + + /* Do a fixed-point calculation to generate a score based on the + * number of uses and the binding array size. We shift by 7 instead + * of 8 because we're going to use the top bit below to make + * everything which does not support bindless super higher priority + * than things which do. + */ + uint16_t score = ((uint16_t)state.set[set].use_count[b] << 7) / + binding->array_size; + + /* If the descriptor type doesn't support bindless then put it at the + * beginning so we guarantee it gets a slot. + */ + if (!anv_descriptor_supports_bindless(pdevice, binding, true) || + !anv_descriptor_supports_bindless(pdevice, binding, false)) + score |= 1 << 15; + + infos[used_binding_count++] = (struct binding_info) { + .set = set, + .binding = b, + .score = score, + }; + } + } + + /* Order the binding infos based on score with highest scores first. If + * scores are equal we then order by set and binding. 
+ */ + qsort(infos, used_binding_count, sizeof(struct binding_info), + compare_binding_infos); + + bool have_dynamic_buffers = false; + + for (unsigned i = 0; i < used_binding_count; i++) { + unsigned set = infos[i].set, b = infos[i].binding; + struct anv_descriptor_set_binding_layout *binding = + &layout->set[set].layout->binding[b]; - if (binding->stage[stage].surface_index >= 0) { - state.set[set].surface_offsets[b] = surface; + if (binding->dynamic_offset_index >= 0) + have_dynamic_buffers = true; + + const uint32_t array_size = binding->array_size; + + if (binding->data & ANV_DESCRIPTOR_SURFACE_STATE) { + if (map->surface_count + array_size > MAX_BINDING_TABLE_SIZE || + anv_descriptor_requires_bindless(pdevice, binding, false)) { + /* If this descriptor doesn't fit in the binding table or if it + * requires bindless for some reason, flag it as bindless. + */ + assert(anv_descriptor_supports_bindless(pdevice, binding, false)); + state.set[set].surface_offsets[b] = BINDLESS_OFFSET; + } else { + state.set[set].surface_offsets[b] = map->surface_count; struct anv_sampler **samplers = binding->immutable_samplers; for (unsigned i = 0; i < binding->array_size; i++) { uint8_t planes = samplers ? samplers[i]->n_planes : 1; for (uint8_t p = 0; p < planes; p++) { - map->surface_to_descriptor[surface].set = set; - map->surface_to_descriptor[surface].binding = b; - map->surface_to_descriptor[surface].index = i; - map->surface_to_descriptor[surface].plane = p; - surface++; + map->surface_to_descriptor[map->surface_count++] = + (struct anv_pipeline_binding) { + .set = set, + .binding = b, + .index = i, + .plane = p, + }; } } } + assert(map->surface_count <= MAX_BINDING_TABLE_SIZE); + } - if (binding->stage[stage].sampler_index >= 0) { - state.set[set].sampler_offsets[b] = sampler; + if (binding->data & ANV_DESCRIPTOR_SAMPLER_STATE) { + if (map->sampler_count + array_size > MAX_SAMPLER_TABLE_SIZE || + anv_descriptor_requires_bindless(pdevice, binding, true)) { + /* If this descriptor doesn't fit in the binding table or if it + * requires bindless for some reason, flag it as bindless. + * + * We also make large sampler arrays bindless because we can avoid + * using indirect sends thanks to bindless samplers being packed + * less tightly than the sampler table. + */ + assert(anv_descriptor_supports_bindless(pdevice, binding, true)); + state.set[set].sampler_offsets[b] = BINDLESS_OFFSET; + } else { + state.set[set].sampler_offsets[b] = map->sampler_count; struct anv_sampler **samplers = binding->immutable_samplers; for (unsigned i = 0; i < binding->array_size; i++) { uint8_t planes = samplers ? 
samplers[i]->n_planes : 1; for (uint8_t p = 0; p < planes; p++) { - map->sampler_to_descriptor[sampler].set = set; - map->sampler_to_descriptor[sampler].binding = b; - map->sampler_to_descriptor[sampler].index = i; - map->sampler_to_descriptor[sampler].plane = p; - sampler++; + map->sampler_to_descriptor[map->sampler_count++] = + (struct anv_pipeline_binding) { + .set = set, + .binding = b, + .index = i, + .plane = p, + }; } } } - - if (binding->stage[stage].image_index >= 0) { - state.set[set].image_offsets[b] = image; - image += binding->array_size; - } } } + if (have_dynamic_buffers) { + state.dynamic_offset_uniform_start = shader->num_uniforms; + uint32_t *param = brw_stage_prog_data_add_params(prog_data, + MAX_DYNAMIC_BUFFERS); + for (unsigned i = 0; i < MAX_DYNAMIC_BUFFERS; i++) + param[i] = ANV_PARAM_DYN_OFFSET(i); + shader->num_uniforms += MAX_DYNAMIC_BUFFERS * 4; + assert(shader->num_uniforms == prog_data->nr_params * 4); + } + nir_foreach_variable(var, &shader->uniforms) { - if (!glsl_type_is_image(var->interface_type)) + const struct glsl_type *glsl_type = glsl_without_array(var->type); + + if (!glsl_type_is_image(glsl_type)) continue; - enum glsl_sampler_dim dim = glsl_get_sampler_dim(var->interface_type); + enum glsl_sampler_dim dim = glsl_get_sampler_dim(glsl_type); const uint32_t set = var->data.descriptor_set; const uint32_t binding = var->data.binding; const uint32_t array_size = layout->set[set].layout->binding[binding].array_size; - if (!BITSET_TEST(state.set[set].used, binding)) + if (state.set[set].use_count[binding] == 0) + continue; + + if (state.set[set].surface_offsets[binding] >= MAX_BINDING_TABLE_SIZE) continue; struct anv_pipeline_binding *pipe_binding = @@ -451,7 +1295,8 @@ anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline, dim == GLSL_SAMPLER_DIM_SUBPASS_MS) pipe_binding[i].input_attachment_index = var->data.index + i; - pipe_binding[i].write_only = var->data.image.write_only; + pipe_binding[i].write_only = + (var->data.image.access & ACCESS_NON_READABLE) != 0; } } @@ -459,6 +1304,38 @@ anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline, if (!function->impl) continue; + /* Before we do the normal lowering, we look for any SSBO operations + * that we can lower to the BTI model and lower them up-front. The BTI + * model can perform better than the A64 model for a couple reasons: + * + * 1. 48-bit address calculations are potentially expensive and using + * the BTI model lets us simply compute 32-bit offsets and the + * hardware adds the 64-bit surface base address. + * + * 2. The BTI messages, because they use surface states, do bounds + * checking for us. With the A64 model, we have to do our own + * bounds checking and this means wider pointers and extra + * calculations and branching in the shader. + * + * The solution to both of these is to convert things to the BTI model + * opportunistically. The reason why we need to do this as a pre-pass + * is for two reasons: + * + * 1. The BTI model requires nir_address_format_32bit_index_offset + * pointers which are not the same type as the pointers needed for + * the A64 model. Because all our derefs are set up for the A64 + * model (in case we have variable pointers), we have to crawl all + * the way back to the vulkan_resource_index intrinsic and build a + * completely fresh index+offset calculation. + * + * 2. 
Because the variable-pointers-capable lowering that we do as part + * of apply_pipeline_layout_block is destructive (It really has to + * be to handle variable pointers properly), we've lost the deref + * information by the time we get to the load/store/atomic + * intrinsics in that pass. + */ + lower_direct_buffer_access(function->impl, &state); + nir_builder_init(&state.builder, function->impl); nir_foreach_block(block, function->impl) apply_pipeline_layout_block(block, &state); @@ -466,51 +1343,5 @@ anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline, nir_metadata_dominance); } - if (map->image_count > 0) { - assert(map->image_count <= MAX_IMAGES); - nir_foreach_variable(var, &shader->uniforms) { - if (glsl_type_is_image(var->type) || - (glsl_type_is_array(var->type) && - glsl_type_is_image(glsl_get_array_element(var->type)))) { - /* Images are represented as uniform push constants and the actual - * information required for reading/writing to/from the image is - * storred in the uniform. - */ - unsigned set = var->data.descriptor_set; - unsigned binding = var->data.binding; - unsigned image_index = state.set[set].image_offsets[binding]; - - var->data.driver_location = shader->num_uniforms + - image_index * BRW_IMAGE_PARAM_SIZE * 4; - } - } - - uint32_t *param = brw_stage_prog_data_add_params(prog_data, - map->image_count * - BRW_IMAGE_PARAM_SIZE); - struct anv_push_constants *null_data = NULL; - const struct brw_image_param *image_param = null_data->images; - for (uint32_t i = 0; i < map->image_count; i++) { - setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET, - (uintptr_t)&image_param->surface_idx, 1); - setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_OFFSET_OFFSET, - (uintptr_t)image_param->offset, 2); - setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_SIZE_OFFSET, - (uintptr_t)image_param->size, 3); - setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_STRIDE_OFFSET, - (uintptr_t)image_param->stride, 4); - setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_TILING_OFFSET, - (uintptr_t)image_param->tiling, 3); - setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_SWIZZLING_OFFSET, - (uintptr_t)image_param->swizzling, 2); - - param += BRW_IMAGE_PARAM_SIZE; - image_param ++; - } - assert(param == prog_data->param + prog_data->nr_params); - - shader->num_uniforms += map->image_count * BRW_IMAGE_PARAM_SIZE * 4; - } - ralloc_free(mem_ctx); }
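
The A64 SSBO path above packs three values into a single 32-bit "descriptor offset" in a 16.8.8 layout: the top 16 bits are the offset of the binding within its descriptor set, the middle 8 bits are the binding-table index of that set's descriptor buffer, and the low 8 bits index the dynamic-offset table (0xff meaning "no dynamic offset"). Below is a minimal C sketch of that packing with hypothetical helper names (pack_ssbo_desc_offset / unpack_ssbo_desc_offset) that do not exist in the driver; the pass itself builds the constant inline in lower_res_index_intrinsic() and unpacks it with nir_extract_u8/nir_extract_u16.

   #include <assert.h>
   #include <stdint.h>

   /* Hypothetical helpers, for illustration only. */
   static inline uint32_t
   pack_ssbo_desc_offset(uint32_t set_offset,     /* offset into the descriptor set        */
                         uint8_t desc_buffer_bti, /* binding table index of the descriptor
                                                   * buffer for this set                   */
                         uint8_t dyn_offset_idx)  /* 0xff == no dynamic offset             */
   {
      assert(set_offset < (1u << 16));
      return set_offset << 16 | (uint32_t)desc_buffer_bti << 8 | dyn_offset_idx;
   }

   static inline void
   unpack_ssbo_desc_offset(uint32_t packed, uint32_t *set_offset,
                           uint8_t *desc_buffer_bti, uint8_t *dyn_offset_idx)
   {
      *set_offset      = packed >> 16;          /* nir_extract_u16(desc_offset, 1) */
      *desc_buffer_bti = (packed >> 8) & 0xff;  /* nir_extract_u8(desc_offset, 1)  */
      *dyn_offset_idx  = packed & 0xff;         /* nir_extract_u8(desc_offset, 0)  */
   }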
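
The binding-table allocation orders bindings by a fixed-point score, (use_count << 7) / array_size, with bit 15 forced on for descriptors that cannot be used bindlessly so they are guaranteed a binding-table slot. A small worked example with made-up use counts follows; the simplified binding_score helper is illustrative only and collapses the separate sampler/surface checks done by anv_descriptor_supports_bindless() into one flag.

   #include <stdbool.h>
   #include <stdint.h>
   #include <stdio.h>

   /* Simplified version of the score computed per used binding. */
   static uint16_t
   binding_score(uint8_t use_count, uint32_t array_size, bool supports_bindless)
   {
      uint16_t score = ((uint16_t)use_count << 7) / array_size;
      if (!supports_bindless)
         score |= 1 << 15;
      return score;
   }

   int main(void)
   {
      printf("%u\n", binding_score(12, 1, true));   /* 1536  - hot single descriptor, sorts early    */
      printf("%u\n", binding_score(3, 64, true));   /* 6     - cold large array, may be made bindless */
      printf("%u\n", binding_score(1, 256, false)); /* 32768 - cannot be bindless, always gets a slot */
      return 0;
   }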
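
On Ivy Bridge and Bay Trail the texture swizzle has to be applied in the shader: lower_gen7_tex_swizzle() loads one channel-select byte per destination component from the texture-swizzle descriptor and picks the source value with a tree of bcsels (build_def_array_select). A CPU-side model of that selection is sketched below for float results, assuming the usual hardware shader-channel-select encoding (ZERO=0, ONE=1, RED=4, GREEN=5, BLUE=6, ALPHA=7); the authoritative values are the ISL_CHANNEL_SELECT_* enums in isl.h.

   #include <stdint.h>

   /* Assumed channel-select encoding; see isl.h for the real enum. */
   enum channel_select {
      SEL_ZERO = 0, SEL_ONE = 1, SEL_RED = 4,
      SEL_GREEN = 5, SEL_BLUE = 6, SEL_ALPHA = 7,
   };

   static void
   apply_tex_swizzle(const float tex[4], uint32_t swiz, float out[4])
   {
      /* One source per possible selector, mirroring the comps[8] array the
       * pass feeds into build_def_array_select().
       */
      const float comps[8] = {
         [SEL_ZERO]  = 0.0f,
         [SEL_ONE]   = 1.0f,
         [SEL_RED]   = tex[0],
         [SEL_GREEN] = tex[1],
         [SEL_BLUE]  = tex[2],
         [SEL_ALPHA] = tex[3],
      };
      for (unsigned i = 0; i < 4; i++) {
         /* Byte i of the descriptor selects destination component i;
          * valid descriptors only contain the enum values above, so the
          * mask just keeps the model's array index in bounds.
          */
         out[i] = comps[(swiz >> (8 * i)) & 0x7];
      }
   }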