From a07b55443b59572d022924b65c2ab67cd91250e4 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Mon, 23 Mar 2020 17:23:32 +0100 Subject: [PATCH] tu: Emit CP_LOAD_STATE6 for descriptors This restores the pre-loading of descriptor state, using the new SS6_BINDLESS method that allows us to pre-load bindless resources. Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.c | 40 ++++ src/freedreno/vulkan/tu_descriptor_set.c | 1 + src/freedreno/vulkan/tu_descriptor_set.h | 3 + src/freedreno/vulkan/tu_pipeline.c | 260 ++++++++++++++++++++++- src/freedreno/vulkan/tu_private.h | 5 + 5 files changed, 304 insertions(+), 5 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 1c9c43f35e9..a15797e1042 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -2527,6 +2527,7 @@ enum tu_draw_state_group_id TU_DRAW_STATE_FS_CONST, TU_DRAW_STATE_DESC_SETS, TU_DRAW_STATE_DESC_SETS_GMEM, + TU_DRAW_STATE_DESC_SETS_LOAD, TU_DRAW_STATE_VS_PARAMS, TU_DRAW_STATE_COUNT, @@ -3089,6 +3090,42 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd, .ib = desc_sets_gmem, }; } + + /* We need to reload the descriptors every time the descriptor sets + * change. However, the commands we send only depend on the pipeline + * because the whole point is to cache descriptors which are used by the + * pipeline. There's a problem here, in that the firmware has an + * "optimization" which skips executing groups that are set to the same + * value as the last draw. This means that if the descriptor sets change + * but not the pipeline, we'd try to re-execute the same buffer which + * the firmware would ignore and we wouldn't pre-load the new + * descriptors. The blob seems to re-emit the LOAD_STATE group whenever + * the descriptor sets change, which we emulate here by copying the + * pre-prepared buffer. + */ + const struct tu_cs_entry *load_entry = &pipeline->load_state.state_ib; + if (load_entry->size > 0) { + struct tu_cs load_cs; + result = tu_cs_begin_sub_stream(&cmd->sub_cs, load_entry->size, &load_cs); + if (result != VK_SUCCESS) + return result; + tu_cs_emit_array(&load_cs, + (uint32_t *)((char *)load_entry->bo->map + load_entry->offset), + load_entry->size / 4); + struct tu_cs_entry load_copy = tu_cs_end_sub_stream(&cmd->sub_cs, &load_cs); + + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_DESC_SETS_LOAD, + /* The blob seems to not enable this for binning, even when + * resources would actually be used in the binning shader. + * Presumably the overhead of prefetching the resources isn't + * worth it. 
+ */ + .enable_mask = ENABLE_DRAW, + .ib = load_copy, + }; + } } struct tu_cs_entry vs_params; @@ -3520,6 +3557,9 @@ tu_dispatch(struct tu_cmd_buffer *cmd, if (ib.size) tu_cs_emit_ib(cs, &ib); + if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) + tu_cs_emit_ib(cs, &pipeline->load_state.state_ib); + cmd->state.dirty &= ~(TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE); diff --git a/src/freedreno/vulkan/tu_descriptor_set.c b/src/freedreno/vulkan/tu_descriptor_set.c index de1683c6bd7..f6327b85c50 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.c +++ b/src/freedreno/vulkan/tu_descriptor_set.c @@ -173,6 +173,7 @@ tu_CreateDescriptorSetLayout( set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count; set_layout->binding[b].input_attachment_offset = input_attachment_count; set_layout->binding[b].size = descriptor_size(binding->descriptorType); + set_layout->binding[b].shader_stages = binding->stageFlags; if (variable_flags && binding->binding < variable_flags->bindingCount && (variable_flags->pBindingFlags[binding->binding] & diff --git a/src/freedreno/vulkan/tu_descriptor_set.h b/src/freedreno/vulkan/tu_descriptor_set.h index 3a24822eb67..4c1bd502e30 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.h +++ b/src/freedreno/vulkan/tu_descriptor_set.h @@ -60,6 +60,9 @@ struct tu_descriptor_set_binding_layout /* Offset in the tu_descriptor_set_layout of the immutable samplers, or 0 * if there are no immutable samplers. */ uint32_t immutable_samplers_offset; + + /* Shader stages that use this binding */ + uint32_t shader_stages; }; struct tu_descriptor_set_layout diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index dc2a568a59c..38765025641 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -40,6 +40,247 @@ #include "tu_cs.h" +/* Emit IB that preloads the descriptors that the shader uses */ + +static inline uint32_t +tu6_vkstage2opcode(VkShaderStageFlags stage) +{ + switch (stage) { + case VK_SHADER_STAGE_VERTEX_BIT: + case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT: + case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT: + case VK_SHADER_STAGE_GEOMETRY_BIT: + return CP_LOAD_STATE6_GEOM; + case VK_SHADER_STAGE_FRAGMENT_BIT: + case VK_SHADER_STAGE_COMPUTE_BIT: + return CP_LOAD_STATE6_FRAG; + default: + unreachable("bad shader type"); + } +} + +static enum a6xx_state_block +tu6_tex_stage2sb(VkShaderStageFlags stage) +{ + switch (stage) { + case VK_SHADER_STAGE_VERTEX_BIT: + return SB6_VS_TEX; + case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT: + return SB6_HS_TEX; + case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT: + return SB6_DS_TEX; + case VK_SHADER_STAGE_GEOMETRY_BIT: + return SB6_GS_TEX; + case VK_SHADER_STAGE_FRAGMENT_BIT: + return SB6_FS_TEX; + case VK_SHADER_STAGE_COMPUTE_BIT: + return SB6_CS_TEX; + default: + unreachable("bad shader stage"); + } +} + +static enum a6xx_state_block +tu6_ubo_stage2sb(VkShaderStageFlags stage) +{ + switch (stage) { + case VK_SHADER_STAGE_VERTEX_BIT: + return SB6_VS_SHADER; + case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT: + return SB6_HS_SHADER; + case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT: + return SB6_DS_SHADER; + case VK_SHADER_STAGE_GEOMETRY_BIT: + return SB6_GS_SHADER; + case VK_SHADER_STAGE_FRAGMENT_BIT: + return SB6_FS_SHADER; + case VK_SHADER_STAGE_COMPUTE_BIT: + return SB6_CS_SHADER; + default: + unreachable("bad shader stage"); + } +} + +static void +emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st, + enum 
a6xx_state_block sb, unsigned base, unsigned offset, + unsigned count) +{ + /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not + * clear if emitting more packets will even help anything. Presumably the + * descriptor cache is relatively small, and these packets stop doing + * anything when there are too many descriptors. + */ + tu_cs_emit_pkt7(cs, opcode, 3); + tu_cs_emit(cs, + CP_LOAD_STATE6_0_STATE_TYPE(st) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) | + CP_LOAD_STATE6_0_STATE_BLOCK(sb) | + CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1))); + tu_cs_emit_qw(cs, offset | (base << 28)); +} + +static unsigned +tu6_load_state_size(struct tu_pipeline_layout *layout, bool compute) +{ + const unsigned load_state_size = 4; + unsigned size = 0; + for (unsigned i = 0; i < layout->num_sets; i++) { + struct tu_descriptor_set_layout *set_layout = layout->set[i].layout; + for (unsigned j = 0; j < set_layout->binding_count; j++) { + struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j]; + unsigned count = 0; + /* Note: some users, like amber for example, pass in + * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so + * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly. + */ + VkShaderStageFlags stages = compute ? + binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT : + binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS; + unsigned stage_count = util_bitcount(stages); + switch (binding->type) { + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + /* IBO-backed resources only need one packet for all graphics stages */ + if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) + count += 1; + if (stages & VK_SHADER_STAGE_COMPUTE_BIT) + count += 1; + break; + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + /* Textures and UBOs need a packet for each stage */ + count = stage_count; + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + /* Because of how we pack combined images and samplers, we + * currently can't use one packet for the whole array. + */ + count = stage_count * binding->array_size * 2; + break; + default: + unreachable("bad descriptor type"); + } + size += count * load_state_size; + } + } + return size; +} + +static void +tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute) +{ + unsigned size = tu6_load_state_size(pipeline->layout, compute); + if (size == 0) + return; + + struct tu_cs cs; + tu_cs_begin_sub_stream(&pipeline->cs, size, &cs); + + struct tu_pipeline_layout *layout = pipeline->layout; + for (unsigned i = 0; i < layout->num_sets; i++) { + struct tu_descriptor_set_layout *set_layout = layout->set[i].layout; + for (unsigned j = 0; j < set_layout->binding_count; j++) { + struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j]; + unsigned base = i; + unsigned offset = binding->offset / 4; + /* Note: some users, like amber for example, pass in + * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so + * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly. + */ + VkShaderStageFlags stages = compute ?
+ binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT : + binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS; + unsigned count = binding->array_size; + if (count == 0 || stages == 0) + continue; + switch (binding->type) { + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + base = MAX_SETS; + offset = (layout->input_attachment_count + + layout->set[i].dynamic_offset_start + + binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS; + /* fallthrough */ + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + /* IBO-backed resources only need one packet for all graphics stages */ + if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) { + emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO, + base, offset, count); + } + if (stages & VK_SHADER_STAGE_COMPUTE_BIT) { + emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER, + base, offset, count); + } + break; + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + base = MAX_SETS; + offset = (layout->set[i].input_attachment_start + + binding->input_attachment_offset) * A6XX_TEX_CONST_DWORDS; + /* fallthrough */ + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: { + unsigned stage_log2; + for_each_bit(stage_log2, stages) { + VkShaderStageFlags stage = 1 << stage_log2; + emit_load_state(&cs, tu6_vkstage2opcode(stage), + binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ? + ST6_SHADER : ST6_CONSTANTS, + tu6_tex_stage2sb(stage), base, offset, count); + } + break; + } + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + base = MAX_SETS; + offset = (layout->input_attachment_count + + layout->set[i].dynamic_offset_start + + binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS; + /* fallthrough */ + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: { + unsigned stage_log2; + for_each_bit(stage_log2, stages) { + VkShaderStageFlags stage = 1 << stage_log2; + emit_load_state(&cs, tu6_vkstage2opcode(stage), ST6_UBO, + tu6_ubo_stage2sb(stage), base, offset, count); + } + break; + } + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: { + unsigned stage_log2; + for_each_bit(stage_log2, stages) { + VkShaderStageFlags stage = 1 << stage_log2; + /* TODO: We could emit fewer CP_LOAD_STATE6 packets if we used + * struct-of-arrays instead of array-of-structs.
+ */ + for (unsigned i = 0; i < count; i++) { + unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS; + unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS; + emit_load_state(&cs, tu6_vkstage2opcode(stage), + ST6_CONSTANTS, tu6_tex_stage2sb(stage), + base, tex_offset, 1); + emit_load_state(&cs, tu6_vkstage2opcode(stage), + ST6_SHADER, tu6_tex_stage2sb(stage), + base, sam_offset, 1); + } + } + break; + } + default: + unreachable("bad descriptor type"); + } + } + } + + pipeline->load_state.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &cs); +} + struct tu_pipeline_builder { struct tu_device *device; @@ -1774,6 +2015,8 @@ tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4]) static VkResult tu_pipeline_create(struct tu_device *dev, + struct tu_pipeline_layout *layout, + bool compute, const VkAllocationCallbacks *pAllocator, struct tu_pipeline **out_pipeline) { @@ -1785,8 +2028,12 @@ tu_pipeline_create(struct tu_device *dev, tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, 2048); - /* reserve the space now such that tu_cs_begin_sub_stream never fails */ - VkResult result = tu_cs_reserve_space(&pipeline->cs, 2048); + /* Reserve the space now such that tu_cs_begin_sub_stream never fails. Note + * that LOAD_STATE can potentially take up a large amount of space so we + * calculate its size explicitly. + */ + unsigned load_state_size = tu6_load_state_size(layout, compute); + VkResult result = tu_cs_reserve_space(&pipeline->cs, 2048 + load_state_size); if (result != VK_SUCCESS) { vk_free2(&dev->alloc, pAllocator, pipeline); return result; @@ -2182,8 +2429,8 @@ static VkResult tu_pipeline_builder_build(struct tu_pipeline_builder *builder, struct tu_pipeline **pipeline) { - VkResult result = tu_pipeline_create(builder->device, builder->alloc, - pipeline); + VkResult result = tu_pipeline_create(builder->device, builder->layout, + false, builder->alloc, pipeline); if (result != VK_SUCCESS) return result; @@ -2209,6 +2456,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, tu_pipeline_builder_parse_rasterization(builder, *pipeline); tu_pipeline_builder_parse_depth_stencil(builder, *pipeline); tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline); + tu6_emit_load_state(*pipeline, false); /* we should have reserved enough space upfront such that the CS never * grows @@ -2381,7 +2629,7 @@ tu_compute_pipeline_create(VkDevice device, *pPipeline = VK_NULL_HANDLE; - result = tu_pipeline_create(dev, pAllocator, &pipeline); + result = tu_pipeline_create(dev, layout, true, pAllocator, &pipeline); if (result != VK_SUCCESS) return result; @@ -2418,6 +2666,8 @@ tu_compute_pipeline_create(VkDevice device, tu6_emit_compute_program(&prog_cs, shader, &pipeline->program.binary_bo); pipeline->program.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs); + tu6_emit_load_state(pipeline, true); + *pPipeline = tu_pipeline_to_handle(pipeline); return VK_SUCCESS; diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 87e4df85ff5..9fa8763179d 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -1210,6 +1210,11 @@ struct tu_pipeline unsigned input_attachment_idx[MAX_RTS]; } program; + struct + { + struct tu_cs_entry state_ib; + } load_state; + struct { uint8_t bindings[MAX_VERTEX_ATTRIBS]; -- 2.30.2
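A note on the SS6_BINDLESS addressing used by emit_load_state(): with STATE_SRC set to SS6_BINDLESS, the 64-bit value passed to tu_cs_emit_qw() is not a GPU address. Its low bits carry the dword offset of the first descriptor inside the set (binding->offset / 4) and bits 28 and up select the bindless base, i.e. the descriptor set, matching the offset | (base << 28) expression above, with NUM_UNIT clamped to 1023. The standalone sketch below only illustrates that encoding, assuming every descriptor slot is exactly A6XX_TEX_CONST_DWORDS dwords; the helper name and sample values are hypothetical and not part of the driver.

#include <stdint.h>
#include <stdio.h>

#define A6XX_TEX_CONST_DWORDS 16   /* dwords per plain descriptor slot */

/* Hypothetical helper: build the "bindless pointer" for descriptor number
 * desc_index of descriptor set `set`, the same value emit_load_state()
 * emits with tu_cs_emit_qw(cs, offset | (base << 28)). */
static uint64_t
bindless_ptr(unsigned set, unsigned desc_index)
{
   uint64_t offset = (uint64_t)desc_index * A6XX_TEX_CONST_DWORDS;
   return offset | ((uint64_t)set << 28);
}

int main(void)
{
   /* e.g. descriptor 3 of set 1 -> dword offset 48 in bindless base 1 */
   printf("0x%016llx\n", (unsigned long long)bindless_ptr(1, 3));
   return 0;
}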
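tu6_load_state_size() sizes the sub-stream by counting CP_LOAD_STATE6 packets at 4 dwords each: IBO-backed descriptors need at most one packet covering every graphics stage plus one for compute, textures and UBOs need one packet per stage, and combined image/samplers need two packets (texture + sampler) per array element per stage. Below is a standalone sketch of that bookkeeping for a made-up, graphics-only layout; the struct and binding list are hypothetical stand-ins, not driver types.

#include <stdio.h>

enum kind { KIND_UBO, KIND_STORAGE, KIND_COMBINED_IMAGE_SAMPLER };

struct binding {
   enum kind kind;
   unsigned array_size;
   unsigned graphics_stage_count;   /* stages left after masking with
                                       VK_SHADER_STAGE_ALL_GRAPHICS */
};

static unsigned
packets_for(const struct binding *b)
{
   switch (b->kind) {
   case KIND_STORAGE:
      /* one packet covers all graphics stages (compute ignored here) */
      return b->graphics_stage_count ? 1 : 0;
   case KIND_UBO:
      return b->graphics_stage_count;
   case KIND_COMBINED_IMAGE_SAMPLER:
      /* separate texture and sampler packet per element, per stage */
      return b->graphics_stage_count * b->array_size * 2;
   }
   return 0;
}

int main(void)
{
   const struct binding bindings[] = {
      { KIND_UBO, 1, 2 },                     /* VS+FS uniform buffer */
      { KIND_STORAGE, 1, 1 },                 /* FS storage buffer    */
      { KIND_COMBINED_IMAGE_SAMPLER, 4, 1 },  /* FS sampler array [4] */
   };
   unsigned dwords = 0;
   for (unsigned i = 0; i < sizeof(bindings) / sizeof(bindings[0]); i++)
      dwords += packets_for(&bindings[i]) * 4;   /* 4 dwords per packet */
   printf("load-state IB size: %u dwords\n", dwords);   /* (2+1+8)*4 = 44 */
   return 0;
}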
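Dynamic uniform/storage buffers and input attachments are pre-loaded not from the application-visible sets but from the extra internal set selected with base = MAX_SETS, which holds all input attachments first and then each set's dynamic descriptors; the offset expressions in tu6_emit_load_state() index into that combined array. A small worked example of the dynamic-descriptor arithmetic, using made-up counts rather than values from a real layout:

#include <stdio.h>

#define A6XX_TEX_CONST_DWORDS 16

int main(void)
{
   /* assumed layout: 2 input attachments across the pipeline layout, this
    * set's dynamic descriptors start at index 1 of the dynamic region, and
    * the binding is the second dynamic descriptor within its set */
   unsigned input_attachment_count = 2;
   unsigned dynamic_offset_start   = 1;
   unsigned dynamic_offset_offset  = 1;

   unsigned offset = (input_attachment_count + dynamic_offset_start +
                      dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
   printf("dword offset inside the MAX_SETS set: %u\n", offset);   /* 64 */
   return 0;
}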