From a66b186bebf9b63897199b9b6e26d40977417f74 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 26 Feb 2019 13:42:28 +0100 Subject: [PATCH] radv: use typed buffer loads for vertex input fetches This drastically reduces the number of SGPRs because the driver now uses descriptors per vertex binding, instead of per vertex attribute format. 29077 shaders in 15096 tests Totals: SGPRS: 1354285 -> 1282109 (-5.33 %) VGPRS: 909896 -> 908800 (-0.12 %) Spilled SGPRs: 24840 -> 24811 (-0.12 %) Code Size: 49221144 -> 48986628 (-0.48 %) bytes Max Waves: 243930 -> 244229 (0.12 %) Totals from affected shaders: SGPRS: 390648 -> 318472 (-18.48 %) VGPRS: 288432 -> 287336 (-0.38 %) Spilled SGPRs: 94 -> 65 (-30.85 %) Code Size: 11548412 -> 11313896 (-2.03 %) bytes Max Waves: 86460 -> 86759 (0.35 %) This gives a really tiny boost. Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen --- src/amd/vulkan/radv_cmd_buffer.c | 21 +++++++++----- src/amd/vulkan/radv_nir_to_llvm.c | 47 +++++++++++++++++++++++++------ src/amd/vulkan/radv_pipeline.c | 37 ++---------------------- src/amd/vulkan/radv_private.h | 5 +--- 4 files changed, 57 insertions(+), 53 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index d8aceb8b082..06806ed6fce 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -1988,13 +1988,13 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, { if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) && - cmd_buffer->state.pipeline->vertex_elements.count && + cmd_buffer->state.pipeline->num_vertex_bindings && radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) { struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements; unsigned vb_offset; void *vb_ptr; uint32_t i = 0; - uint32_t count = velems->count; + uint32_t count = cmd_buffer->state.pipeline->num_vertex_bindings; uint64_t va; /* allocate some descriptor state for vertex buffers */ @@ -2005,13 +2005,15 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, for (i = 0; i < count; i++) { uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4]; uint32_t offset; - int vb = velems->binding[i]; - struct radv_buffer *buffer = cmd_buffer->vertex_bindings[vb].buffer; - uint32_t stride = cmd_buffer->state.pipeline->binding_stride[vb]; + struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer; + uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i]; + + if (!buffer) + continue; va = radv_buffer_get_va(buffer->bo); - offset = cmd_buffer->vertex_bindings[vb].offset + velems->offset[i]; + offset = cmd_buffer->vertex_bindings[i].offset; va += offset + buffer->offset; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); @@ -2019,7 +2021,12 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1; else desc[2] = buffer->size - offset; - desc[3] = velems->rsrc_word3[i]; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); } va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 20371759a97..3dd3e80f3b9 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2008,6 +2008,8 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0); + alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, ""); + if (adjustment == RADV_ALPHA_ADJUST_SSCALED) alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, ""); else @@ -2035,7 +2037,7 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, ""); } - return alpha; + return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, ""); } static unsigned @@ -2096,7 +2098,7 @@ radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, for (unsigned i = num_channels; i < 4; i++) { chan[i] = i == 3 ? one : zero; - chan[i] = ac_to_float(&ctx->ac, chan[i]); + chan[i] = ac_to_integer(&ctx->ac, chan[i]); } return ac_build_gather_values(&ctx->ac, chan, 4); @@ -2154,20 +2156,49 @@ handle_vs_input_decl(struct radv_shader_context *ctx, } else buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, ctx->abi.base_vertex, ""); - t_offset = LLVMConstInt(ctx->ac.i32, attrib_index, false); - - t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); /* Adjust the number of channels to load based on the vertex * attribute format. */ unsigned num_format_channels = get_num_channels_from_data_format(data_format); unsigned num_channels = MIN2(num_input_channels, num_format_channels); + unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index]; + unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index]; + unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index]; - input = ac_build_buffer_load_format(&ctx->ac, t_list, + if (attrib_stride != 0 && attrib_offset > attrib_stride) { + LLVMValueRef buffer_offset = + LLVMConstInt(ctx->ac.i32, + attrib_offset / attrib_stride, false); + + buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, - ctx->ac.i32_0, - num_channels, false, true); + buffer_offset, ""); + + attrib_offset = attrib_offset % attrib_stride; + } + + t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false); + t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); + + input = ac_build_tbuffer_load(&ctx->ac, t_list, buffer_index, + LLVMConstInt(ctx->ac.i32, attrib_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, + num_channels, + data_format, num_format, + false, false, true); + + if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) { + if (num_channels > 1) { + LLVMValueRef c[4]; + c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2); + c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1); + c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0); + c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3); + + input = ac_build_gather_values(&ctx->ac, c, 4); + } + } input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float); diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 60510f97e0f..7f2f96c540a 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -1244,25 +1244,6 @@ si_conv_prim_to_gs_out(enum VkPrimitiveTopology topology) } } -static unsigned si_map_swizzle(unsigned swizzle) -{ - switch (swizzle) { - case VK_SWIZZLE_Y: - return V_008F0C_SQ_SEL_Y; - case VK_SWIZZLE_Z: - return V_008F0C_SQ_SEL_Z; - case VK_SWIZZLE_W: - return V_008F0C_SQ_SEL_W; - case VK_SWIZZLE_0: - return V_008F0C_SQ_SEL_0; - case VK_SWIZZLE_1: - return V_008F0C_SQ_SEL_1; - default: /* VK_SWIZZLE_X */ - return V_008F0C_SQ_SEL_X; - } -} - - static unsigned radv_dynamic_state_mask(VkDynamicState state) { switch(state) { @@ -3557,24 +3538,10 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline, &vi_info->pVertexAttributeDescriptions[i]; unsigned loc = desc->location; const struct vk_format_description *format_desc; - int first_non_void; - uint32_t num_format, data_format; - format_desc = vk_format_description(desc->format); - first_non_void = vk_format_get_first_non_void_channel(desc->format); - num_format = radv_translate_buffer_numformat(format_desc, first_non_void); - data_format = radv_translate_buffer_dataformat(format_desc, first_non_void); + format_desc = vk_format_description(desc->format); - velems->rsrc_word3[loc] = S_008F0C_DST_SEL_X(si_map_swizzle(format_desc->swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(format_desc->swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(format_desc->swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(format_desc->swizzle[3])) | - S_008F0C_NUM_FORMAT(num_format) | - S_008F0C_DATA_FORMAT(data_format); velems->format_size[loc] = format_desc->block.bits / 8; - velems->offset[loc] = desc->offset; - velems->binding[loc] = desc->binding; - velems->count = MAX2(velems->count, loc + 1); } for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { @@ -3582,6 +3549,8 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline, &vi_info->pVertexBindingDescriptions[i]; pipeline->binding_stride[desc->binding] = desc->stride; + pipeline->num_vertex_bindings = + MAX2(pipeline->num_vertex_bindings, desc->binding + 1); } } diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index c73bdaca0a3..39fa6110fde 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -1342,11 +1342,7 @@ struct radv_prim_vertex_count { }; struct radv_vertex_elements_info { - uint32_t rsrc_word3[MAX_VERTEX_ATTRIBS]; uint32_t format_size[MAX_VERTEX_ATTRIBS]; - uint32_t binding[MAX_VERTEX_ATTRIBS]; - uint32_t offset[MAX_VERTEX_ATTRIBS]; - uint32_t count; }; struct radv_ia_multi_vgt_param_helpers { @@ -1378,6 +1374,7 @@ struct radv_pipeline { struct radv_vertex_elements_info vertex_elements; uint32_t binding_stride[MAX_VBS]; + uint8_t num_vertex_bindings; uint32_t user_data_0[MESA_SHADER_STAGES]; union { -- 2.30.2