radv: use typed buffer loads for vertex input fetches
author Samuel Pitoiset <samuel.pitoiset@gmail.com>
Tue, 26 Feb 2019 12:42:28 +0000 (13:42 +0100)
committer Samuel Pitoiset <samuel.pitoiset@gmail.com>
Wed, 13 Mar 2019 12:31:11 +0000 (13:31 +0100)
This drastically reduces SGPR usage because the driver now uses
one descriptor per vertex binding, instead of one format-specific
descriptor per vertex attribute.

29077 shaders in 15096 tests
Totals:
SGPRS: 1354285 -> 1282109 (-5.33 %)
VGPRS: 909896 -> 908800 (-0.12 %)
Spilled SGPRs: 24840 -> 24811 (-0.12 %)
Code Size: 49221144 -> 48986628 (-0.48 %) bytes
Max Waves: 243930 -> 244229 (0.12 %)

Totals from affected shaders:
SGPRS: 390648 -> 318472 (-18.48 %)
VGPRS: 288432 -> 287336 (-0.38 %)
Spilled SGPRs: 94 -> 65 (-30.85 %)
Code Size: 11548412 -> 11313896 (-2.03 %) bytes
Max Waves: 86460 -> 86759 (0.35 %)

This gives a really tiny boost.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_nir_to_llvm.c
src/amd/vulkan/radv_pipeline.c
src/amd/vulkan/radv_private.h

index d8aceb8b082404e081959730c7f09b78cec67cac..06806ed6fcef48baeb02b2b6b28740305d7d6d93 100644 (file)
@@ -1988,13 +1988,13 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
 {
        if ((pipeline_is_dirty ||
            (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
-           cmd_buffer->state.pipeline->vertex_elements.count &&
+           cmd_buffer->state.pipeline->num_vertex_bindings &&
            radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) {
                struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
                unsigned vb_offset;
                void *vb_ptr;
                uint32_t i = 0;
-               uint32_t count = velems->count;
+               uint32_t count = cmd_buffer->state.pipeline->num_vertex_bindings;
                uint64_t va;
 
                /* allocate some descriptor state for vertex buffers */
@@ -2005,13 +2005,15 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
                for (i = 0; i < count; i++) {
                        uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
                        uint32_t offset;
-                       int vb = velems->binding[i];
-                       struct radv_buffer *buffer = cmd_buffer->vertex_bindings[vb].buffer;
-                       uint32_t stride = cmd_buffer->state.pipeline->binding_stride[vb];
+                       struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
+                       uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i];
+
+                       if (!buffer)
+                               continue;
 
                        va = radv_buffer_get_va(buffer->bo);
 
-                       offset = cmd_buffer->vertex_bindings[vb].offset + velems->offset[i];
+                       offset = cmd_buffer->vertex_bindings[i].offset;
                        va += offset + buffer->offset;
                        desc[0] = va;
                        desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
@@ -2019,7 +2021,12 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
                                desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
                        else
                                desc[2] = buffer->size - offset;
-                       desc[3] = velems->rsrc_word3[i];
+                       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                                 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                                 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+                                 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+                                 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+                                 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
                }
 
                va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
index 20371759a97d2b0989f30df4653b5f4bdcf5bbb3..3dd3e80f3b96760e4ba14f689e5defae81a9f552 100644 (file)
@@ -2008,6 +2008,8 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
 
        LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
 
+       alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");
+
        if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
                alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
        else
@@ -2035,7 +2037,7 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
                alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
        }
 
-       return alpha;
+       return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
 }
 
 static unsigned
@@ -2096,7 +2098,7 @@ radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
 
        for (unsigned i = num_channels; i < 4; i++) {
                chan[i] = i == 3 ? one : zero;
-               chan[i] = ac_to_float(&ctx->ac, chan[i]);
+               chan[i] = ac_to_integer(&ctx->ac, chan[i]);
        }
 
        return ac_build_gather_values(&ctx->ac, chan, 4);
@@ -2154,20 +2156,49 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
                } else
                        buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
                                                    ctx->abi.base_vertex, "");
-               t_offset = LLVMConstInt(ctx->ac.i32, attrib_index, false);
-
-               t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
 
                /* Adjust the number of channels to load based on the vertex
                 * attribute format.
                 */
                unsigned num_format_channels = get_num_channels_from_data_format(data_format);
                unsigned num_channels = MIN2(num_input_channels, num_format_channels);
+               unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index];
+               unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index];
+               unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index];
 
-               input = ac_build_buffer_load_format(&ctx->ac, t_list,
+               if (attrib_stride != 0 && attrib_offset > attrib_stride) {
+                       LLVMValueRef buffer_offset =
+                               LLVMConstInt(ctx->ac.i32,
+                                            attrib_offset / attrib_stride, false);
+
+                       buffer_index = LLVMBuildAdd(ctx->ac.builder,
                                                    buffer_index,
-                                                   ctx->ac.i32_0,
-                                                   num_channels, false, true);
+                                                   buffer_offset, "");
+
+                       attrib_offset = attrib_offset % attrib_stride;
+               }
+
+               t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false);
+               t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
+
+               input = ac_build_tbuffer_load(&ctx->ac, t_list, buffer_index,
+                                             LLVMConstInt(ctx->ac.i32, attrib_offset, false),
+                                             ctx->ac.i32_0, ctx->ac.i32_0,
+                                             num_channels,
+                                             data_format, num_format,
+                                             false, false, true);
+
+               if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) {
+                       if (num_channels > 1) {
+                               LLVMValueRef c[4];
+                               c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);
+                               c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);
+                               c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);
+                               c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);
+
+                               input = ac_build_gather_values(&ctx->ac, c, 4);
+                       }
+               }
 
                input = radv_fixup_vertex_input_fetches(ctx, input, num_channels,
                                                        is_float);
index 60510f97e0f9bb62ae1448d8d14105c86881ba23..7f2f96c540afdf43f21e44c54e090c5c4dc1d7b4 100644 (file)
@@ -1244,25 +1244,6 @@ si_conv_prim_to_gs_out(enum VkPrimitiveTopology topology)
        }
 }
 
-static unsigned si_map_swizzle(unsigned swizzle)
-{
-       switch (swizzle) {
-       case VK_SWIZZLE_Y:
-               return V_008F0C_SQ_SEL_Y;
-       case VK_SWIZZLE_Z:
-               return V_008F0C_SQ_SEL_Z;
-       case VK_SWIZZLE_W:
-               return V_008F0C_SQ_SEL_W;
-       case VK_SWIZZLE_0:
-               return V_008F0C_SQ_SEL_0;
-       case VK_SWIZZLE_1:
-               return V_008F0C_SQ_SEL_1;
-       default: /* VK_SWIZZLE_X */
-               return V_008F0C_SQ_SEL_X;
-       }
-}
-
-
 static unsigned radv_dynamic_state_mask(VkDynamicState state)
 {
        switch(state) {
@@ -3557,24 +3538,10 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline,
                        &vi_info->pVertexAttributeDescriptions[i];
                unsigned loc = desc->location;
                const struct vk_format_description *format_desc;
-               int first_non_void;
-               uint32_t num_format, data_format;
-               format_desc = vk_format_description(desc->format);
-               first_non_void = vk_format_get_first_non_void_channel(desc->format);
 
-               num_format = radv_translate_buffer_numformat(format_desc, first_non_void);
-               data_format = radv_translate_buffer_dataformat(format_desc, first_non_void);
+               format_desc = vk_format_description(desc->format);
 
-               velems->rsrc_word3[loc] = S_008F0C_DST_SEL_X(si_map_swizzle(format_desc->swizzle[0])) |
-                       S_008F0C_DST_SEL_Y(si_map_swizzle(format_desc->swizzle[1])) |
-                       S_008F0C_DST_SEL_Z(si_map_swizzle(format_desc->swizzle[2])) |
-                       S_008F0C_DST_SEL_W(si_map_swizzle(format_desc->swizzle[3])) |
-                       S_008F0C_NUM_FORMAT(num_format) |
-                       S_008F0C_DATA_FORMAT(data_format);
                velems->format_size[loc] = format_desc->block.bits / 8;
-               velems->offset[loc] = desc->offset;
-               velems->binding[loc] = desc->binding;
-               velems->count = MAX2(velems->count, loc + 1);
        }
 
        for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
@@ -3582,6 +3549,8 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline,
                        &vi_info->pVertexBindingDescriptions[i];
 
                pipeline->binding_stride[desc->binding] = desc->stride;
+               pipeline->num_vertex_bindings =
+                       MAX2(pipeline->num_vertex_bindings, desc->binding + 1);
        }
 }
 
index c73bdaca0a3b4b117f01d3c222f851f9ae575ba4..39fa6110fde6c397c28e4f0af93e52e67f64f45c 100644 (file)
@@ -1342,11 +1342,7 @@ struct radv_prim_vertex_count {
 };
 
 struct radv_vertex_elements_info {
-       uint32_t rsrc_word3[MAX_VERTEX_ATTRIBS];
        uint32_t format_size[MAX_VERTEX_ATTRIBS];
-       uint32_t binding[MAX_VERTEX_ATTRIBS];
-       uint32_t offset[MAX_VERTEX_ATTRIBS];
-       uint32_t count;
 };
 
 struct radv_ia_multi_vgt_param_helpers {
@@ -1378,6 +1374,7 @@ struct radv_pipeline {
        struct radv_vertex_elements_info             vertex_elements;
 
        uint32_t                                     binding_stride[MAX_VBS];
+       uint8_t                                      num_vertex_bindings;
 
        uint32_t user_data_0[MESA_SHADER_STAGES];
        union {