radeonsi: put up to 5 VBO descriptors into user SGPRs
author: Marek Olšák <marek.olsak@amd.com>
Tue, 7 Jan 2020 23:23:53 +0000 (18:23 -0500)
committer: Marek Olšák <marek.olsak@amd.com>
Mon, 13 Jan 2020 20:57:07 +0000 (15:57 -0500)
gfx6-8: 1 VBO descriptor in user SGPRs
gfx9-10: 5 VBO descriptors in user SGPRs

We no longer pull up to 5 VBO descriptors from GTT when SDMA is disabled.

Totals from affected shaders:
SGPRS: 1110528 -> 1170528 (5.40 %)
VGPRS: 952896 -> 951936 (-0.10 %)
Spilled SGPRs: 83 -> 61 (-26.51 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 23766296 -> 22843920 (-3.88 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 179344 -> 179344 (0.00 %)
Wait states: 0 -> 0 (0.00 %)

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
src/gallium/drivers/radeonsi/si_blit.c
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_shader_internal.h
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state_draw.c
src/gallium/drivers/radeonsi/si_state_shaders.c

index 80dedf61e0abcdf77a14f064d9106dc4c123e439..70ad55d8a41b6e470bea641ad2848ce05a4f0751 100644 (file)
@@ -99,6 +99,7 @@ void si_blitter_end(struct si_context *sctx)
         * non-global VS user SGPRs. */
        sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX);
        sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+       sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0;
        si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 }
 
index 642a22ccfa603e5d10d02c76c9f90b5ab595f048..3c43911a2118fd73e0b1717f895dbc3e7ce2a851 100644 (file)
@@ -1103,36 +1103,48 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 
        struct si_vertex_elements *velems = sctx->vertex_elements;
        unsigned alloc_size = velems->vb_desc_list_alloc_size;
-       unsigned first_vb_use_mask = velems->first_vb_use_mask;
 
-       /* Vertex buffer descriptors are the only ones which are uploaded
-        * directly through a staging buffer and don't go through
-        * the fine-grained upload path.
-        */
-       u_upload_alloc(sctx->b.const_uploader, 0,
-                      alloc_size,
-                      si_optimal_tcc_alignment(sctx, alloc_size),
-                      &sctx->vb_descriptors_offset,
-                      (struct pipe_resource**)&sctx->vb_descriptors_buffer,
-                      (void**)&ptr);
-       if (!sctx->vb_descriptors_buffer) {
-               sctx->vb_descriptors_offset = 0;
-               sctx->vb_descriptors_gpu_list = NULL;
-               return false;
-       }
+       if (alloc_size) {
+               /* Vertex buffer descriptors are the only ones which are uploaded
+                * directly through a staging buffer and don't go through
+                * the fine-grained upload path.
+                */
+               u_upload_alloc(sctx->b.const_uploader, 0,
+                              alloc_size,
+                              si_optimal_tcc_alignment(sctx, alloc_size),
+                              &sctx->vb_descriptors_offset,
+                              (struct pipe_resource**)&sctx->vb_descriptors_buffer,
+                              (void**)&ptr);
+               if (!sctx->vb_descriptors_buffer) {
+                       sctx->vb_descriptors_offset = 0;
+                       sctx->vb_descriptors_gpu_list = NULL;
+                       return false;
+               }
 
-       sctx->vb_descriptors_gpu_list = ptr;
-       radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                 sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
-                                 RADEON_PRIO_DESCRIPTORS);
+               sctx->vb_descriptors_gpu_list = ptr;
+               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
+                                         sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
+                                         RADEON_PRIO_DESCRIPTORS);
+               sctx->vertex_buffer_pointer_dirty = true;
+               sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
+       } else {
+               si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
+               sctx->vertex_buffer_pointer_dirty = false;
+               sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS;
+       }
 
        assert(count <= SI_MAX_ATTRIBS);
 
+       unsigned first_vb_use_mask = velems->first_vb_use_mask;
+       unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs;
+
        for (i = 0; i < count; i++) {
                struct pipe_vertex_buffer *vb;
                struct si_resource *buf;
                unsigned vbo_index = velems->vertex_buffer_index[i];
-               uint32_t *desc = &ptr[i*4];
+               uint32_t *desc = i < num_vbos_in_user_sgprs ?
+                                       &sctx->vb_descriptor_user_sgprs[i * 4] :
+                                       &ptr[(i - num_vbos_in_user_sgprs) * 4];
 
                vb = &sctx->vertex_buffer[vbo_index];
                buf = si_resource(vb->buffer.resource);
@@ -1187,9 +1199,8 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
         * uploaded to a fresh new buffer, so I don't think flushing the const
         * cache is needed. */
        si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+       sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
        sctx->vertex_buffers_dirty = false;
-       sctx->vertex_buffer_pointer_dirty = true;
-       sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
        return true;
 }
 
@@ -2050,8 +2061,11 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx,
                u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS,
                                  SI_NUM_SHADER_DESCS);
 
-       if (shader == PIPE_SHADER_VERTEX)
+       if (shader == PIPE_SHADER_VERTEX) {
                sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+               sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 &&
+                                                      sctx->screen->num_vbos_in_user_sgprs;
+       }
 
        si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 }
@@ -2060,6 +2074,8 @@ static void si_shader_pointers_begin_new_cs(struct si_context *sctx)
 {
        sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
        sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+       sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 &&
+                                              sctx->screen->num_vbos_in_user_sgprs;
        si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
        sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
        sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
@@ -2258,8 +2274,6 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
                struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
                /* Find the location of the VB descriptor pointer. */
-               /* TODO: In the future, the pointer will be packed in unused
-                *       bits of the first 2 VB descriptors. */
                unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;
                if (sctx->chip_class >= GFX9) {
                        if (sctx->tes_shader.cso)
@@ -2276,6 +2290,18 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
                sctx->vertex_buffer_pointer_dirty = false;
        }
 
+       if (sctx->vertex_buffer_user_sgprs_dirty) {
+               struct radeon_cmdbuf *cs = sctx->gfx_cs;
+               unsigned num_desc = MIN2(sctx->num_vertex_elements,
+                                        sctx->screen->num_vbos_in_user_sgprs);
+               unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4;
+
+               assert(num_desc);
+               si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4);
+               radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4);
+               sctx->vertex_buffer_user_sgprs_dirty = false;
+       }
+
        if (sctx->graphics_bindless_pointer_dirty) {
                si_emit_global_shader_pointers(sctx,
                                               &sctx->bindless_descriptors);
index 755c768fb0b5894de999b662338abfce94dec25b..a69f6c07800bde434a52fc0efdb0fee97319803f 100644 (file)
@@ -1092,6 +1092,8 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
        if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
                si_init_perfcounters(sscreen);
 
+       sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
+
        /* Determine tessellation ring info. */
        bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 &&
                                      sscreen->info.family != CHIP_CARRIZO &&
index 14768f02384e3db952b4f79d59ec924ffabdae90..6c92dc1a81e455ac238d5191d1c86b5619cac562 100644 (file)
@@ -486,6 +486,7 @@ struct si_screen {
                        uint32_t *state,
                        uint32_t *fmask_state);
 
+       unsigned                        num_vbos_in_user_sgprs;
        unsigned                        pa_sc_raster_config;
        unsigned                        pa_sc_raster_config_1;
        unsigned                        se_tile_repeat;
@@ -1006,11 +1007,6 @@ struct si_context {
        bool                            flatshade;
        bool                            do_update_shaders;
 
-       /* vertex buffer descriptors */
-       uint32_t *vb_descriptors_gpu_list;
-       struct si_resource *vb_descriptors_buffer;
-       unsigned vb_descriptors_offset;
-
        /* shader descriptors */
        struct si_descriptors           descriptors[SI_NUM_DESCS];
        unsigned                        descriptors_dirty;
@@ -1037,11 +1033,16 @@ struct si_context {
        uint32_t                        vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD];
        uint32_t                        cs_user_data[4];
 
-       /* Vertex and index buffers. */
+       /* Vertex buffers. */
        bool                            vertex_buffers_dirty;
        bool                            vertex_buffer_pointer_dirty;
+       bool                            vertex_buffer_user_sgprs_dirty;
        struct pipe_vertex_buffer       vertex_buffer[SI_NUM_VERTEX_BUFFERS];
        uint16_t                        vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */
+       uint32_t                        *vb_descriptors_gpu_list;
+       struct si_resource              *vb_descriptors_buffer;
+       unsigned                        vb_descriptors_offset;
+       unsigned                        vb_descriptor_user_sgprs[5*4];
 
        /* MSAA config state. */
        int                             ps_iter_samples;
index f734221728b877c9077523a59e53f3ec0ce5fcd9..392972256178eb978e313aa9e24772c877402204 100644 (file)
@@ -455,19 +455,20 @@ void si_llvm_load_input_vs(
                return;
        }
 
+       unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
        union si_vs_fix_fetch fix_fetch;
-       LLVMValueRef t_list_ptr;
-       LLVMValueRef t_offset;
-       LLVMValueRef t_list;
+       LLVMValueRef vb_desc;
        LLVMValueRef vertex_index;
        LLVMValueRef tmp;
 
-       /* Load the T list */
-       t_list_ptr = ac_get_arg(&ctx->ac, ctx->vertex_buffers);
-
-       t_offset = LLVMConstInt(ctx->i32, input_index, 0);
-
-       t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
+       if (input_index < num_vbos_in_user_sgprs) {
+               vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
+       } else {
+               unsigned index= input_index - num_vbos_in_user_sgprs;
+               vb_desc = ac_build_load_to_sgpr(&ctx->ac,
+                                               ac_get_arg(&ctx->ac, ctx->vertex_buffers),
+                                               LLVMConstInt(ctx->i32, index, 0));
+       }
 
        vertex_index = LLVMGetParam(ctx->main_fn,
                                    ctx->vertex_index0.arg_index +
@@ -488,7 +489,7 @@ void si_llvm_load_input_vs(
                tmp = ac_build_opencoded_load_format(
                                &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
                                fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
-                               t_list, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
+                               vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
                for (unsigned i = 0; i < 4; ++i)
                        out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), "");
                return;
@@ -513,7 +514,7 @@ void si_llvm_load_input_vs(
 
        for (unsigned i = 0; i < num_fetches; ++i) {
                LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
-               fetches[i] = ac_build_buffer_load_format(&ctx->ac, t_list, vertex_index, voffset,
+               fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
                                                         channels_per_fetch, 0, true);
        }
 
@@ -3359,6 +3360,28 @@ static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
        }
 }
 
+static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
+{
+       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
+
+       unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
+       if (num_vbos_in_user_sgprs) {
+               unsigned user_sgprs = ctx->args.num_sgprs_used;
+
+               if (is_merged_shader(ctx))
+                       user_sgprs -= 8;
+               assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
+
+               /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
+               for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
+                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
+
+               assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
+               for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
+                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
+       }
+}
+
 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
                                   unsigned *num_prolog_vgprs)
 {
@@ -3479,10 +3502,8 @@ static void create_function(struct si_shader_context *ctx)
 
                declare_per_stage_desc_pointers(ctx, true);
                declare_vs_specific_input_sgprs(ctx); 
-               if (!shader->is_gs_copy_shader) {
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
-                                  &ctx->vertex_buffers);
-               }
+               if (!shader->is_gs_copy_shader)
+                       declare_vb_descriptor_input_sgprs(ctx);
 
                if (shader->key.as_es) {
                        ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
@@ -3547,7 +3568,7 @@ static void create_function(struct si_shader_context *ctx)
                ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
                ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
                ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
+               declare_vb_descriptor_input_sgprs(ctx);
 
                /* VGPRs (first TCS, then VS) */
                ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
@@ -3611,10 +3632,8 @@ static void create_function(struct si_shader_context *ctx)
                        /* Declare as many input SGPRs as the VS has. */
                }
 
-               if (ctx->type == PIPE_SHADER_VERTEX) {
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
-                                  &ctx->vertex_buffers);
-               }
+               if (ctx->type == PIPE_SHADER_VERTEX)
+                       declare_vb_descriptor_input_sgprs(ctx);
 
                /* VGPRs (first GS, then VS/TES) */
                ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
index 089b534b4bb574e6487b9ed8daf1e4e183c3c0ad..36c6218151c23ff340be3077606958fa89bff655 100644 (file)
@@ -212,6 +212,11 @@ enum {
        /* PS only */
        SI_SGPR_ALPHA_REF       = SI_NUM_RESOURCE_SGPRS,
        SI_PS_NUM_USER_SGPR,
+
+       /* The value has to be 12, because the hw requires that descriptors
+        * are aligned to 4 SGPRs.
+        */
+       SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
 };
 
 /* LLVM function parameter indices */
@@ -340,6 +345,7 @@ struct si_shader_selector {
        bool            force_correct_derivs_after_kill;
        bool            prim_discard_cs_allowed;
        unsigned        num_vs_inputs;
+       unsigned        num_vbos_in_user_sgprs;
        unsigned        pa_cl_vs_out_cntl;
        ubyte           clipdist_mask;
        ubyte           culldist_mask;
index 1ec74a84a696237adaabd84e0a2265c0405ba12f..da104678bd1e35d0b04e4af8b00558c085f751da 100644 (file)
@@ -83,6 +83,7 @@ struct si_shader_context {
        struct ac_arg merged_scratch_offset;
        /* API VS */
        struct ac_arg vertex_buffers;
+       struct ac_arg vb_descriptors[5];
        struct ac_arg rel_auto_id;
        struct ac_arg vs_prim_id;
        struct ac_arg vertex_index0;
index 790050b18ad3c79b16e7ce856b39320d06073160..8c3c150fcd6094ede013e20a677e74252b73b35a 100644 (file)
@@ -4873,7 +4873,10 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
                return NULL;
 
        v->count = count;
-       v->vb_desc_list_alloc_size = align(count * 16, SI_CPDMA_ALIGNMENT);
+
+       unsigned alloc_count = count > sscreen->num_vbos_in_user_sgprs ?
+                              count - sscreen->num_vbos_in_user_sgprs : 0;
+       v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT);
 
        for (i = 0; i < count; ++i) {
                const struct util_format_description *desc;
@@ -5075,7 +5078,13 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 
        sctx->vertex_elements = v;
        sctx->num_vertex_elements = v ? v->count : 0;
-       sctx->vertex_buffers_dirty = true;
+
+       if (sctx->num_vertex_elements) {
+               sctx->vertex_buffers_dirty = true;
+       } else {
+               sctx->vertex_buffer_pointer_dirty = false;
+               sctx->vertex_buffer_user_sgprs_dirty = false;
+       }
 
        if (v &&
            (!old ||
@@ -5111,8 +5120,10 @@ static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
        struct si_context *sctx = (struct si_context *)ctx;
        struct si_vertex_elements *v = (struct si_vertex_elements*)state;
 
-       if (sctx->vertex_elements == state)
+       if (sctx->vertex_elements == state) {
                sctx->vertex_elements = NULL;
+               sctx->num_vertex_elements = 0;
+       }
        si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
        FREE(state);
 }
index aaf25be66c77ebbc6b98bbc64c29fe3b95fb5119..80f5f7c943cbd14ee4a58d7d026a78327b76b757 100644 (file)
@@ -2234,6 +2234,7 @@ si_draw_rectangle(struct blitter_context *blitter,
        /* Don't set per-stage shader pointers for VS. */
        sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
        sctx->vertex_buffer_pointer_dirty = false;
+       sctx->vertex_buffer_user_sgprs_dirty = false;
 
        si_draw_vbo(pipe, &info);
 }
index 18015bbec485f74bfffdd202d240d112ce64c808..826b7186fc3ac96c9053e524f0fed88484d6503d 100644 (file)
@@ -457,8 +457,19 @@ static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader)
        }
 }
 
-static unsigned si_get_num_vs_user_sgprs(unsigned num_always_on_user_sgprs)
+static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader,
+                                        unsigned num_always_on_user_sgprs)
 {
+       struct si_shader_selector *vs = shader->previous_stage_sel ?
+                       shader->previous_stage_sel : shader->selector;
+       unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs;
+
+       /* 1 SGPR is reserved for the vertex buffer pointer. */
+       assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1);
+
+       if (num_vbos_in_user_sgprs)
+               return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4;
+
        /* Add the pointer to VBO descriptors. */
        return num_always_on_user_sgprs + 1;
 }
@@ -510,7 +521,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
                           S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) |
                           S_00B528_DX10_CLAMP(1) |
                           S_00B528_FLOAT_MODE(shader->config.float_mode);
-       shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR)) |
+       shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
                           S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
 }
 
@@ -536,7 +547,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
                }
 
                unsigned num_user_sgprs =
-                       si_get_num_vs_user_sgprs(GFX9_TCS_NUM_USER_SGPR);
+                       si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);
 
                shader->config.rsrc2 =
                        S_00B42C_USER_SGPR(num_user_sgprs) |
@@ -620,7 +631,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
 
        if (shader->selector->type == PIPE_SHADER_VERTEX) {
                vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
-               num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR);
+               num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
        } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
                vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
                num_user_sgprs = SI_TES_NUM_USER_SGPR;
@@ -887,7 +898,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 
                unsigned num_user_sgprs;
                if (es_type == PIPE_SHADER_VERTEX)
-                       num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR);
+                       num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
                else
                        num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
 
@@ -1131,7 +1142,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                        num_user_sgprs = SI_SGPR_VS_BLIT_DATA +
                                         es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
                } else {
-                       num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR);
+                       num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
                }
        } else {
                assert(es_type == PIPE_SHADER_TESS_EVAL);
@@ -1399,7 +1410,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
                        num_user_sgprs = SI_SGPR_VS_BLIT_DATA +
                                         info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
                } else {
-                       num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR);
+                       num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
                }
        } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
                vgpr_comp_cnt = enable_prim_id ? 3 : 2;
@@ -1444,6 +1455,11 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
                         S_00B12C_OC_LDS_EN(oc_lds_en) |
                         S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
 
+       if (sscreen->info.chip_class >= GFX10)
+               rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
+       else if (sscreen->info.chip_class == GFX9)
+               rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
+
        if (sscreen->info.chip_class <= GFX9)
                rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8);
 
@@ -2717,6 +2733,8 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
        sel->num_vs_inputs = sel->type == PIPE_SHADER_VERTEX &&
                             !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] ?
                                     sel->info.num_inputs : 0;
+       sel->num_vbos_in_user_sgprs =
+               MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs);
 
        /* The prolog is a no-op if there are no inputs. */
        sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX &&