From: Marek Olšák Date: Tue, 7 Jan 2020 23:23:53 +0000 (-0500) Subject: radeonsi: put up to 5 VBO descriptors into user SGPRs X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=363b4027fcbae3cc69ff6e55989f900398c3968a;p=mesa.git radeonsi: put up to 5 VBO descriptors into user SGPRs gfx6-8: 1 VBO descriptor in user SGPRs gfx9-10: 5 VBO descriptors in user SGPRs We no longer pull up to 5 VBO descriptors from GTT when SDMA is disabled. Totals from affected shaders: SGPRS: 1110528 -> 1170528 (5.40 %) VGPRS: 952896 -> 951936 (-0.10 %) Spilled SGPRs: 83 -> 61 (-26.51 %) Spilled VGPRs: 0 -> 0 (0.00 %) Private memory VGPRs: 0 -> 0 (0.00 %) Scratch size: 0 -> 0 (0.00 %) dwords per thread Code Size: 23766296 -> 22843920 (-3.88 %) bytes LDS: 0 -> 0 (0.00 %) blocks Max Waves: 179344 -> 179344 (0.00 %) Wait states: 0 -> 0 (0.00 %) Reviewed-by: Pierre-Eric Pelloux-Prayer --- diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 80dedf61e0a..70ad55d8a41 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -99,6 +99,7 @@ void si_blitter_end(struct si_context *sctx) * non-global VS user SGPRs. */ sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 642a22ccfa6..3c43911a211 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1103,36 +1103,48 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) struct si_vertex_elements *velems = sctx->vertex_elements; unsigned alloc_size = velems->vb_desc_list_alloc_size; - unsigned first_vb_use_mask = velems->first_vb_use_mask; - /* Vertex buffer descriptors are the only ones which are uploaded - * directly through a staging buffer and don't go through - * the fine-grained upload path. - */ - u_upload_alloc(sctx->b.const_uploader, 0, - alloc_size, - si_optimal_tcc_alignment(sctx, alloc_size), - &sctx->vb_descriptors_offset, - (struct pipe_resource**)&sctx->vb_descriptors_buffer, - (void**)&ptr); - if (!sctx->vb_descriptors_buffer) { - sctx->vb_descriptors_offset = 0; - sctx->vb_descriptors_gpu_list = NULL; - return false; - } + if (alloc_size) { + /* Vertex buffer descriptors are the only ones which are uploaded + * directly through a staging buffer and don't go through + * the fine-grained upload path. + */ + u_upload_alloc(sctx->b.const_uploader, 0, + alloc_size, + si_optimal_tcc_alignment(sctx, alloc_size), + &sctx->vb_descriptors_offset, + (struct pipe_resource**)&sctx->vb_descriptors_buffer, + (void**)&ptr); + if (!sctx->vb_descriptors_buffer) { + sctx->vb_descriptors_offset = 0; + sctx->vb_descriptors_gpu_list = NULL; + return false; + } - sctx->vb_descriptors_gpu_list = ptr; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - sctx->vb_descriptors_buffer, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); + sctx->vb_descriptors_gpu_list = ptr; + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + sctx->vb_descriptors_buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + sctx->vertex_buffer_pointer_dirty = true; + sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; + } else { + si_resource_reference(&sctx->vb_descriptors_buffer, NULL); + sctx->vertex_buffer_pointer_dirty = false; + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS; + } assert(count <= SI_MAX_ATTRIBS); + unsigned first_vb_use_mask = velems->first_vb_use_mask; + unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; + for (i = 0; i < count; i++) { struct pipe_vertex_buffer *vb; struct si_resource *buf; unsigned vbo_index = velems->vertex_buffer_index[i]; - uint32_t *desc = &ptr[i*4]; + uint32_t *desc = i < num_vbos_in_user_sgprs ? + &sctx->vb_descriptor_user_sgprs[i * 4] : + &ptr[(i - num_vbos_in_user_sgprs) * 4]; vb = &sctx->vertex_buffer[vbo_index]; buf = si_resource(vb->buffer.resource); @@ -1187,9 +1199,8 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) * uploaded to a fresh new buffer, so I don't think flushing the const * cache is needed. */ si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0; sctx->vertex_buffers_dirty = false; - sctx->vertex_buffer_pointer_dirty = true; - sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; return true; } @@ -2050,8 +2061,11 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS); - if (shader == PIPE_SHADER_VERTEX) + if (shader == PIPE_SHADER_VERTEX) { sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && + sctx->screen->num_vbos_in_user_sgprs; + } si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } @@ -2060,6 +2074,8 @@ static void si_shader_pointers_begin_new_cs(struct si_context *sctx) { sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && + sctx->screen->num_vbos_in_user_sgprs; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; @@ -2258,8 +2274,6 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx) struct radeon_cmdbuf *cs = sctx->gfx_cs; /* Find the location of the VB descriptor pointer. */ - /* TODO: In the future, the pointer will be packed in unused - * bits of the first 2 VB descriptors. */ unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR; if (sctx->chip_class >= GFX9) { if (sctx->tes_shader.cso) @@ -2276,6 +2290,18 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx) sctx->vertex_buffer_pointer_dirty = false; } + if (sctx->vertex_buffer_user_sgprs_dirty) { + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned num_desc = MIN2(sctx->num_vertex_elements, + sctx->screen->num_vbos_in_user_sgprs); + unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4; + + assert(num_desc); + si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4); + radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4); + sctx->vertex_buffer_user_sgprs_dirty = false; + } + if (sctx->graphics_bindless_pointer_dirty) { si_emit_global_shader_pointers(sctx, &sctx->bindless_descriptors); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 755c768fb0b..a69f6c07800 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -1092,6 +1092,8 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws, if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) si_init_perfcounters(sscreen); + sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; + /* Determine tessellation ring info. */ bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && sscreen->info.family != CHIP_CARRIZO && diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 14768f02384..6c92dc1a81e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -486,6 +486,7 @@ struct si_screen { uint32_t *state, uint32_t *fmask_state); + unsigned num_vbos_in_user_sgprs; unsigned pa_sc_raster_config; unsigned pa_sc_raster_config_1; unsigned se_tile_repeat; @@ -1006,11 +1007,6 @@ struct si_context { bool flatshade; bool do_update_shaders; - /* vertex buffer descriptors */ - uint32_t *vb_descriptors_gpu_list; - struct si_resource *vb_descriptors_buffer; - unsigned vb_descriptors_offset; - /* shader descriptors */ struct si_descriptors descriptors[SI_NUM_DESCS]; unsigned descriptors_dirty; @@ -1037,11 +1033,16 @@ struct si_context { uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD]; uint32_t cs_user_data[4]; - /* Vertex and index buffers. */ + /* Vertex buffers. */ bool vertex_buffers_dirty; bool vertex_buffer_pointer_dirty; + bool vertex_buffer_user_sgprs_dirty; struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS]; uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */ + uint32_t *vb_descriptors_gpu_list; + struct si_resource *vb_descriptors_buffer; + unsigned vb_descriptors_offset; + unsigned vb_descriptor_user_sgprs[5*4]; /* MSAA config state. */ int ps_iter_samples; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index f734221728b..39297225617 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -455,19 +455,20 @@ void si_llvm_load_input_vs( return; } + unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; union si_vs_fix_fetch fix_fetch; - LLVMValueRef t_list_ptr; - LLVMValueRef t_offset; - LLVMValueRef t_list; + LLVMValueRef vb_desc; LLVMValueRef vertex_index; LLVMValueRef tmp; - /* Load the T list */ - t_list_ptr = ac_get_arg(&ctx->ac, ctx->vertex_buffers); - - t_offset = LLVMConstInt(ctx->i32, input_index, 0); - - t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); + if (input_index < num_vbos_in_user_sgprs) { + vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]); + } else { + unsigned index= input_index - num_vbos_in_user_sgprs; + vb_desc = ac_build_load_to_sgpr(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->vertex_buffers), + LLVMConstInt(ctx->i32, index, 0)); + } vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + @@ -488,7 +489,7 @@ void si_llvm_load_input_vs( tmp = ac_build_opencoded_load_format( &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format, fix_fetch.u.reverse, !opencode, - t_list, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true); + vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true); for (unsigned i = 0; i < 4; ++i) out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), ""); return; @@ -513,7 +514,7 @@ void si_llvm_load_input_vs( for (unsigned i = 0; i < num_fetches; ++i) { LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0); - fetches[i] = ac_build_buffer_load_format(&ctx->ac, t_list, vertex_index, voffset, + fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset, channels_per_fetch, 0, true); } @@ -3359,6 +3360,28 @@ static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx) } } +static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx) +{ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers); + + unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; + if (num_vbos_in_user_sgprs) { + unsigned user_sgprs = ctx->args.num_sgprs_used; + + if (is_merged_shader(ctx)) + user_sgprs -= 8; + assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); + + /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */ + for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + + assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors)); + for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]); + } +} + static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs) { @@ -3479,10 +3502,8 @@ static void create_function(struct si_shader_context *ctx) declare_per_stage_desc_pointers(ctx, true); declare_vs_specific_input_sgprs(ctx); - if (!shader->is_gs_copy_shader) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - &ctx->vertex_buffers); - } + if (!shader->is_gs_copy_shader) + declare_vb_descriptor_input_sgprs(ctx); if (shader->key.as_es) { ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, @@ -3547,7 +3568,7 @@ static void create_function(struct si_shader_context *ctx) ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers); + declare_vb_descriptor_input_sgprs(ctx); /* VGPRs (first TCS, then VS) */ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); @@ -3611,10 +3632,8 @@ static void create_function(struct si_shader_context *ctx) /* Declare as many input SGPRs as the VS has. */ } - if (ctx->type == PIPE_SHADER_VERTEX) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - &ctx->vertex_buffers); - } + if (ctx->type == PIPE_SHADER_VERTEX) + declare_vb_descriptor_input_sgprs(ctx); /* VGPRs (first GS, then VS/TES) */ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 089b534b4bb..36c6218151c 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -212,6 +212,11 @@ enum { /* PS only */ SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, SI_PS_NUM_USER_SGPR, + + /* The value has to be 12, because the hw requires that descriptors + * are aligned to 4 SGPRs. + */ + SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12, }; /* LLVM function parameter indices */ @@ -340,6 +345,7 @@ struct si_shader_selector { bool force_correct_derivs_after_kill; bool prim_discard_cs_allowed; unsigned num_vs_inputs; + unsigned num_vbos_in_user_sgprs; unsigned pa_cl_vs_out_cntl; ubyte clipdist_mask; ubyte culldist_mask; diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 1ec74a84a69..da104678bd1 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -83,6 +83,7 @@ struct si_shader_context { struct ac_arg merged_scratch_offset; /* API VS */ struct ac_arg vertex_buffers; + struct ac_arg vb_descriptors[5]; struct ac_arg rel_auto_id; struct ac_arg vs_prim_id; struct ac_arg vertex_index0; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 790050b18ad..8c3c150fcd6 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -4873,7 +4873,10 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, return NULL; v->count = count; - v->vb_desc_list_alloc_size = align(count * 16, SI_CPDMA_ALIGNMENT); + + unsigned alloc_count = count > sscreen->num_vbos_in_user_sgprs ? + count - sscreen->num_vbos_in_user_sgprs : 0; + v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); for (i = 0; i < count; ++i) { const struct util_format_description *desc; @@ -5075,7 +5078,13 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) sctx->vertex_elements = v; sctx->num_vertex_elements = v ? v->count : 0; - sctx->vertex_buffers_dirty = true; + + if (sctx->num_vertex_elements) { + sctx->vertex_buffers_dirty = true; + } else { + sctx->vertex_buffer_pointer_dirty = false; + sctx->vertex_buffer_user_sgprs_dirty = false; + } if (v && (!old || @@ -5111,8 +5120,10 @@ static void si_delete_vertex_element(struct pipe_context *ctx, void *state) struct si_context *sctx = (struct si_context *)ctx; struct si_vertex_elements *v = (struct si_vertex_elements*)state; - if (sctx->vertex_elements == state) + if (sctx->vertex_elements == state) { sctx->vertex_elements = NULL; + sctx->num_vertex_elements = 0; + } si_resource_reference(&v->instance_divisor_factor_buffer, NULL); FREE(state); } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index aaf25be66c7..80f5f7c943c 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -2234,6 +2234,7 @@ si_draw_rectangle(struct blitter_context *blitter, /* Don't set per-stage shader pointers for VS. */ sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX); sctx->vertex_buffer_pointer_dirty = false; + sctx->vertex_buffer_user_sgprs_dirty = false; si_draw_vbo(pipe, &info); } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 18015bbec48..826b7186fc3 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -457,8 +457,19 @@ static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader) } } -static unsigned si_get_num_vs_user_sgprs(unsigned num_always_on_user_sgprs) +static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader, + unsigned num_always_on_user_sgprs) { + struct si_shader_selector *vs = shader->previous_stage_sel ? + shader->previous_stage_sel : shader->selector; + unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs; + + /* 1 SGPR is reserved for the vertex buffer pointer. */ + assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1); + + if (num_vbos_in_user_sgprs) + return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4; + /* Add the pointer to VBO descriptors. */ return num_always_on_user_sgprs + 1; } @@ -510,7 +521,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader) S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) | S_00B528_DX10_CLAMP(1) | S_00B528_FLOAT_MODE(shader->config.float_mode); - shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR)) | + shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) | S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); } @@ -536,7 +547,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) } unsigned num_user_sgprs = - si_get_num_vs_user_sgprs(GFX9_TCS_NUM_USER_SGPR); + si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR); shader->config.rsrc2 = S_00B42C_USER_SGPR(num_user_sgprs) | @@ -620,7 +631,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) if (shader->selector->type == PIPE_SHADER_VERTEX) { vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); - num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR); + num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2; num_user_sgprs = SI_TES_NUM_USER_SGPR; @@ -887,7 +898,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) unsigned num_user_sgprs; if (es_type == PIPE_SHADER_VERTEX) - num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR); + num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); else num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; @@ -1131,7 +1142,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader num_user_sgprs = SI_SGPR_VS_BLIT_DATA + es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; } else { - num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR); + num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); } } else { assert(es_type == PIPE_SHADER_TESS_EVAL); @@ -1399,7 +1410,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, num_user_sgprs = SI_SGPR_VS_BLIT_DATA + info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; } else { - num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR); + num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); } } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { vgpr_comp_cnt = enable_prim_id ? 3 : 2; @@ -1444,6 +1455,11 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, S_00B12C_OC_LDS_EN(oc_lds_en) | S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + if (sscreen->info.chip_class >= GFX10) + rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); + else if (sscreen->info.chip_class == GFX9) + rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); + if (sscreen->info.chip_class <= GFX9) rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8); @@ -2717,6 +2733,8 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->num_vs_inputs = sel->type == PIPE_SHADER_VERTEX && !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] ? sel->info.num_inputs : 0; + sel->num_vbos_in_user_sgprs = + MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs); /* The prolog is a no-op if there are no inputs. */ sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX &&