* non-global VS user SGPRs. */
sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX);
sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+ sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0;
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
}
struct si_vertex_elements *velems = sctx->vertex_elements;
unsigned alloc_size = velems->vb_desc_list_alloc_size;
- unsigned first_vb_use_mask = velems->first_vb_use_mask;
- /* Vertex buffer descriptors are the only ones which are uploaded
- * directly through a staging buffer and don't go through
- * the fine-grained upload path.
- */
- u_upload_alloc(sctx->b.const_uploader, 0,
- alloc_size,
- si_optimal_tcc_alignment(sctx, alloc_size),
- &sctx->vb_descriptors_offset,
- (struct pipe_resource**)&sctx->vb_descriptors_buffer,
- (void**)&ptr);
- if (!sctx->vb_descriptors_buffer) {
- sctx->vb_descriptors_offset = 0;
- sctx->vb_descriptors_gpu_list = NULL;
- return false;
- }
+ if (alloc_size) {
+ /* Vertex buffer descriptors are the only ones which are uploaded
+ * directly through a staging buffer and don't go through
+ * the fine-grained upload path.
+ */
+ u_upload_alloc(sctx->b.const_uploader, 0,
+ alloc_size,
+ si_optimal_tcc_alignment(sctx, alloc_size),
+ &sctx->vb_descriptors_offset,
+ (struct pipe_resource**)&sctx->vb_descriptors_buffer,
+ (void**)&ptr);
+ if (!sctx->vb_descriptors_buffer) {
+ sctx->vb_descriptors_offset = 0;
+ sctx->vb_descriptors_gpu_list = NULL;
+ return false;
+ }
- sctx->vb_descriptors_gpu_list = ptr;
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
- RADEON_PRIO_DESCRIPTORS);
+ sctx->vb_descriptors_gpu_list = ptr;
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
+ sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
+ RADEON_PRIO_DESCRIPTORS);
+ sctx->vertex_buffer_pointer_dirty = true;
+ sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
+ } else {
+ si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
+ sctx->vertex_buffer_pointer_dirty = false;
+ sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS;
+ }
assert(count <= SI_MAX_ATTRIBS);
+ unsigned first_vb_use_mask = velems->first_vb_use_mask;
+ unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs;
+
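+ /* The first num_vbos_in_user_sgprs descriptors are written to the CPU-side
+ * vb_descriptor_user_sgprs array and emitted later as user SGPR values;
+ * the remaining descriptors go into the uploaded GPU descriptor list.
+ */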
for (i = 0; i < count; i++) {
struct pipe_vertex_buffer *vb;
struct si_resource *buf;
unsigned vbo_index = velems->vertex_buffer_index[i];
- uint32_t *desc = &ptr[i*4];
+ uint32_t *desc = i < num_vbos_in_user_sgprs ?
+ &sctx->vb_descriptor_user_sgprs[i * 4] :
+ &ptr[(i - num_vbos_in_user_sgprs) * 4];
vb = &sctx->vertex_buffer[vbo_index];
buf = si_resource(vb->buffer.resource);
* uploaded to a fresh new buffer, so I don't think flushing the const
* cache is needed. */
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+ sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
sctx->vertex_buffers_dirty = false;
- sctx->vertex_buffer_pointer_dirty = true;
- sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
return true;
}
u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS,
SI_NUM_SHADER_DESCS);
- if (shader == PIPE_SHADER_VERTEX)
+ if (shader == PIPE_SHADER_VERTEX) {
sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+ sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 &&
+ sctx->screen->num_vbos_in_user_sgprs;
+ }
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
}
{
sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+ sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 &&
+ sctx->screen->num_vbos_in_user_sgprs;
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
struct radeon_cmdbuf *cs = sctx->gfx_cs;
/* Find the location of the VB descriptor pointer. */
- /* TODO: In the future, the pointer will be packed in unused
- * bits of the first 2 VB descriptors. */
unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;
if (sctx->chip_class >= GFX9) {
if (sctx->tes_shader.cso)
sctx->vertex_buffer_pointer_dirty = false;
}
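+ /* Write the VB descriptors that live in user SGPRs directly into the
+ * shader's user data registers.
+ */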
+ if (sctx->vertex_buffer_user_sgprs_dirty) {
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned num_desc = MIN2(sctx->num_vertex_elements,
+ sctx->screen->num_vbos_in_user_sgprs);
+ unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4;
+
+ assert(num_desc);
+ si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4);
+ radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4);
+ sctx->vertex_buffer_user_sgprs_dirty = false;
+ }
+
if (sctx->graphics_bindless_pointer_dirty) {
si_emit_global_shader_pointers(sctx,
&sctx->bindless_descriptors);
if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
si_init_perfcounters(sscreen);
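+ /* Presumably bounded by the user SGPR budget: with VB descriptors starting
+ * at SGPR 12 and taking 4 SGPRs each, 16 user SGPRs (pre-GFX9) fit one
+ * descriptor and 32 (GFX9+) fit five.
+ */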
+ sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
+
/* Determine tessellation ring info. */
bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 &&
sscreen->info.family != CHIP_CARRIZO &&
uint32_t *state,
uint32_t *fmask_state);
+ unsigned num_vbos_in_user_sgprs;
unsigned pa_sc_raster_config;
unsigned pa_sc_raster_config_1;
unsigned se_tile_repeat;
bool flatshade;
bool do_update_shaders;
- /* vertex buffer descriptors */
- uint32_t *vb_descriptors_gpu_list;
- struct si_resource *vb_descriptors_buffer;
- unsigned vb_descriptors_offset;
-
/* shader descriptors */
struct si_descriptors descriptors[SI_NUM_DESCS];
unsigned descriptors_dirty;
uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD];
uint32_t cs_user_data[4];
- /* Vertex and index buffers. */
+ /* Vertex buffers. */
bool vertex_buffers_dirty;
bool vertex_buffer_pointer_dirty;
+ bool vertex_buffer_user_sgprs_dirty;
struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS];
uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */
+ uint32_t *vb_descriptors_gpu_list;
+ struct si_resource *vb_descriptors_buffer;
+ unsigned vb_descriptors_offset;
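+ /* CPU copy of the VB descriptors that are passed in user SGPRs. */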
+ uint32_t vb_descriptor_user_sgprs[5 * 4];
/* MSAA config state. */
int ps_iter_samples;
return;
}
+ unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
union si_vs_fix_fetch fix_fetch;
- LLVMValueRef t_list_ptr;
- LLVMValueRef t_offset;
- LLVMValueRef t_list;
+ LLVMValueRef vb_desc;
LLVMValueRef vertex_index;
LLVMValueRef tmp;
- /* Load the T list */
- t_list_ptr = ac_get_arg(&ctx->ac, ctx->vertex_buffers);
-
- t_offset = LLVMConstInt(ctx->i32, input_index, 0);
-
- t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
+ if (input_index < num_vbos_in_user_sgprs) {
+ vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
+ } else {
+ unsigned index = input_index - num_vbos_in_user_sgprs;
+ vb_desc = ac_build_load_to_sgpr(&ctx->ac,
+ ac_get_arg(&ctx->ac, ctx->vertex_buffers),
+ LLVMConstInt(ctx->i32, index, 0));
+ }
vertex_index = LLVMGetParam(ctx->main_fn,
ctx->vertex_index0.arg_index +
tmp = ac_build_opencoded_load_format(
&ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
- t_list, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
+ vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
for (unsigned i = 0; i < 4; ++i)
out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), "");
return;
for (unsigned i = 0; i < num_fetches; ++i) {
LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
- fetches[i] = ac_build_buffer_load_format(&ctx->ac, t_list, vertex_index, voffset,
+ fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
channels_per_fetch, 0, true);
}
}
}
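+/* Declare the VB descriptor list pointer and, when some vertex buffer
+ * descriptors are passed in user SGPRs, the descriptors themselves,
+ * padded so that they start at SI_SGPR_VS_VB_DESCRIPTOR_FIRST.
+ */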
+static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
+{
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
+
+ unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
+ if (num_vbos_in_user_sgprs) {
+ unsigned user_sgprs = ctx->args.num_sgprs_used;
+
+ if (is_merged_shader(ctx))
+ user_sgprs -= 8;
+ assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
+
+ /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
+ for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
+
+ assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
+ for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
+ }
+}
+
static void declare_vs_input_vgprs(struct si_shader_context *ctx,
unsigned *num_prolog_vgprs)
{
declare_per_stage_desc_pointers(ctx, true);
declare_vs_specific_input_sgprs(ctx);
- if (!shader->is_gs_copy_shader) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
- &ctx->vertex_buffers);
- }
+ if (!shader->is_gs_copy_shader)
+ declare_vb_descriptor_input_sgprs(ctx);
if (shader->key.as_es) {
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
+ declare_vb_descriptor_input_sgprs(ctx);
/* VGPRs (first TCS, then VS) */
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
/* Declare as many input SGPRs as the VS has. */
}
- if (ctx->type == PIPE_SHADER_VERTEX) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
- &ctx->vertex_buffers);
- }
+ if (ctx->type == PIPE_SHADER_VERTEX)
+ declare_vb_descriptor_input_sgprs(ctx);
/* VGPRs (first GS, then VS/TES) */
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
/* PS only */
SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
SI_PS_NUM_USER_SGPR,
+
+ /* The value has to be 12: the hw requires that descriptors be aligned
+ * to 4 SGPRs, and all always-on VS user SGPRs plus the VB descriptor
+ * list pointer must fit below this offset.
+ */
+ SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
};
/* LLVM function parameter indices */
bool force_correct_derivs_after_kill;
bool prim_discard_cs_allowed;
unsigned num_vs_inputs;
+ unsigned num_vbos_in_user_sgprs;
unsigned pa_cl_vs_out_cntl;
ubyte clipdist_mask;
ubyte culldist_mask;
struct ac_arg merged_scratch_offset;
/* API VS */
struct ac_arg vertex_buffers;
+ struct ac_arg vb_descriptors[5];
struct ac_arg rel_auto_id;
struct ac_arg vs_prim_id;
struct ac_arg vertex_index0;
return NULL;
v->count = count;
- v->vb_desc_list_alloc_size = align(count * 16, SI_CPDMA_ALIGNMENT);
+
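+ /* Only the descriptors that don't fit in user SGPRs need space in the
+ * uploaded descriptor list.
+ */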
+ unsigned alloc_count = count > sscreen->num_vbos_in_user_sgprs ?
+ count - sscreen->num_vbos_in_user_sgprs : 0;
+ v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT);
for (i = 0; i < count; ++i) {
const struct util_format_description *desc;
sctx->vertex_elements = v;
sctx->num_vertex_elements = v ? v->count : 0;
- sctx->vertex_buffers_dirty = true;
+
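+ /* With no vertex elements there is nothing to upload or emit. */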
+ if (sctx->num_vertex_elements) {
+ sctx->vertex_buffers_dirty = true;
+ } else {
+ sctx->vertex_buffer_pointer_dirty = false;
+ sctx->vertex_buffer_user_sgprs_dirty = false;
+ }
if (v &&
(!old ||
struct si_context *sctx = (struct si_context *)ctx;
struct si_vertex_elements *v = (struct si_vertex_elements*)state;
- if (sctx->vertex_elements == state)
+ if (sctx->vertex_elements == state) {
sctx->vertex_elements = NULL;
+ sctx->num_vertex_elements = 0;
+ }
si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
FREE(state);
}
/* Don't set per-stage shader pointers for VS. */
sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
sctx->vertex_buffer_pointer_dirty = false;
+ sctx->vertex_buffer_user_sgprs_dirty = false;
si_draw_vbo(pipe, &info);
}
}
}
-static unsigned si_get_num_vs_user_sgprs(unsigned num_always_on_user_sgprs)
+static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader,
+ unsigned num_always_on_user_sgprs)
{
+ struct si_shader_selector *vs = shader->previous_stage_sel ?
+ shader->previous_stage_sel : shader->selector;
+ unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs;
+
+ /* 1 SGPR is reserved for the vertex buffer pointer. */
+ assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1);
+
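+ /* E.g. 3 VB descriptors in user SGPRs -> 12 + 3 * 4 = 24 user SGPRs. */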
+ if (num_vbos_in_user_sgprs)
+ return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4;
+
/* Add the pointer to VBO descriptors. */
return num_always_on_user_sgprs + 1;
}
S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) |
S_00B528_DX10_CLAMP(1) |
S_00B528_FLOAT_MODE(shader->config.float_mode);
- shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR)) |
+ shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
}
}
unsigned num_user_sgprs =
- si_get_num_vs_user_sgprs(GFX9_TCS_NUM_USER_SGPR);
+ si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);
shader->config.rsrc2 =
S_00B42C_USER_SGPR(num_user_sgprs) |
if (shader->selector->type == PIPE_SHADER_VERTEX) {
vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
- num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR);
+ num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
num_user_sgprs = SI_TES_NUM_USER_SGPR;
unsigned num_user_sgprs;
if (es_type == PIPE_SHADER_VERTEX)
- num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR);
+ num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
else
num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
num_user_sgprs = SI_SGPR_VS_BLIT_DATA +
es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
} else {
- num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR);
+ num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
}
} else {
assert(es_type == PIPE_SHADER_TESS_EVAL);
num_user_sgprs = SI_SGPR_VS_BLIT_DATA +
info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
} else {
- num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR);
+ num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
}
} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
vgpr_comp_cnt = enable_prim_id ? 3 : 2;
S_00B12C_OC_LDS_EN(oc_lds_en) |
S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
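+ /* VB descriptors in user SGPRs can push num_user_sgprs past 31, which no
+ * longer fits in the 5-bit USER_SGPR field, so program the MSB separately.
+ */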
+ if (sscreen->info.chip_class >= GFX10)
+ rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
+ else if (sscreen->info.chip_class == GFX9)
+ rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
+
if (sscreen->info.chip_class <= GFX9)
rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8);
sel->num_vs_inputs = sel->type == PIPE_SHADER_VERTEX &&
!sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] ?
sel->info.num_inputs : 0;
+ sel->num_vbos_in_user_sgprs =
+ MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs);
/* The prolog is a no-op if there are no inputs. */
sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX &&