From: Marek Olšák Date: Fri, 13 Jul 2018 04:23:36 +0000 (-0400) Subject: radeonsi: reduce LDS stalls by 40% for tessellation X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=86b52d42368ac496fe24bc6674e754c323381635;p=mesa.git radeonsi: reduce LDS stalls by 40% for tessellation 40% is the decrease in the LGKM counter (which includes SMEM too) for the GFX9 LSHS stage. This will make the LDS size slightly larger, but I wasn't able to increase the patch stride without corruption, so I'm increasing the vertex stride. --- diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 5dc12d87243..43ba23ff494 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -417,14 +417,14 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) switch (ctx->type) { case PIPE_SHADER_VERTEX: - stride = util_last_bit64(ctx->shader->selector->outputs_written); - return LLVMConstInt(ctx->i32, stride * 4, 0); + stride = ctx->shader->selector->lshs_vertex_stride / 4; + return LLVMConstInt(ctx->i32, stride, 0); case PIPE_SHADER_TESS_CTRL: if (ctx->screen->info.chip_class >= GFX9 && ctx->shader->is_monolithic) { - stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written); - return LLVMConstInt(ctx->i32, stride * 4, 0); + stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4; + return LLVMConstInt(ctx->i32, stride, 0); } return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index a2fb48ab023..2dc4bc7e787 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -368,7 +368,8 @@ struct si_shader_selector { ubyte culldist_mask; /* ES parameters. */ - unsigned esgs_itemsize; + unsigned esgs_itemsize; /* vertex stride */ + unsigned lshs_vertex_stride; /* GS parameters. */ unsigned gs_input_verts_per_prim; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index f35f73a37ce..d901401f0bb 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -134,7 +134,7 @@ static bool si_emit_derived_tess_state(struct si_context *sctx, num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */ } - input_vertex_size = num_tcs_inputs * 16; + input_vertex_size = ls->lshs_vertex_stride; output_vertex_size = num_tcs_outputs * 16; input_patch_size = num_tcs_input_cp * input_vertex_size; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 4e0320a226d..de33d250301 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2160,6 +2160,13 @@ static void *si_create_shader_selector(struct pipe_context *ctx, } } sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; + sel->lshs_vertex_stride = sel->esgs_itemsize; + + /* Add 1 dword to reduce LDS bank conflicts, so that each vertex + * will start on a different bank. (except for the maximum 32*16). + */ + if (sel->lshs_vertex_stride < 32*16) + sel->lshs_vertex_stride += 4; /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank * conflicts, i.e. each vertex will start at a different bank.