From 6fdef7d26569c1c8bfebcd5d16749ef094b01982 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Nicolai=20H=C3=A4hnle?= Date: Wed, 30 Nov 2016 12:25:45 +0100 Subject: [PATCH] radeonsi: shrink each vertex stream to the actually required size MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák --- src/gallium/drivers/radeonsi/si_shader.c | 28 +++++++++----- .../drivers/radeonsi/si_state_shaders.c | 37 +++++++++++-------- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 9b495925a6e..72cf8270c78 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -5842,6 +5842,7 @@ static void preload_ring_buffers(struct si_shader_context *ctx) ctx->gsvs_ring[0] = build_indexed_load_const(ctx, buf_ptr, offset); } else if (ctx->type == PIPE_SHADER_GEOMETRY) { + const struct si_shader_selector *sel = ctx->shader->selector; struct lp_build_context *uint = &ctx->soa.bld_base.uint_bld; LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS); LLVMValueRef base_ring; @@ -5857,33 +5858,40 @@ static void preload_ring_buffers(struct si_shader_context *ctx) * Override the buffer descriptor accordingly. */ LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2); - unsigned max_gsvs_emit_size = ctx->shader->selector->max_gsvs_emit_size; - unsigned num_records; - - num_records = 64; - if (ctx->screen->b.chip_class >= VI) - num_records *= max_gsvs_emit_size; + uint64_t stream_offset = 0; for (unsigned stream = 0; stream < 4; ++stream) { + unsigned num_components; + unsigned stride; + unsigned num_records; LLVMValueRef ring, tmp; - if (!ctx->shader->selector->info.num_stream_output_components[stream]) + num_components = sel->info.num_stream_output_components[stream]; + if (!num_components) continue; + stride = 4 * num_components * sel->gs_max_out_vertices; + /* Limit on the stride field for <= CIK. */ - assert(max_gsvs_emit_size < (1 << 14)); + assert(stride < (1 << 14)); + + num_records = 64; + if (ctx->screen->b.chip_class >= VI) + num_records *= stride; ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); tmp = LLVMBuildExtractElement(builder, ring, uint->zero, ""); tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->i64, - max_gsvs_emit_size * 64 * stream, 0), ""); + stream_offset, 0), ""); + stream_offset += stride * 64; + ring = LLVMBuildInsertElement(builder, ring, tmp, uint->zero, ""); ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); tmp = LLVMBuildExtractElement(builder, ring, uint->one, ""); tmp = LLVMBuildOr(builder, tmp, LLVMConstInt(ctx->i32, - S_008F04_STRIDE(max_gsvs_emit_size) | + S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), ""); ring = LLVMBuildInsertElement(builder, ring, tmp, uint->one, ""); ring = LLVMBuildInsertElement(builder, ring, diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 1e9f5f0a217..151ed17b1d4 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -468,15 +468,13 @@ static uint32_t si_vgt_gs_mode(struct si_shader_selector *sel) static void si_shader_gs(struct si_shader *shader) { - unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size; - unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2; - unsigned gs_num_invocations = shader->selector->gs_num_invocations; + struct si_shader_selector *sel = shader->selector; + const ubyte *num_components = sel->info.num_stream_output_components; + unsigned gs_num_invocations = sel->gs_num_invocations; struct si_pm4_state *pm4; uint64_t va; - unsigned max_stream = shader->selector->max_gs_stream; - - /* The GSVS_RING_ITEMSIZE register takes 15 bits */ - assert(gsvs_itemsize < (1 << 15)); + unsigned max_stream = sel->max_gs_stream; + unsigned offset; pm4 = si_get_shader_pm4_state(shader); if (!pm4) @@ -484,18 +482,27 @@ static void si_shader_gs(struct si_shader *shader) si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(shader->selector)); - si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize); - si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1)); - si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1)); + offset = num_components[0] * sel->gs_max_out_vertices; + si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, offset); + if (max_stream >= 1) + offset += num_components[1] * sel->gs_max_out_vertices; + si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, offset); + if (max_stream >= 2) + offset += num_components[2] * sel->gs_max_out_vertices; + si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset); + if (max_stream >= 3) + offset += num_components[3] * sel->gs_max_out_vertices; + si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset); - si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1)); + /* The GSVS_RING_ITEMSIZE register takes 15 bits */ + assert(offset < (1 << 15)); si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, shader->selector->gs_max_out_vertices); - si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2); - si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0); - si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? gs_vert_itemsize >> 2 : 0); - si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? gs_vert_itemsize >> 2 : 0); + si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, num_components[0]); + si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? num_components[1] : 0); + si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? num_components[2] : 0); + si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? num_components[3] : 0); si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT, S_028B90_CNT(MIN2(gs_num_invocations, 127)) | -- 2.30.2