radeonsi: shrink each vertex stream to the actually required size
author: Nicolai Hähnle <nicolai.haehnle@amd.com>
Wed, 30 Nov 2016 11:25:45 +0000 (12:25 +0100)
committer: Nicolai Hähnle <nicolai.haehnle@amd.com>
Mon, 12 Dec 2016 08:05:13 +0000 (09:05 +0100)
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_state_shaders.c

index 9b495925a6e6616d9eb31d822790296810672cb7..72cf8270c7800b784ca3d2e6faae33e4538d8b1e 100644 (file)
@@ -5842,6 +5842,7 @@ static void preload_ring_buffers(struct si_shader_context *ctx)
                ctx->gsvs_ring[0] =
                        build_indexed_load_const(ctx, buf_ptr, offset);
        } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
+               const struct si_shader_selector *sel = ctx->shader->selector;
                struct lp_build_context *uint = &ctx->soa.bld_base.uint_bld;
                LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
                LLVMValueRef base_ring;
@@ -5857,33 +5858,40 @@ static void preload_ring_buffers(struct si_shader_context *ctx)
                 * Override the buffer descriptor accordingly.
                 */
                LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
-               unsigned max_gsvs_emit_size = ctx->shader->selector->max_gsvs_emit_size;
-               unsigned num_records;
-
-               num_records = 64;
-               if (ctx->screen->b.chip_class >= VI)
-                       num_records *= max_gsvs_emit_size;
+               uint64_t stream_offset = 0;
 
                for (unsigned stream = 0; stream < 4; ++stream) {
+                       unsigned num_components;
+                       unsigned stride;
+                       unsigned num_records;
                        LLVMValueRef ring, tmp;
 
-                       if (!ctx->shader->selector->info.num_stream_output_components[stream])
+                       num_components = sel->info.num_stream_output_components[stream];
+                       if (!num_components)
                                continue;
 
+                       stride = 4 * num_components * sel->gs_max_out_vertices;
+
                        /* Limit on the stride field for <= CIK. */
-                       assert(max_gsvs_emit_size < (1 << 14));
+                       assert(stride < (1 << 14));
+
+                       num_records = 64;
+                       if (ctx->screen->b.chip_class >= VI)
+                               num_records *= stride;
 
                        ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
                        tmp = LLVMBuildExtractElement(builder, ring, uint->zero, "");
                        tmp = LLVMBuildAdd(builder, tmp,
                                           LLVMConstInt(ctx->i64,
-                                                       max_gsvs_emit_size * 64 * stream, 0), "");
+                                                       stream_offset, 0), "");
+                       stream_offset += stride * 64;
+
                        ring = LLVMBuildInsertElement(builder, ring, tmp, uint->zero, "");
                        ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
                        tmp = LLVMBuildExtractElement(builder, ring, uint->one, "");
                        tmp = LLVMBuildOr(builder, tmp,
                                LLVMConstInt(ctx->i32,
-                                            S_008F04_STRIDE(max_gsvs_emit_size) |
+                                            S_008F04_STRIDE(stride) |
                                             S_008F04_SWIZZLE_ENABLE(1), 0), "");
                        ring = LLVMBuildInsertElement(builder, ring, tmp, uint->one, "");
                        ring = LLVMBuildInsertElement(builder, ring,
index 1e9f5f0a217b3d04261fa3ef88e949942104d3e5..151ed17b1d41e1f3e166e641866a027cfb973878 100644 (file)
@@ -468,15 +468,13 @@ static uint32_t si_vgt_gs_mode(struct si_shader_selector *sel)
 
 static void si_shader_gs(struct si_shader *shader)
 {
-       unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size;
-       unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2;
-       unsigned gs_num_invocations = shader->selector->gs_num_invocations;
+       struct si_shader_selector *sel = shader->selector;
+       const ubyte *num_components = sel->info.num_stream_output_components;
+       unsigned gs_num_invocations = sel->gs_num_invocations;
        struct si_pm4_state *pm4;
        uint64_t va;
-       unsigned max_stream = shader->selector->max_gs_stream;
-
-       /* The GSVS_RING_ITEMSIZE register takes 15 bits */
-       assert(gsvs_itemsize < (1 << 15));
+       unsigned max_stream = sel->max_gs_stream;
+       unsigned offset;
 
        pm4 = si_get_shader_pm4_state(shader);
        if (!pm4)
@@ -484,18 +482,27 @@ static void si_shader_gs(struct si_shader *shader)
 
        si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(shader->selector));
 
-       si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize);
-       si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
-       si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
+       offset = num_components[0] * sel->gs_max_out_vertices;
+       si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, offset);
+       if (max_stream >= 1)
+               offset += num_components[1] * sel->gs_max_out_vertices;
+       si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, offset);
+       if (max_stream >= 2)
+               offset += num_components[2] * sel->gs_max_out_vertices;
+       si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset);
+       if (max_stream >= 3)
+               offset += num_components[3] * sel->gs_max_out_vertices;
+       si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
 
-       si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
+       /* The GSVS_RING_ITEMSIZE register takes 15 bits */
+       assert(offset < (1 << 15));
 
        si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, shader->selector->gs_max_out_vertices);
 
-       si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2);
-       si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0);
-       si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? gs_vert_itemsize >> 2 : 0);
-       si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? gs_vert_itemsize >> 2 : 0);
+       si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, num_components[0]);
+       si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? num_components[1] : 0);
+       si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? num_components[2] : 0);
+       si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? num_components[3] : 0);
 
        si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT,
                       S_028B90_CNT(MIN2(gs_num_invocations, 127)) |