radeonsi: reduce LDS stalls by 40% for tessellation
author Marek Olšák <marek.olsak@amd.com>
Fri, 13 Jul 2018 04:23:36 +0000 (00:23 -0400)
committer Marek Olšák <marek.olsak@amd.com>
Tue, 24 Jul 2018 00:23:52 +0000 (20:23 -0400)
40% is the decrease in the LGKM counter (which includes SMEM too)
for the GFX9 LSHS stage.

This will make the LDS size slightly larger, but I wasn't able to increase
the patch stride without corruption, so I'm increasing the vertex stride.

src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_state_draw.c
src/gallium/drivers/radeonsi/si_state_shaders.c

index 5dc12d87243631c4f8e667c85aa817c60924919a..43ba23ff49483aa89bad7a00bd10d5580f875a97 100644 (file)
@@ -417,14 +417,14 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 
        switch (ctx->type) {
        case PIPE_SHADER_VERTEX:
-               stride = util_last_bit64(ctx->shader->selector->outputs_written);
-               return LLVMConstInt(ctx->i32, stride * 4, 0);
+               stride = ctx->shader->selector->lshs_vertex_stride / 4;
+               return LLVMConstInt(ctx->i32, stride, 0);
 
        case PIPE_SHADER_TESS_CTRL:
                if (ctx->screen->info.chip_class >= GFX9 &&
                    ctx->shader->is_monolithic) {
-                       stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written);
-                       return LLVMConstInt(ctx->i32, stride * 4, 0);
+                       stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
+                       return LLVMConstInt(ctx->i32, stride, 0);
                }
                return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 
index a2fb48ab023489afc0e5aec1fec2e1f29b645200..2dc4bc7e78705b6a097b66ce947968042b17f0ff 100644 (file)
@@ -368,7 +368,8 @@ struct si_shader_selector {
        ubyte           culldist_mask;
 
        /* ES parameters. */
-       unsigned        esgs_itemsize;
+       unsigned        esgs_itemsize; /* vertex stride */
+       unsigned        lshs_vertex_stride;
 
        /* GS parameters. */
        unsigned        gs_input_verts_per_prim;
index f35f73a37ce493e3526049387275c376260ce63c..d901401f0bbe08ef726217a71a0362307f8999cb 100644 (file)
@@ -134,7 +134,7 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
                num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
        }
 
-       input_vertex_size = num_tcs_inputs * 16;
+       input_vertex_size = ls->lshs_vertex_stride;
        output_vertex_size = num_tcs_outputs * 16;
 
        input_patch_size = num_tcs_input_cp * input_vertex_size;
index 4e0320a226d823dc277ed040c23ea69206e5a006..de33d2503018c7c16829a7dea9dabfc810aab907 100644 (file)
@@ -2160,6 +2160,13 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
                        }
                }
                sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
+               sel->lshs_vertex_stride = sel->esgs_itemsize;
+
+               /* Add 1 dword to reduce LDS bank conflicts, so that each vertex
+                * will start on a different bank. (except for the maximum 32*16).
+                */
+               if (sel->lshs_vertex_stride < 32*16)
+                       sel->lshs_vertex_stride += 4;
 
                /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
                 * conflicts, i.e. each vertex will start at a different bank.