radeonsi/gfx10: export primitives at the beginning of VS/TES
author Marek Olšák <marek.olsak@amd.com>
Fri, 3 Jan 2020 21:59:20 +0000 (16:59 -0500)
committer Marek Olšák <marek.olsak@amd.com>
Mon, 20 Jan 2020 21:16:11 +0000 (16:16 -0500)
This decreases VGPR usage and will allow us to merge some IF blocks
in shaders.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader_internal.h

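diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 2f6f9fe3cc2a384df05df8cee03481b96c44f394..b8c34634cbee826a437bef105a359c9f9c67ce9d 100644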
@@ -123,6 +123,16 @@ static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx,
        }
 }
 
+bool gfx10_ngg_export_prim_early(struct si_shader *shader)
+{
+       struct si_shader_selector *sel = shader->selector;
+
+       assert(shader->key.as_ngg && !shader->key.as_es);
+
+       return sel->type != PIPE_SHADER_GEOMETRY &&
+              !sel->info.writes_edgeflag;
+}
+
 void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx)
 {
        ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
@@ -130,6 +140,49 @@ void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx)
                                      ngg_get_prim_cnt(ctx));
 }
 
+void gfx10_ngg_build_export_prim(struct si_shader_context *ctx,
+                                LLVMValueRef user_edgeflags[3])
+{
+       if (gfx10_is_ngg_passthrough(ctx->shader)) {
+               ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
+               {
+                       struct ac_ngg_prim prim = {};
+
+                       prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
+                       ac_build_export_prim(&ctx->ac, &prim);
+               }
+               ac_build_endif(&ctx->ac, 6001);
+               return;
+       }
+
+       ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
+       {
+               struct ac_ngg_prim prim = {};
+
+               ngg_get_vertices_per_prim(ctx, &prim.num_vertices);
+
+               prim.isnull = ctx->ac.i1false;
+               prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
+               prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
+               prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
+
+               for (unsigned i = 0; i < prim.num_vertices; ++i) {
+                       prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);
+
+                       if (ctx->shader->selector->info.writes_edgeflag) {
+                               LLVMValueRef edge;
+
+                               edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], "");
+                               edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, "");
+                               prim.edgeflag[i] = edge;
+                       }
+               }
+
+               ac_build_export_prim(&ctx->ac, &prim);
+       }
+       ac_build_endif(&ctx->ac, 6001);
+}
+
 static void build_streamout_vertex(struct si_shader_context *ctx,
                                   LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw,
                                   unsigned stream, LLVMValueRef offset_vtx,
@@ -689,31 +742,8 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
        }
 
        /* Build the primitive export. */
-       ac_build_ifcc(&ctx->ac, is_gs_thread, 6001);
-       {
-               struct ac_ngg_prim prim = {};
-
-               if (gfx10_is_ngg_passthrough(ctx->shader)) {
-                       prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
-               } else {
-                       prim.num_vertices = num_vertices;
-                       prim.isnull = ctx->ac.i1false;
-                       memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
-
-                       for (unsigned i = 0; i < num_vertices; ++i) {
-                               prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);
-
-                               if (sel->info.writes_edgeflag) {
-                                       tmp2 = LLVMBuildLoad(builder, user_edgeflags[i], "");
-                                       prim.edgeflag[i] = LLVMBuildAnd(builder, prim.edgeflag[i],
-                                                                       tmp2, "");
-                               }
-                       }
-               }
-
-               ac_build_export_prim(&ctx->ac, &prim);
-       }
-       ac_build_endif(&ctx->ac, 6001);
+       if (!gfx10_ngg_export_prim_early(ctx->shader))
+               gfx10_ngg_build_export_prim(ctx, user_edgeflags);
 
        /* Export per-vertex data (positions and parameters). */
        ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 592a486424ac821588ec13f964733a7c5ca9c31d..24f744ba5cd99ab388a283922dfcfe9491cb8beb 100644
@@ -2423,9 +2423,16 @@ static bool si_build_main_function(struct si_shader_context *ctx,
 
                        if ((ctx->type == PIPE_SHADER_VERTEX ||
                             ctx->type == PIPE_SHADER_TESS_EVAL) &&
-                           shader->key.as_ngg && !shader->key.as_es)
+                           shader->key.as_ngg && !shader->key.as_es) {
                                gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
 
+                               /* Build the primitive export at the beginning
+                                * of the shader if possible.
+                                */
+                               if (gfx10_ngg_export_prim_early(shader))
+                                       gfx10_ngg_build_export_prim(ctx, NULL);
+                       }
+
                        if (ctx->type == PIPE_SHADER_TESS_CTRL ||
                            ctx->type == PIPE_SHADER_GEOMETRY) {
                                if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
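diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index b8d2ac84fca54e55728ae0ff66d34257078e4d51..6509edb81814c82c4eaee262958669778562997d 100644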
@@ -299,7 +299,10 @@ void si_llvm_emit_streamout(struct si_shader_context *ctx,
                            unsigned noutput, unsigned stream);
 void si_create_function(struct si_shader_context *ctx);
 
+bool gfx10_ngg_export_prim_early(struct si_shader *shader);
 void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx);
+void gfx10_ngg_build_export_prim(struct si_shader_context *ctx,
+                                LLVMValueRef user_edgeflags[3]);
 void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
                             unsigned max_outputs,
                             LLVMValueRef *addrs);