radeonsi/gfx10: enable GS fast launch for triangles and strips with NGG culling
author: Marek Olšák <marek.olsak@amd.com>
author date: Thu, 9 Jan 2020 01:21:04 +0000 (20:21 -0500)
committer: Marek Olšák <marek.olsak@amd.com>
commit date: Mon, 20 Jan 2020 21:16:11 +0000 (16:16 -0500)
Only non-indexed triangle lists and strips are supported. This increases
performance if there is something to cull.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_state_draw.c
src/gallium/drivers/radeonsi/si_state_shaders.c

index 9d50409bf3943dd159385634becdcb936020e10e..02d51ec7d5bf1196b8b7850f353c34bafde9fc22 100644 (file)
@@ -667,6 +667,20 @@ static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx,
        return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
 }
 
+static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx,
+                                         LLVMValueRef ret, struct ac_arg param,
+                                         unsigned return_index)
+{
+       LLVMValueRef v = ac_get_arg(&ctx->ac, param);
+
+       for (unsigned i = 0; i < 4; i++) {
+               ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
+                                          ac_llvm_extract_elem(&ctx->ac, v, i),
+                                          return_index + i, "");
+       }
+       return ret;
+}
+
 static void load_bitmasks_2x64(struct si_shader_context *ctx,
                               LLVMValueRef lds_ptr, unsigned dw_offset,
                               LLVMValueRef mask[2], LLVMValueRef *total_bitcount)
@@ -874,10 +888,18 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
         * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
         */
 
-       LLVMValueRef vtxindex[] = {
-               si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16),
-               si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16),
-               si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16),
+       LLVMValueRef vtxindex[3];
+       if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
+               /* For the GS fast launch, the VS prolog simply puts the Vertex IDs
+                * into these VGPRs.
+                */
+               vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
+               vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
+               vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
+       } else {
+               vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
+               vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
+               vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
        };
        LLVMValueRef gs_vtxptr[] = {
                ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
@@ -1143,6 +1165,11 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
                                          8 + SI_SGPR_DRAWID);
                ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers,
                                          8 + SI_VS_NUM_USER_SGPR);
+
+               for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
+                       ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
+                                                   8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
+               }
        } else {
                assert(ctx->type == PIPE_SHADER_TESS_EVAL);
                ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout,
@@ -1152,10 +1179,16 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
        }
 
        unsigned vgpr;
-       if (ctx->type == PIPE_SHADER_VERTEX)
-               vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
-       else
+       if (ctx->type == PIPE_SHADER_VERTEX) {
+               if (shader->selector->num_vbos_in_user_sgprs) {
+                       vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
+                              shader->selector->num_vbos_in_user_sgprs * 4;
+               } else {
+                       vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
+               }
+       } else {
                vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
+       }
 
        val = LLVMBuildLoad(builder, new_vgpr0, "");
        ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
@@ -1986,8 +2019,16 @@ void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
 
        /* All these are per subgroup: */
        bool max_vert_out_per_gs_instance = false;
-       unsigned max_esverts_base = 128;
        unsigned max_gsprims_base = 128; /* default prim group size clamp */
+       unsigned max_esverts_base = 128;
+
+       if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
+               max_gsprims_base = 128 / 3;
+               max_esverts_base = max_gsprims_base * 3;
+       } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
+               max_gsprims_base = 126;
+               max_esverts_base = 128;
+       }
 
        /* Hardware has the following non-natural restrictions on the value
         * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
index 0a9c787dd7623da518b7266798534d2de3cd8b52..a7c885dda64e0bfc51c8e3eb9e97120dd92ddd61 100644 (file)
@@ -802,7 +802,7 @@ union si_vgt_param_key {
        uint32_t index;
 };
 
-#define SI_NUM_VGT_STAGES_KEY_BITS 5
+#define SI_NUM_VGT_STAGES_KEY_BITS 6
 #define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS)
 
 /* The VGT_SHADER_STAGES key used to index the table of precomputed values.
@@ -813,6 +813,7 @@ union si_vgt_stages_key {
 #if UTIL_ARCH_LITTLE_ENDIAN
                unsigned tess:1;
                unsigned gs:1;
+               unsigned ngg_gs_fast_launch:1;
                unsigned ngg_passthrough:1;
                unsigned ngg:1; /* gfx10+ */
                unsigned streamout:1; /* only used with NGG */
@@ -822,6 +823,7 @@ union si_vgt_stages_key {
                unsigned streamout:1;
                unsigned ngg:1;
                unsigned ngg_passthrough:1;
+               unsigned ngg_gs_fast_launch:1;
                unsigned gs:1;
                unsigned tess:1;
 #endif
index e54b9fb97ba4567c4ea2feab45a541cf8fe7695a..daaf7722942c7a6e9e2a27e903287310cb4d9cf0 100644 (file)
@@ -1474,11 +1474,20 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
                     ctx->type == PIPE_SHADER_TESS_EVAL)) {
                        unsigned num_user_sgprs, num_vgprs;
 
-                       /* For the NGG cull shader, add 1 SGPR to hold the vertex buffer pointer. */
-                       if (ctx->type == PIPE_SHADER_VERTEX)
+                       if (ctx->type == PIPE_SHADER_VERTEX) {
+                               /* For the NGG cull shader, add 1 SGPR to hold
+                                * the vertex buffer pointer.
+                                */
                                num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
-                       else
+
+                               if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
+                                       assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
+                                       num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
+                                                        shader->selector->num_vbos_in_user_sgprs * 4;
+                               }
+                       } else {
                                num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+                       }
 
                        /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
                         *
@@ -2278,13 +2287,16 @@ static void si_init_exec_from_input(struct si_shader_context *ctx,
 }
 
 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
-                              const struct si_vs_prolog_bits *key)
+                              const struct si_vs_prolog_bits *prolog_key,
+                              const struct si_shader_key *key,
+                              bool ngg_cull_shader)
 {
        /* VGPR initialization fixup for Vega10 and Raven is always done in the
         * VS prolog. */
        return sel->vs_needs_prolog ||
-              key->ls_vgpr_fix ||
-              key->unpack_instance_id_from_vertex_id;
+              prolog_key->ls_vgpr_fix ||
+              prolog_key->unpack_instance_id_from_vertex_id ||
+              (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
 }
 
 static bool si_build_main_function(struct si_shader_context *ctx,
@@ -2436,7 +2448,8 @@ static bool si_build_main_function(struct si_shader_context *ctx,
                    (shader->key.as_es || shader->key.as_ls) &&
                    (ctx->type == PIPE_SHADER_TESS_EVAL ||
                     (ctx->type == PIPE_SHADER_VERTEX &&
-                     !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
+                     !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+                                         &shader->key, ngg_cull_shader)))) {
                        si_init_exec_from_input(ctx,
                                                ctx->merged_wave_info, 0);
                } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
@@ -2551,8 +2564,14 @@ static void si_get_vs_prolog_key(const struct si_shader_info *info,
        key->vs_prolog.as_es = shader_out->key.as_es;
        key->vs_prolog.as_ngg = shader_out->key.as_ngg;
 
-       if (!ngg_cull_shader)
+       if (ngg_cull_shader) {
+               key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling &
+                                                           SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
+               key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling &
+                                                            SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
+       } else {
                key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
+       }
 
        if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
                key->vs_prolog.as_ls = 1;
@@ -2937,11 +2956,12 @@ int si_compile_shader(struct si_screen *sscreen,
        if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
                LLVMValueRef parts[4];
                unsigned num_parts = 0;
-               bool need_prolog = si_vs_needs_prolog(sel, &shader->key.part.vs.prolog);
+               bool has_prolog = false;
                LLVMValueRef main_fn = ctx.main_fn;
 
                if (ngg_cull_main_fn) {
-                       if (need_prolog) {
+                       if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+                                              &shader->key, true)) {
                                union si_shader_part_key prolog_key;
                                si_get_vs_prolog_key(&sel->info,
                                                     shader->info.num_input_sgprs,
@@ -2951,11 +2971,13 @@ int si_compile_shader(struct si_screen *sscreen,
                                prolog_key.vs_prolog.is_monolithic = true;
                                si_build_vs_prolog_function(&ctx, &prolog_key);
                                parts[num_parts++] = ctx.main_fn;
+                               has_prolog = true;
                        }
                        parts[num_parts++] = ngg_cull_main_fn;
                }
 
-               if (need_prolog) {
+               if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+                                      &shader->key, false)) {
                        union si_shader_part_key prolog_key;
                        si_get_vs_prolog_key(&sel->info,
                                             shader->info.num_input_sgprs,
@@ -2965,11 +2987,12 @@ int si_compile_shader(struct si_screen *sscreen,
                        prolog_key.vs_prolog.is_monolithic = true;
                        si_build_vs_prolog_function(&ctx, &prolog_key);
                        parts[num_parts++] = ctx.main_fn;
+                       has_prolog = true;
                }
                parts[num_parts++] = main_fn;
 
                si_build_wrapper_function(&ctx, parts, num_parts,
-                                         need_prolog ? 1 : 0, 0);
+                                         has_prolog ? 1 : 0, 0);
 
                if (ctx.shader->key.opt.vs_as_prim_discard_cs)
                        si_build_prim_discard_compute_shader(&ctx);
@@ -2986,7 +3009,8 @@ int si_compile_shader(struct si_screen *sscreen,
                        struct si_shader_selector *ls = shader->key.part.tcs.ls;
                        LLVMValueRef parts[4];
                        bool vs_needs_prolog =
-                               si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
+                               si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog,
+                                                  &shader->key, false);
 
                        /* TCS main part */
                        parts[2] = ctx.main_fn;
@@ -3086,7 +3110,8 @@ int si_compile_shader(struct si_screen *sscreen,
 
                        /* ES prolog */
                        if (es->type == PIPE_SHADER_VERTEX &&
-                           si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog)) {
+                           si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog,
+                                              &shader->key, false)) {
                                union si_shader_part_key vs_prolog_key;
                                si_get_vs_prolog_key(&es->info,
                                                     shader_es.info.num_input_sgprs,
@@ -3391,6 +3416,72 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                }
        }
 
+       if (key->vs_prolog.gs_fast_launch_tri_list ||
+           key->vs_prolog.gs_fast_launch_tri_strip) {
+               LLVMValueRef wave_id, thread_id_in_tg;
+
+               wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
+               thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
+                                               LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
+                                               ac_get_thread_id(&ctx->ac));
+
+               /* The GS fast launch initializes all VGPRs to the value of
+                * the first thread, so we have to add the thread ID.
+                *
+                * Only these are initialized by the hw:
+                *   VGPR2: Base Primitive ID
+                *   VGPR5: Base Vertex ID
+                *   VGPR6: Instance ID
+                */
+
+               /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
+                * The NGG cull shader will read them from there.
+                */
+               if (key->vs_prolog.gs_fast_launch_tri_list) {
+                       input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
+                                                      LLVMConstInt(ctx->i32, 3, 0), /* Vertex 0 */
+                                                      LLVMConstInt(ctx->i32, 0, 0));
+                       input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
+                                                      LLVMConstInt(ctx->i32, 3, 0), /* Vertex 1 */
+                                                      LLVMConstInt(ctx->i32, 1, 0));
+                       input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
+                                                      LLVMConstInt(ctx->i32, 3, 0), /* Vertex 2 */
+                                                      LLVMConstInt(ctx->i32, 2, 0));
+               } else {
+                       assert(key->vs_prolog.gs_fast_launch_tri_strip);
+                       LLVMBuilderRef builder = ctx->ac.builder;
+                       /* Triangle indices: */
+                       LLVMValueRef index[3] = {
+                               thread_id_in_tg,
+                               LLVMBuildAdd(builder, thread_id_in_tg,
+                                            LLVMConstInt(ctx->i32, 1, 0), ""),
+                               LLVMBuildAdd(builder, thread_id_in_tg,
+                                            LLVMConstInt(ctx->i32, 2, 0), ""),
+                       };
+                       LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
+                                                            thread_id_in_tg, ctx->i1, "");
+                       LLVMValueRef flatshade_first =
+                               LLVMBuildICmp(builder, LLVMIntEQ,
+                                             si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
+                                             ctx->i32_0, "");
+
+                       ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
+                                                                   flatshade_first, index);
+                       input_vgprs[0] = index[0];
+                       input_vgprs[1] = index[1];
+                       input_vgprs[4] = index[2];
+               }
+
+               /* Triangles always have all edge flags set initially. */
+               input_vgprs[3] = LLVMConstInt(ctx->i32, 0x7 << 8, 0);
+
+               input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
+                                             thread_id_in_tg, ""); /* PrimID */
+               input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
+                                             thread_id_in_tg, ""); /* VertexID */
+               input_vgprs[8] = input_vgprs[6]; /* InstanceID */
+       }
+
        unsigned vertex_id_vgpr = first_vs_vgpr;
        unsigned instance_id_vgpr =
                ctx->screen->info.chip_class >= GFX10 ?
@@ -3498,7 +3589,7 @@ static bool si_get_vs_prolog(struct si_screen *sscreen,
 {
        struct si_shader_selector *vs = main_part->selector;
 
-       if (!si_vs_needs_prolog(vs, key))
+       if (!si_vs_needs_prolog(vs, key, &shader->key, false))
                return true;
 
        /* Get the prolog. */
index ee1ca9cda1d41e429620c76ac54575d64e6864b0..3a1d0e44290130aabf8dd5b6f7f29d02c6d718c4 100644 (file)
@@ -273,9 +273,12 @@ enum {
        SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
 };
 
-#define SI_NGG_CULL_VIEW_SMALLPRIMS    (1 << 0) /* view.xy + small prims */
-#define SI_NGG_CULL_BACK_FACE          (1 << 1) /* back faces */
-#define SI_NGG_CULL_FRONT_FACE         (1 << 2) /* front faces */
+#define SI_NGG_CULL_VIEW_SMALLPRIMS            (1 << 0) /* view.xy + small prims */
+#define SI_NGG_CULL_BACK_FACE                  (1 << 1) /* back faces */
+#define SI_NGG_CULL_FRONT_FACE                 (1 << 2) /* front faces */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST    (1 << 3) /* GS fast launch: triangles */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP   (1 << 4) /* GS fast launch: triangle strip */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL         (0x3 << 3) /* GS fast launch (both prim types) */
 
 /**
  * For VS shader keys, describe any fixups required for vertex fetch.
@@ -564,6 +567,8 @@ union si_shader_part_key {
                unsigned        as_es:1;
                unsigned        as_ngg:1;
                unsigned        has_ngg_cull_inputs:1; /* from the NGG cull shader */
+               unsigned        gs_fast_launch_tri_list:1; /* for NGG culling */
+               unsigned        gs_fast_launch_tri_strip:1; /* for NGG culling */
                /* Prologs for monolithic shaders shouldn't set EXEC. */
                unsigned        is_monolithic:1;
        } vs_prolog;
@@ -655,7 +660,7 @@ struct si_shader_key {
                unsigned        clip_disable:1;
 
                /* For NGG VS and TES. */
-               unsigned        ngg_culling:3; /* SI_NGG_CULL_* */
+               unsigned        ngg_culling:5; /* SI_NGG_CULL_* */
 
                /* For shaders where monolithic variants have better code.
                 *
index 7f7398ff7f589b986685418b0761d3e43bc979a5..2f87896ead6c778718d83dd70ab12c1345bc5e1a 100644 (file)
@@ -2042,12 +2042,11 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
        if (sctx->ngg &&
            rast_prim == PIPE_PRIM_TRIANGLES &&
            (sctx->screen->always_use_ngg_culling ||
-            /* At least 1500 non-indexed triangles (4500 vertices) are needed
-             * per draw call (no TES/GS) to enable NGG culling. Triangle strips
-             * don't need this, because they have good reuse and therefore
-             * perform the same as indexed triangles.
+            /* At least 1024 non-indexed vertices (8 subgroups) are needed
+             * per draw call (no TES/GS) to enable NGG culling.
              */
-            (!index_size && prim == PIPE_PRIM_TRIANGLES && direct_count > 4500 &&
+            (!index_size && direct_count >= 1024 &&
+             (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) &&
              !sctx->tes_shader.cso && !sctx->gs_shader.cso)) &&
            si_get_vs(sctx)->cso->ngg_culling_allowed) {
                unsigned ngg_culling = 0;
@@ -2068,6 +2067,18 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
                        if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
                                ngg_culling |= SI_NGG_CULL_BACK_FACE;
                }
+
+               /* Use NGG fast launch for certain non-indexed primitive types.
+                * A draw must have at least 1 full primitive.
+                */
+               if (ngg_culling && !index_size && direct_count >= 3 &&
+                   !sctx->tes_shader.cso && !sctx->gs_shader.cso) {
+                       if (prim == PIPE_PRIM_TRIANGLES)
+                               ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
+                       else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
+                               ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;
+               }
+
                if (ngg_culling != sctx->ngg_culling) {
                        sctx->ngg_culling = ngg_culling;
                        sctx->do_update_shaders = true;
index 1b8450c0a8ea10057c3dbe84b4cfd54089b9eddc..d270ae7c31aa43da897880585c3296772a0d9379 100644 (file)
@@ -1234,6 +1234,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                late_alloc_wave64 = 0;
        else if (num_cu_per_sh <= 6)
                late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
+       else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
+               late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
        else
                late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
 
@@ -1316,26 +1318,36 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
        shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(1) |
                                          S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
 
-       shader->ge_cntl =
-               S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
-               S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
-               S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
+       if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
+               shader->ge_cntl =
+                       S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+                       S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
+       } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
+               shader->ge_cntl =
+                       S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+                       S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
+       } else {
+               shader->ge_cntl =
+                       S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+                       S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
+                       S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
 
-       /* Bug workaround for a possible hang with non-tessellation cases.
-        * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
-        *
-        * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
-        */
-       if ((sscreen->info.family == CHIP_NAVI10 ||
-            sscreen->info.family == CHIP_NAVI12 ||
-            sscreen->info.family == CHIP_NAVI14) &&
-           (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
-           shader->ngg.hw_max_esverts != 256) {
-               shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
-
-               if (shader->ngg.hw_max_esverts > 5) {
-                       shader->ge_cntl |=
-                               S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
+               /* Bug workaround for a possible hang with non-tessellation cases.
+                * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
+                *
+                * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
+                */
+               if ((sscreen->info.family == CHIP_NAVI10 ||
+                    sscreen->info.family == CHIP_NAVI12 ||
+                    sscreen->info.family == CHIP_NAVI14) &&
+                   (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
+                   shader->ngg.hw_max_esverts != 256) {
+                       shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
+
+                       if (shader->ngg.hw_max_esverts > 5) {
+                               shader->ge_cntl |=
+                                       S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
+                       }
                }
        }
 
@@ -3954,6 +3966,7 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
 
        if (key.u.ngg) {
                stages |= S_028B54_PRIMGEN_EN(1) |
+                         S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) |
                          S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
                          S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough);
        } else if (key.u.gs)
@@ -4109,8 +4122,13 @@ bool si_update_shaders(struct si_context *sctx)
        }
 
        /* This must be done after the shader variant is selected. */
-       if (sctx->ngg)
-               key.u.ngg_passthrough = gfx10_is_ngg_passthrough(si_get_vs(sctx)->current);
+       if (sctx->ngg) {
+               struct si_shader *vs = si_get_vs(sctx)->current;
+
+               key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
+               key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling &
+                                             SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
+       }
 
        si_update_vgt_shader_config(sctx, key);