From: Marek Olšák Date: Thu, 9 Jan 2020 01:21:04 +0000 (-0500) Subject: radeonsi/gfx10: enable GS fast launch for triangles and strips with NGG culling X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=735a3ba00765baa717ff541fb5aa5105dc816ad7;p=mesa.git radeonsi/gfx10: enable GS fast launch for triangles and strips with NGG culling Only non-indexed triangle lists and strips are supported. This increases performance if there is something to cull. Acked-by: Pierre-Eric Pelloux-Prayer --- diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 9d50409bf39..02d51ec7d5b 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -667,6 +667,20 @@ static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, ""); } +static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, + LLVMValueRef ret, struct ac_arg param, + unsigned return_index) +{ + LLVMValueRef v = ac_get_arg(&ctx->ac, param); + + for (unsigned i = 0; i < 4; i++) { + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + ac_llvm_extract_elem(&ctx->ac, v, i), + return_index + i, ""); + } + return ret; +} + static void load_bitmasks_2x64(struct si_shader_context *ctx, LLVMValueRef lds_ptr, unsigned dw_offset, LLVMValueRef mask[2], LLVMValueRef *total_bitcount) @@ -874,10 +888,18 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs). */ - LLVMValueRef vtxindex[] = { - si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16), - si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16), - si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16), + LLVMValueRef vtxindex[3]; + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) { + /* For the GS fast launch, the VS prologs simply puts the Vertex IDs + * into these VGPRs. 
+ */ + vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); + vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset); + vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset); + } else { + vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); + vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); + vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); }; LLVMValueRef gs_vtxptr[] = { ngg_nogs_vertex_ptr(ctx, vtxindex[0]), @@ -1143,6 +1165,11 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, 8 + SI_SGPR_DRAWID); ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers, 8 + SI_VS_NUM_USER_SGPR); + + for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) { + ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i], + 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4); + } } else { assert(ctx->type == PIPE_SHADER_TESS_EVAL); ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, @@ -1152,10 +1179,16 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, } unsigned vgpr; - if (ctx->type == PIPE_SHADER_VERTEX) - vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1; - else + if (ctx->type == PIPE_SHADER_VERTEX) { + if (shader->selector->num_vbos_in_user_sgprs) { + vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + + shader->selector->num_vbos_in_user_sgprs * 4; + } else { + vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1; + } + } else { vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; + } val = LLVMBuildLoad(builder, new_vgpr0, ""); ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), @@ -1986,8 +2019,16 @@ void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) /* All these are per subgroup: */ bool max_vert_out_per_gs_instance = false; - unsigned max_esverts_base = 128; unsigned max_gsprims_base = 128; /* default prim group size clamp */ + unsigned max_esverts_base = 128; + + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { + max_gsprims_base = 128 / 3; 
+ max_esverts_base = max_gsprims_base * 3; + } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { + max_gsprims_base = 126; + max_esverts_base = 128; + } /* Hardware has the following non-natural restrictions on the value * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 0a9c787dd76..a7c885dda64 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -802,7 +802,7 @@ union si_vgt_param_key { uint32_t index; }; -#define SI_NUM_VGT_STAGES_KEY_BITS 5 +#define SI_NUM_VGT_STAGES_KEY_BITS 6 #define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) /* The VGT_SHADER_STAGES key used to index the table of precomputed values. @@ -813,6 +813,7 @@ union si_vgt_stages_key { #if UTIL_ARCH_LITTLE_ENDIAN unsigned tess:1; unsigned gs:1; + unsigned ngg_gs_fast_launch:1; unsigned ngg_passthrough:1; unsigned ngg:1; /* gfx10+ */ unsigned streamout:1; /* only used with NGG */ @@ -822,6 +823,7 @@ union si_vgt_stages_key { unsigned streamout:1; unsigned ngg:1; unsigned ngg_passthrough:1; + unsigned ngg_gs_fast_launch:1; unsigned gs:1; unsigned tess:1; #endif diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index e54b9fb97ba..daaf7722942 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1474,11 +1474,20 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) ctx->type == PIPE_SHADER_TESS_EVAL)) { unsigned num_user_sgprs, num_vgprs; - /* For the NGG cull shader, add 1 SGPR to hold the vertex buffer pointer. */ - if (ctx->type == PIPE_SHADER_VERTEX) + if (ctx->type == PIPE_SHADER_VERTEX) { + /* For the NGG cull shader, add 1 SGPR to hold + * the vertex buffer pointer. 
+ */ num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader; - else + + if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) { + assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST); + num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST + + shader->selector->num_vbos_in_user_sgprs * 4; + } + } else { num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + } /* The NGG cull shader has to return all 9 VGPRs + the old thread ID. * @@ -2278,13 +2287,16 @@ static void si_init_exec_from_input(struct si_shader_context *ctx, } static bool si_vs_needs_prolog(const struct si_shader_selector *sel, - const struct si_vs_prolog_bits *key) + const struct si_vs_prolog_bits *prolog_key, + const struct si_shader_key *key, + bool ngg_cull_shader) { /* VGPR initialization fixup for Vega10 and Raven is always done in the * VS prolog. */ return sel->vs_needs_prolog || - key->ls_vgpr_fix || - key->unpack_instance_id_from_vertex_id; + prolog_key->ls_vgpr_fix || + prolog_key->unpack_instance_id_from_vertex_id || + (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); } static bool si_build_main_function(struct si_shader_context *ctx, @@ -2436,7 +2448,8 @@ static bool si_build_main_function(struct si_shader_context *ctx, (shader->key.as_es || shader->key.as_ls) && (ctx->type == PIPE_SHADER_TESS_EVAL || (ctx->type == PIPE_SHADER_VERTEX && - !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) { + !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, + &shader->key, ngg_cull_shader)))) { si_init_exec_from_input(ctx, ctx->merged_wave_info, 0); } else if (ctx->type == PIPE_SHADER_TESS_CTRL || @@ -2551,8 +2564,14 @@ static void si_get_vs_prolog_key(const struct si_shader_info *info, key->vs_prolog.as_es = shader_out->key.as_es; key->vs_prolog.as_ngg = shader_out->key.as_ngg; - if (!ngg_cull_shader) + if (ngg_cull_shader) { + key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling & + SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); + 
key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling & + SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); + } else { key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling; + } if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { key->vs_prolog.as_ls = 1; @@ -2937,11 +2956,12 @@ int si_compile_shader(struct si_screen *sscreen, if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { LLVMValueRef parts[4]; unsigned num_parts = 0; - bool need_prolog = si_vs_needs_prolog(sel, &shader->key.part.vs.prolog); + bool has_prolog = false; LLVMValueRef main_fn = ctx.main_fn; if (ngg_cull_main_fn) { - if (need_prolog) { + if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, + &shader->key, true)) { union si_shader_part_key prolog_key; si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, @@ -2951,11 +2971,13 @@ int si_compile_shader(struct si_screen *sscreen, prolog_key.vs_prolog.is_monolithic = true; si_build_vs_prolog_function(&ctx, &prolog_key); parts[num_parts++] = ctx.main_fn; + has_prolog = true; } parts[num_parts++] = ngg_cull_main_fn; } - if (need_prolog) { + if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, + &shader->key, false)) { union si_shader_part_key prolog_key; si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, @@ -2965,11 +2987,12 @@ int si_compile_shader(struct si_screen *sscreen, prolog_key.vs_prolog.is_monolithic = true; si_build_vs_prolog_function(&ctx, &prolog_key); parts[num_parts++] = ctx.main_fn; + has_prolog = true; } parts[num_parts++] = main_fn; si_build_wrapper_function(&ctx, parts, num_parts, - need_prolog ? 1 : 0, 0); + has_prolog ? 
1 : 0, 0); if (ctx.shader->key.opt.vs_as_prim_discard_cs) si_build_prim_discard_compute_shader(&ctx); @@ -2986,7 +3009,8 @@ int si_compile_shader(struct si_screen *sscreen, struct si_shader_selector *ls = shader->key.part.tcs.ls; LLVMValueRef parts[4]; bool vs_needs_prolog = - si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog); + si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, + &shader->key, false); /* TCS main part */ parts[2] = ctx.main_fn; @@ -3086,7 +3110,8 @@ int si_compile_shader(struct si_screen *sscreen, /* ES prolog */ if (es->type == PIPE_SHADER_VERTEX && - si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog)) { + si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, + &shader->key, false)) { union si_shader_part_key vs_prolog_key; si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, @@ -3391,6 +3416,72 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx, } } + if (key->vs_prolog.gs_fast_launch_tri_list || + key->vs_prolog.gs_fast_launch_tri_strip) { + LLVMValueRef wave_id, thread_id_in_tg; + + wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4); + thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id, + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), + ac_get_thread_id(&ctx->ac)); + + /* The GS fast launch initializes all VGPRs to the value of + * the first thread, so we have to add the thread ID. + * + * Only these are initialized by the hw: + * VGPR2: Base Primitive ID + * VGPR5: Base Vertex ID + * VGPR6: Instance ID + */ + + /* Put the vertex thread IDs into VGPRs as-is instead of packing them. + * The NGG cull shader will read them from there. 
+ */ + if (key->vs_prolog.gs_fast_launch_tri_list) { + input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */ + LLVMConstInt(ctx->i32, 3, 0), /* Vertex 0 */ + LLVMConstInt(ctx->i32, 0, 0)); + input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */ + LLVMConstInt(ctx->i32, 3, 0), /* Vertex 1 */ + LLVMConstInt(ctx->i32, 1, 0)); + input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */ + LLVMConstInt(ctx->i32, 3, 0), /* Vertex 2 */ + LLVMConstInt(ctx->i32, 2, 0)); + } else { + assert(key->vs_prolog.gs_fast_launch_tri_strip); + LLVMBuilderRef builder = ctx->ac.builder; + /* Triangle indices: */ + LLVMValueRef index[3] = { + thread_id_in_tg, + LLVMBuildAdd(builder, thread_id_in_tg, + LLVMConstInt(ctx->i32, 1, 0), ""), + LLVMBuildAdd(builder, thread_id_in_tg, + LLVMConstInt(ctx->i32, 2, 0), ""), + }; + LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, + thread_id_in_tg, ctx->i1, ""); + LLVMValueRef flatshade_first = + LLVMBuildICmp(builder, LLVMIntEQ, + si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), + ctx->i32_0, ""); + + ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, + flatshade_first, index); + input_vgprs[0] = index[0]; + input_vgprs[1] = index[1]; + input_vgprs[4] = index[2]; + } + + /* Triangles always have all edge flags set initially. */ + input_vgprs[3] = LLVMConstInt(ctx->i32, 0x7 << 8, 0); + + input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], + thread_id_in_tg, ""); /* PrimID */ + input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], + thread_id_in_tg, ""); /* VertexID */ + input_vgprs[8] = input_vgprs[6]; /* InstanceID */ + } + unsigned vertex_id_vgpr = first_vs_vgpr; unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10 ? 
@@ -3498,7 +3589,7 @@ static bool si_get_vs_prolog(struct si_screen *sscreen, { struct si_shader_selector *vs = main_part->selector; - if (!si_vs_needs_prolog(vs, key)) + if (!si_vs_needs_prolog(vs, key, &shader->key, false)) return true; /* Get the prolog. */ diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index ee1ca9cda1d..3a1d0e44290 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -273,9 +273,12 @@ enum { SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, }; -#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */ -#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ -#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ +#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */ +#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ +#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */ /** * For VS shader keys, describe any fixups required for vertex fetch. @@ -564,6 +567,8 @@ union si_shader_part_key { unsigned as_es:1; unsigned as_ngg:1; unsigned has_ngg_cull_inputs:1; /* from the NGG cull shader */ + unsigned gs_fast_launch_tri_list:1; /* for NGG culling */ + unsigned gs_fast_launch_tri_strip:1; /* for NGG culling */ /* Prologs for monolithic shaders shouldn't set EXEC. */ unsigned is_monolithic:1; } vs_prolog; @@ -655,7 +660,7 @@ struct si_shader_key { unsigned clip_disable:1; /* For NGG VS and TES. */ - unsigned ngg_culling:3; /* SI_NGG_CULL_* */ + unsigned ngg_culling:5; /* SI_NGG_CULL_* */ /* For shaders where monolithic variants have better code. 
* diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 7f7398ff7f5..2f87896ead6 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -2042,12 +2042,11 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (sctx->ngg && rast_prim == PIPE_PRIM_TRIANGLES && (sctx->screen->always_use_ngg_culling || - /* At least 1500 non-indexed triangles (4500 vertices) are needed - * per draw call (no TES/GS) to enable NGG culling. Triangle strips - * don't need this, because they have good reuse and therefore - * perform the same as indexed triangles. + /* At least 1024 non-indexed vertices (8 subgroups) are needed + * per draw call (no TES/GS) to enable NGG culling. */ - (!index_size && prim == PIPE_PRIM_TRIANGLES && direct_count > 4500 && + (!index_size && direct_count >= 1024 && + (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) && !sctx->tes_shader.cso && !sctx->gs_shader.cso)) && si_get_vs(sctx)->cso->ngg_culling_allowed) { unsigned ngg_culling = 0; @@ -2068,6 +2067,18 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back) ngg_culling |= SI_NGG_CULL_BACK_FACE; } + + /* Use NGG fast launch for certain non-indexed primitive types. + * A draw must have at least 1 full primitive. 
+ */ + if (ngg_culling && !index_size && direct_count >= 3 && + !sctx->tes_shader.cso && !sctx->gs_shader.cso) { + if (prim == PIPE_PRIM_TRIANGLES) + ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST; + else if (prim == PIPE_PRIM_TRIANGLE_STRIP) + ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP; + } + if (ngg_culling != sctx->ngg_culling) { sctx->ngg_culling = ngg_culling; sctx->do_update_shaders = true; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 1b8450c0a8e..d270ae7c31a 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1234,6 +1234,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader late_alloc_wave64 = 0; else if (num_cu_per_sh <= 6) late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */ + else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) + late_alloc_wave64 = (num_cu_per_sh - 2) * 6; else late_alloc_wave64 = (num_cu_per_sh - 2) * 4; @@ -1316,26 +1318,36 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(1) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1); - shader->ge_cntl = - S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ - S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { + shader->ge_cntl = + S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3); + } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { + shader->ge_cntl = + S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2); + } else { + shader->ge_cntl = + S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable 
vertex grouping */ + S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); - /* Bug workaround for a possible hang with non-tessellation cases. - * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0 - * - * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5 - */ - if ((sscreen->info.family == CHIP_NAVI10 || - sscreen->info.family == CHIP_NAVI12 || - sscreen->info.family == CHIP_NAVI14) && - (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */ - shader->ngg.hw_max_esverts != 256) { - shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; - - if (shader->ngg.hw_max_esverts > 5) { - shader->ge_cntl |= - S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); + /* Bug workaround for a possible hang with non-tessellation cases. + * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0 + * + * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5 + */ + if ((sscreen->info.family == CHIP_NAVI10 || + sscreen->info.family == CHIP_NAVI12 || + sscreen->info.family == CHIP_NAVI14) && + (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */ + shader->ngg.hw_max_esverts != 256) { + shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; + + if (shader->ngg.hw_max_esverts > 5) { + shader->ge_cntl |= + S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); + } } } @@ -3954,6 +3966,7 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, if (key.u.ngg) { stages |= S_028B54_PRIMGEN_EN(1) | + S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) | S_028B54_NGG_WAVE_ID_EN(key.u.streamout) | S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough); } else if (key.u.gs) @@ -4109,8 +4122,13 @@ bool si_update_shaders(struct si_context *sctx) } /* This must be done after the shader variant is selected. 
*/ - if (sctx->ngg) - key.u.ngg_passthrough = gfx10_is_ngg_passthrough(si_get_vs(sctx)->current); + if (sctx->ngg) { + struct si_shader *vs = si_get_vs(sctx)->current; + + key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs); + key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & + SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + } si_update_vgt_shader_config(sctx, key);