return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
}
+/* Expand a 4-element (v4i32) shader argument into four consecutive
+ * elements of the merged-shader return value.
+ *
+ * \param ret           the aggregate return value being built
+ * \param param         the v4i32 input argument to copy
+ * \param return_index  index of the first of the 4 destination elements
+ * \return the updated return value
+ *
+ * Used to forward vertex-buffer descriptors (one v4i32 each) through
+ * the NGG cull shader's return struct.
+ */
+static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx,
+ LLVMValueRef ret, struct ac_arg param,
+ unsigned return_index)
+{
+ LLVMValueRef v = ac_get_arg(&ctx->ac, param);
+
+ for (unsigned i = 0; i < 4; i++) {
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
+ ac_llvm_extract_elem(&ctx->ac, v, i),
+ return_index + i, "");
+ }
+ return ret;
+}
+
static void load_bitmasks_2x64(struct si_shader_context *ctx,
LLVMValueRef lds_ptr, unsigned dw_offset,
LLVMValueRef mask[2], LLVMValueRef *total_bitcount)
* - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
*/
- LLVMValueRef vtxindex[] = {
- si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16),
- si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16),
- si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16),
+ LLVMValueRef vtxindex[3];
+ if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
+ /* For the GS fast launch, the VS prolog simply puts the Vertex IDs
+ * into these VGPRs.
+ */
+ vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
+ vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
+ vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
+ } else {
+ vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
+ vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
+ vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
};
LLVMValueRef gs_vtxptr[] = {
ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
8 + SI_SGPR_DRAWID);
ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers,
8 + SI_VS_NUM_USER_SGPR);
+
+ for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
+ ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
+ 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
+ }
} else {
assert(ctx->type == PIPE_SHADER_TESS_EVAL);
ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout,
}
unsigned vgpr;
- if (ctx->type == PIPE_SHADER_VERTEX)
- vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
- else
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ if (shader->selector->num_vbos_in_user_sgprs) {
+ vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
+ shader->selector->num_vbos_in_user_sgprs * 4;
+ } else {
+ vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
+ }
+ } else {
vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
+ }
val = LLVMBuildLoad(builder, new_vgpr0, "");
ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
/* All these are per subgroup: */
bool max_vert_out_per_gs_instance = false;
- unsigned max_esverts_base = 128;
unsigned max_gsprims_base = 128; /* default prim group size clamp */
+ unsigned max_esverts_base = 128;
+
+ if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
+ max_gsprims_base = 128 / 3;
+ max_esverts_base = max_gsprims_base * 3;
+ } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
+ max_gsprims_base = 126;
+ max_esverts_base = 128;
+ }
/* Hardware has the following non-natural restrictions on the value
* of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
uint32_t index;
};
-#define SI_NUM_VGT_STAGES_KEY_BITS 5
+#define SI_NUM_VGT_STAGES_KEY_BITS 6
#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS)
/* The VGT_SHADER_STAGES key used to index the table of precomputed values.
#if UTIL_ARCH_LITTLE_ENDIAN
unsigned tess:1;
unsigned gs:1;
+ unsigned ngg_gs_fast_launch:1;
unsigned ngg_passthrough:1;
unsigned ngg:1; /* gfx10+ */
unsigned streamout:1; /* only used with NGG */
unsigned streamout:1;
unsigned ngg:1;
unsigned ngg_passthrough:1;
+ unsigned ngg_gs_fast_launch:1;
unsigned gs:1;
unsigned tess:1;
#endif
ctx->type == PIPE_SHADER_TESS_EVAL)) {
unsigned num_user_sgprs, num_vgprs;
- /* For the NGG cull shader, add 1 SGPR to hold the vertex buffer pointer. */
- if (ctx->type == PIPE_SHADER_VERTEX)
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ /* For the NGG cull shader, add 1 SGPR to hold
+ * the vertex buffer pointer.
+ */
num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
- else
+
+ if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
+ assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
+ num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
+ shader->selector->num_vbos_in_user_sgprs * 4;
+ }
+ } else {
num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+ }
/* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
*
}
static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
- const struct si_vs_prolog_bits *key)
+ const struct si_vs_prolog_bits *prolog_key,
+ const struct si_shader_key *key,
+ bool ngg_cull_shader)
{
/* VGPR initialization fixup for Vega10 and Raven is always done in the
* VS prolog. */
return sel->vs_needs_prolog ||
- key->ls_vgpr_fix ||
- key->unpack_instance_id_from_vertex_id;
+ prolog_key->ls_vgpr_fix ||
+ prolog_key->unpack_instance_id_from_vertex_id ||
+ (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
}
static bool si_build_main_function(struct si_shader_context *ctx,
(shader->key.as_es || shader->key.as_ls) &&
(ctx->type == PIPE_SHADER_TESS_EVAL ||
(ctx->type == PIPE_SHADER_VERTEX &&
- !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
+ !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+ &shader->key, ngg_cull_shader)))) {
si_init_exec_from_input(ctx,
ctx->merged_wave_info, 0);
} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
key->vs_prolog.as_es = shader_out->key.as_es;
key->vs_prolog.as_ngg = shader_out->key.as_ngg;
- if (!ngg_cull_shader)
+ if (ngg_cull_shader) {
+ key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling &
+ SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
+ key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling &
+ SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
+ } else {
key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
+ }
if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
key->vs_prolog.as_ls = 1;
if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
LLVMValueRef parts[4];
unsigned num_parts = 0;
- bool need_prolog = si_vs_needs_prolog(sel, &shader->key.part.vs.prolog);
+ bool has_prolog = false;
LLVMValueRef main_fn = ctx.main_fn;
if (ngg_cull_main_fn) {
- if (need_prolog) {
+ if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+ &shader->key, true)) {
union si_shader_part_key prolog_key;
si_get_vs_prolog_key(&sel->info,
shader->info.num_input_sgprs,
prolog_key.vs_prolog.is_monolithic = true;
si_build_vs_prolog_function(&ctx, &prolog_key);
parts[num_parts++] = ctx.main_fn;
+ has_prolog = true;
}
parts[num_parts++] = ngg_cull_main_fn;
}
- if (need_prolog) {
+ if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+ &shader->key, false)) {
union si_shader_part_key prolog_key;
si_get_vs_prolog_key(&sel->info,
shader->info.num_input_sgprs,
prolog_key.vs_prolog.is_monolithic = true;
si_build_vs_prolog_function(&ctx, &prolog_key);
parts[num_parts++] = ctx.main_fn;
+ has_prolog = true;
}
parts[num_parts++] = main_fn;
si_build_wrapper_function(&ctx, parts, num_parts,
- need_prolog ? 1 : 0, 0);
+ has_prolog ? 1 : 0, 0);
if (ctx.shader->key.opt.vs_as_prim_discard_cs)
si_build_prim_discard_compute_shader(&ctx);
struct si_shader_selector *ls = shader->key.part.tcs.ls;
LLVMValueRef parts[4];
bool vs_needs_prolog =
- si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
+ si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog,
+ &shader->key, false);
/* TCS main part */
parts[2] = ctx.main_fn;
/* ES prolog */
if (es->type == PIPE_SHADER_VERTEX &&
- si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog)) {
+ si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog,
+ &shader->key, false)) {
union si_shader_part_key vs_prolog_key;
si_get_vs_prolog_key(&es->info,
shader_es.info.num_input_sgprs,
}
}
+ if (key->vs_prolog.gs_fast_launch_tri_list ||
+ key->vs_prolog.gs_fast_launch_tri_strip) {
+ LLVMValueRef wave_id, thread_id_in_tg;
+
+ wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
+ thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
+ LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
+ ac_get_thread_id(&ctx->ac));
+
+ /* The GS fast launch initializes all VGPRs to the value of
+ * the first thread, so we have to add the thread ID.
+ *
+ * Only these are initialized by the hw:
+ * VGPR2: Base Primitive ID
+ * VGPR5: Base Vertex ID
+ * VGPR6: Instance ID
+ */
+
+ /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
+ * The NGG cull shader will read them from there.
+ */
+ if (key->vs_prolog.gs_fast_launch_tri_list) {
+ input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
+ LLVMConstInt(ctx->i32, 3, 0), /* Vertex 0 */
+ LLVMConstInt(ctx->i32, 0, 0));
+ input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
+ LLVMConstInt(ctx->i32, 3, 0), /* Vertex 1 */
+ LLVMConstInt(ctx->i32, 1, 0));
+ input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
+ LLVMConstInt(ctx->i32, 3, 0), /* Vertex 2 */
+ LLVMConstInt(ctx->i32, 2, 0));
+ } else {
+ assert(key->vs_prolog.gs_fast_launch_tri_strip);
+ LLVMBuilderRef builder = ctx->ac.builder;
+ /* Triangle indices: */
+ LLVMValueRef index[3] = {
+ thread_id_in_tg,
+ LLVMBuildAdd(builder, thread_id_in_tg,
+ LLVMConstInt(ctx->i32, 1, 0), ""),
+ LLVMBuildAdd(builder, thread_id_in_tg,
+ LLVMConstInt(ctx->i32, 2, 0), ""),
+ };
+ LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
+ thread_id_in_tg, ctx->i1, "");
+ LLVMValueRef flatshade_first =
+ LLVMBuildICmp(builder, LLVMIntEQ,
+ si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
+ ctx->i32_0, "");
+
+ ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
+ flatshade_first, index);
+ input_vgprs[0] = index[0];
+ input_vgprs[1] = index[1];
+ input_vgprs[4] = index[2];
+ }
+
+ /* Triangles always have all edge flags set initially. */
+ input_vgprs[3] = LLVMConstInt(ctx->i32, 0x7 << 8, 0);
+
+ input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
+ thread_id_in_tg, ""); /* PrimID */
+ input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
+ thread_id_in_tg, ""); /* VertexID */
+ input_vgprs[8] = input_vgprs[6]; /* InstanceID */
+ }
+
unsigned vertex_id_vgpr = first_vs_vgpr;
unsigned instance_id_vgpr =
ctx->screen->info.chip_class >= GFX10 ?
{
struct si_shader_selector *vs = main_part->selector;
- if (!si_vs_needs_prolog(vs, key))
+ if (!si_vs_needs_prolog(vs, key, &shader->key, false))
return true;
/* Get the prolog. */
SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
};
-#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */
-#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */
-#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
+#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */
+#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */
+#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */
/**
* For VS shader keys, describe any fixups required for vertex fetch.
unsigned as_es:1;
unsigned as_ngg:1;
unsigned has_ngg_cull_inputs:1; /* from the NGG cull shader */
+ unsigned gs_fast_launch_tri_list:1; /* for NGG culling */
+ unsigned gs_fast_launch_tri_strip:1; /* for NGG culling */
/* Prologs for monolithic shaders shouldn't set EXEC. */
unsigned is_monolithic:1;
} vs_prolog;
unsigned clip_disable:1;
/* For NGG VS and TES. */
- unsigned ngg_culling:3; /* SI_NGG_CULL_* */
+ unsigned ngg_culling:5; /* SI_NGG_CULL_* */
/* For shaders where monolithic variants have better code.
*
if (sctx->ngg &&
rast_prim == PIPE_PRIM_TRIANGLES &&
(sctx->screen->always_use_ngg_culling ||
- /* At least 1500 non-indexed triangles (4500 vertices) are needed
- * per draw call (no TES/GS) to enable NGG culling. Triangle strips
- * don't need this, because they have good reuse and therefore
- * perform the same as indexed triangles.
+ /* At least 1024 non-indexed vertices (8 subgroups) are needed
+ * per draw call (no TES/GS) to enable NGG culling.
*/
- (!index_size && prim == PIPE_PRIM_TRIANGLES && direct_count > 4500 &&
+ (!index_size && direct_count >= 1024 &&
+ (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) &&
!sctx->tes_shader.cso && !sctx->gs_shader.cso)) &&
si_get_vs(sctx)->cso->ngg_culling_allowed) {
unsigned ngg_culling = 0;
if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
ngg_culling |= SI_NGG_CULL_BACK_FACE;
}
+
+ /* Use NGG fast launch for certain non-indexed primitive types.
+ * A draw must have at least 1 full primitive.
+ */
+ if (ngg_culling && !index_size && direct_count >= 3 &&
+ !sctx->tes_shader.cso && !sctx->gs_shader.cso) {
+ if (prim == PIPE_PRIM_TRIANGLES)
+ ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
+ else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
+ ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;
+ }
+
if (ngg_culling != sctx->ngg_culling) {
sctx->ngg_culling = ngg_culling;
sctx->do_update_shaders = true;
late_alloc_wave64 = 0;
else if (num_cu_per_sh <= 6)
late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
+ else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
+ late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
else
late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(1) |
S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
- shader->ge_cntl =
- S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
- S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
- S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
+ if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
+ shader->ge_cntl =
+ S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+ S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
+ } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
+ shader->ge_cntl =
+ S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+ S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
+ } else {
+ shader->ge_cntl =
+ S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+ S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
+ S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
- /* Bug workaround for a possible hang with non-tessellation cases.
- * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
- *
- * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
- */
- if ((sscreen->info.family == CHIP_NAVI10 ||
- sscreen->info.family == CHIP_NAVI12 ||
- sscreen->info.family == CHIP_NAVI14) &&
- (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
- shader->ngg.hw_max_esverts != 256) {
- shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
-
- if (shader->ngg.hw_max_esverts > 5) {
- shader->ge_cntl |=
- S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
+ /* Bug workaround for a possible hang with non-tessellation cases.
+ * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
+ *
+ * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
+ */
+ if ((sscreen->info.family == CHIP_NAVI10 ||
+ sscreen->info.family == CHIP_NAVI12 ||
+ sscreen->info.family == CHIP_NAVI14) &&
+ (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
+ shader->ngg.hw_max_esverts != 256) {
+ shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
+
+ if (shader->ngg.hw_max_esverts > 5) {
+ shader->ge_cntl |=
+ S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
+ }
}
}
if (key.u.ngg) {
stages |= S_028B54_PRIMGEN_EN(1) |
+ S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) |
S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough);
} else if (key.u.gs)
}
/* This must be done after the shader variant is selected. */
- if (sctx->ngg)
- key.u.ngg_passthrough = gfx10_is_ngg_passthrough(si_get_vs(sctx)->current);
+ if (sctx->ngg) {
+ struct si_shader *vs = si_get_vs(sctx)->current;
+
+ key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
+ key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling &
+ SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
+ }
si_update_vgt_shader_config(sctx, key);