}
static void declare_vs_input_vgprs(struct si_shader_context *ctx,
- unsigned *num_prolog_vgprs)
+ unsigned *num_prolog_vgprs,
+ bool ngg_cull_shader)
{
struct si_shader *shader = ctx->shader;
}
if (!shader->is_gs_copy_shader) {
+ if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
+ &ctx->ngg_old_thread_id);
+ }
+
/* Vertex load indices. */
if (shader->selector->info.num_inputs) {
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
}
}
-static void declare_tes_input_vgprs(struct si_shader_context *ctx)
+static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
{
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
+
+ if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
+ &ctx->ngg_old_thread_id);
+ }
}
enum {
ac_add_arg(args, file, registers, type, arg);
}
-void si_create_function(struct si_shader_context *ctx)
+void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
{
struct si_shader *shader = ctx->shader;
LLVMTypeRef returns[AC_MAX_ARGS];
declare_vs_blit_inputs(ctx, vs_blit_property);
/* VGPRs */
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
break;
}
}
/* VGPRs */
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
/* Return values */
if (shader->key.opt.vs_as_prim_discard_cs) {
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
if (ctx->type == PIPE_SHADER_VERTEX) {
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
/* LS return values are inputs to the TCS main shader part. */
for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
+ &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
declare_global_desc_pointers(ctx);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
if (ctx->type == PIPE_SHADER_VERTEX) {
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
- declare_tes_input_vgprs(ctx);
+ declare_tes_input_vgprs(ctx, ngg_cull_shader);
}
- if (ctx->shader->key.as_es &&
+ if ((ctx->shader->key.as_es || ngg_cull_shader) &&
(ctx->type == PIPE_SHADER_VERTEX ||
ctx->type == PIPE_SHADER_TESS_EVAL)) {
- unsigned num_user_sgprs;
+ unsigned num_user_sgprs, num_vgprs;
+ /* For the NGG cull shader, add 1 SGPR to hold the vertex buffer pointer. */
if (ctx->type == PIPE_SHADER_VERTEX)
- num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
+ num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
else
num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+ /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
+ *
+ * The normal merged ESGS shader only has to return the 5 VGPRs
+ * for the GS stage.
+ */
+ num_vgprs = ngg_cull_shader ? 10 : 5;
+
/* ES return values are inputs to GS. */
for (i = 0; i < 8 + num_user_sgprs; i++)
returns[num_returns++] = ctx->i32; /* SGPRs */
- for (i = 0; i < 5; i++)
+ for (i = 0; i < num_vgprs; i++)
returns[num_returns++] = ctx->f32; /* VGPRs */
}
break;
}
/* VGPRs */
- declare_tes_input_vgprs(ctx);
+ declare_tes_input_vgprs(ctx, ngg_cull_shader);
break;
case PIPE_SHADER_GEOMETRY:
return;
}
- si_llvm_create_func(ctx, "main", returns, num_returns,
- si_get_max_workgroup_size(shader));
+ si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main",
+ returns, num_returns, si_get_max_workgroup_size(shader));
/* Reserve register locations for VGPR inputs the PS prolog may need. */
if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
!key->as_es && !key->as_ls) {
fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
+ if (shader_type != PIPE_SHADER_GEOMETRY)
+ fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
}
}
}
static bool si_build_main_function(struct si_shader_context *ctx,
- struct nir_shader *nir, bool free_nir)
+ struct nir_shader *nir, bool free_nir,
+ bool ngg_cull_shader)
{
struct si_shader *shader = ctx->shader;
struct si_shader_selector *sel = shader->selector;
ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
else if (shader->key.opt.vs_as_prim_discard_cs)
ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
+ else if (ngg_cull_shader)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
else if (shader->key.as_ngg)
ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
else
if (shader->key.as_es)
ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
+ else if (ngg_cull_shader)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
else if (shader->key.as_ngg)
ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
else
return false;
}
- si_create_function(ctx);
+ si_create_function(ctx, ngg_cull_shader);
if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
si_preload_esgs_ring(ctx);
if (sel->so.num_outputs)
scratch_size = 44;
+ assert(!ctx->gs_ngg_scratch);
LLVMTypeRef ai32 = LLVMArrayType(ctx->i32, scratch_size);
ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
/* This is really only needed when streamout and / or vertex
* compaction is enabled.
*/
- if (sel->so.num_outputs && !ctx->gs_ngg_scratch) {
+ if (!ctx->gs_ngg_scratch &&
+ (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
LLVMTypeRef asi32 = LLVMArrayType(ctx->i32, 8);
ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
if (!shader->is_monolithic ||
(ctx->type == PIPE_SHADER_TESS_EVAL &&
- (shader->key.as_ngg && !shader->key.as_es)))
+ shader->key.as_ngg && !shader->key.as_es &&
+ !shader->key.opt.ngg_culling))
ac_init_exec_full_mask(&ctx->ac);
if ((ctx->type == PIPE_SHADER_VERTEX ||
ctx->type == PIPE_SHADER_TESS_EVAL) &&
- shader->key.as_ngg && !shader->key.as_es) {
+ shader->key.as_ngg && !shader->key.as_es &&
+ !shader->key.opt.ngg_culling) {
gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
/* Build the primitive export at the beginning
* of the shader if possible.
*/
if (gfx10_ngg_export_prim_early(shader))
- gfx10_ngg_build_export_prim(ctx, NULL);
+ gfx10_ngg_build_export_prim(ctx, NULL, NULL);
}
if (ctx->type == PIPE_SHADER_TESS_CTRL ||
*
* \param info Shader info of the vertex shader.
* \param num_input_sgprs Number of input SGPRs for the vertex shader.
+ * \param has_old_ Whether the preceding shader part is the NGG cull shader.
* \param prolog_key Key of the VS prolog
* \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
* \param key Output shader part key.
*/
static void si_get_vs_prolog_key(const struct si_shader_info *info,
unsigned num_input_sgprs,
+ bool ngg_cull_shader,
const struct si_vs_prolog_bits *prolog_key,
struct si_shader *shader_out,
union si_shader_part_key *key)
key->vs_prolog.as_es = shader_out->key.as_es;
key->vs_prolog.as_ngg = shader_out->key.as_ngg;
+ if (!ngg_cull_shader)
+ key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
+
if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
key->vs_prolog.as_ls = 1;
key->vs_prolog.num_merged_next_stage_vgprs = 2;
shader->info.uses_instanceid = sel->info.uses_instanceid;
- if (!si_build_main_function(&ctx, nir, free_nir)) {
+ LLVMValueRef ngg_cull_main_fn = NULL;
+ if (ctx.shader->key.opt.ngg_culling) {
+ if (!si_build_main_function(&ctx, nir, false, true)) {
+ si_llvm_dispose(&ctx);
+ return -1;
+ }
+ ngg_cull_main_fn = ctx.main_fn;
+ ctx.main_fn = NULL;
+ /* Re-set the IR. */
+ si_llvm_context_set_ir(&ctx, shader);
+ }
+
+ if (!si_build_main_function(&ctx, nir, free_nir, false)) {
si_llvm_dispose(&ctx);
return -1;
}
if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
- LLVMValueRef parts[2];
+ LLVMValueRef parts[4];
+ unsigned num_parts = 0;
bool need_prolog = si_vs_needs_prolog(sel, &shader->key.part.vs.prolog);
-
- parts[1] = ctx.main_fn;
+ LLVMValueRef main_fn = ctx.main_fn;
+
+ if (ngg_cull_main_fn) {
+ if (need_prolog) {
+ union si_shader_part_key prolog_key;
+ si_get_vs_prolog_key(&sel->info,
+ shader->info.num_input_sgprs,
+ true,
+ &shader->key.part.vs.prolog,
+ shader, &prolog_key);
+ prolog_key.vs_prolog.is_monolithic = true;
+ si_build_vs_prolog_function(&ctx, &prolog_key);
+ parts[num_parts++] = ctx.main_fn;
+ }
+ parts[num_parts++] = ngg_cull_main_fn;
+ }
if (need_prolog) {
union si_shader_part_key prolog_key;
si_get_vs_prolog_key(&sel->info,
shader->info.num_input_sgprs,
+ false,
&shader->key.part.vs.prolog,
shader, &prolog_key);
prolog_key.vs_prolog.is_monolithic = true;
si_build_vs_prolog_function(&ctx, &prolog_key);
- parts[0] = ctx.main_fn;
+ parts[num_parts++] = ctx.main_fn;
}
+ parts[num_parts++] = main_fn;
- si_build_wrapper_function(&ctx, parts + !need_prolog,
- 1 + need_prolog, need_prolog, 0);
+ si_build_wrapper_function(&ctx, parts, num_parts,
+ need_prolog ? 1 : 0, 0);
if (ctx.shader->key.opt.vs_as_prim_discard_cs)
si_build_prim_discard_compute_shader(&ctx);
+ } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
+ ngg_cull_main_fn) {
+ LLVMValueRef parts[2];
+
+ parts[0] = ngg_cull_main_fn;
+ parts[1] = ctx.main_fn;
+
+ si_build_wrapper_function(&ctx, parts, 2, 0, 0);
} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
if (sscreen->info.chip_class >= GFX9) {
struct si_shader_selector *ls = shader->key.part.tcs.ls;
shader_ls.is_monolithic = true;
si_llvm_context_set_ir(&ctx, &shader_ls);
- if (!si_build_main_function(&ctx, nir, free_nir)) {
+ if (!si_build_main_function(&ctx, nir, free_nir, false)) {
si_llvm_dispose(&ctx);
return -1;
}
union si_shader_part_key vs_prolog_key;
si_get_vs_prolog_key(&ls->info,
shader_ls.info.num_input_sgprs,
+ false,
&shader->key.part.tcs.ls_prolog,
shader, &vs_prolog_key);
vs_prolog_key.vs_prolog.is_monolithic = true;
shader_es.is_monolithic = true;
si_llvm_context_set_ir(&ctx, &shader_es);
- if (!si_build_main_function(&ctx, nir, free_nir)) {
+ if (!si_build_main_function(&ctx, nir, free_nir, false)) {
si_llvm_dispose(&ctx);
return -1;
}
union si_shader_part_key vs_prolog_key;
si_get_vs_prolog_key(&es->info,
shader_es.info.num_input_sgprs,
+ false,
&shader->key.part.gs.vs_prolog,
shader, &vs_prolog_key);
vs_prolog_key.vs_prolog.is_monolithic = true;
LLVMValueRef ret, func;
int num_returns, i;
unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
- unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
+ unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4 +
+ (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0);
struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
- struct ac_arg input_vgpr_param[9];
- LLVMValueRef input_vgprs[9];
+ struct ac_arg input_vgpr_param[13];
+ LLVMValueRef input_vgprs[13];
unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
num_input_vgprs;
unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
/* Get the prolog. */
union si_shader_part_key prolog_key;
- si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
+ si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false,
key, shader, &prolog_key);
shader->prolog =