From ee21bd7440c3222cc01a630c4ef49d33bf431807 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Fri, 5 Jul 2019 08:33:06 +0200
Subject: [PATCH] radv/gfx10: implement NGG support (VS only)

This needs to be cleaned up a bit, and it probably contains missing
stuff and/or bugs. This doesn't fix the "half of the triangles" issue.

Signed-off-by: Samuel Pitoiset
Reviewed-by: Bas Nieuwenhuizen
---
 src/amd/vulkan/radv_nir_to_llvm.c | 224 +++++++++++++++++-
 src/amd/vulkan/radv_pipeline.c    | 374 +++++++++++++++++++++++++++++-
 src/amd/vulkan/radv_private.h     |   2 +
 src/amd/vulkan/radv_shader.c      |  19 +-
 src/amd/vulkan/radv_shader.h      |   2 +
 src/amd/vulkan/si_cmd_buffer.c    |  11 +
 6 files changed, 610 insertions(+), 22 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 7c0275be7cf..51414be2304 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -70,6 +70,13 @@ struct radv_shader_context {
 	LLVMValueRef tes_u;
 	LLVMValueRef tes_v;
 
+	/* HW GS */
+	/* On gfx10:
+	 *  - bits 0..10: ordered_wave_id
+	 *  - bits 12..20: number of vertices in group
+	 *  - bits 22..30: number of primitives in group
+	 */
+	LLVMValueRef gs_tg_info;
 	LLVMValueRef gs2vs_offset;
 	LLVMValueRef gs_wave_id;
 	LLVMValueRef gs_vtx_offset[6];
@@ -823,11 +830,18 @@ declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args)
 		if (ctx->options->key.vs.out.as_ls) {
 			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id);
 			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+			add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
 		} else {
-			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
-			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
+			if (ctx->ac.chip_class >= GFX10) {
+				add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+				add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+				add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+			} else {
+				add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+				add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
+				add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
+			}
 		}
-		add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
 	}
 }
 
@@ -969,6 +983,12 @@ static void set_llvm_calling_convention(LLVMValueRef func,
 	LLVMSetFunctionCallConv(func, calling_conv);
 }
 
+/* Returns whether the stage is a stage that can be directly before the GS */
+static bool is_pre_gs_stage(gl_shader_stage stage)
+{
+	return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
+}
+
 static void create_function(struct radv_shader_context *ctx,
 			    gl_shader_stage stage,
 			    bool has_previous_stage,
@@ -987,6 +1007,15 @@ static void create_function(struct radv_shader_context *ctx,
 			&ctx->ring_offsets);
 	}
 
+	if (ctx->ac.chip_class >= GFX10) {
+		if (stage == MESA_SHADER_VERTEX && ctx->options->key.vs.out.as_ngg) {
+			/* On GFX10, VS is merged into GS for NGG. */
+			stage = MESA_SHADER_GEOMETRY;
+			has_previous_stage = true;
+			previous_stage = MESA_SHADER_VERTEX;
+		}
+	}
+
 	switch (stage) {
 	case MESA_SHADER_COMPUTE:
 		declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
@@ -1101,8 +1130,14 @@ static void create_function(struct radv_shader_context *ctx,
 	case MESA_SHADER_GEOMETRY:
 		if (has_previous_stage) {
 			// First 6 system regs
-			add_arg(&args, ARG_SGPR, ctx->ac.i32,
-				&ctx->gs2vs_offset);
+			if (ctx->options->key.vs.out.as_ngg) {
+				add_arg(&args, ARG_SGPR, ctx->ac.i32,
+					&ctx->gs_tg_info);
+			} else {
+				add_arg(&args, ARG_SGPR, ctx->ac.i32,
+					&ctx->gs2vs_offset);
+			}
+
 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
 				&ctx->merged_wave_info);
 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
@@ -3194,6 +3229,168 @@ handle_ls_outputs_post(struct radv_shader_context *ctx)
 	}
 }
 
+static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx)
+{
+	return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4);
+}
+
+static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx)
+{
+	return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+			    LLVMConstInt(ctx->ac.i32, 12, false),
+			    LLVMConstInt(ctx->ac.i32, 9, false),
+			    false);
+}
+
+static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx)
+{
+	return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+			    LLVMConstInt(ctx->ac.i32, 22, false),
+			    LLVMConstInt(ctx->ac.i32, 9, false),
+			    false);
+}
+
+/* Send GS Alloc Req message from the first wave of the group to SPI.
+ * Message payload is:
+ * - bits 0..10: vertices in group
+ * - bits 12..22: primitives in group
+ */
+static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx,
+				       LLVMValueRef vtx_cnt,
+				       LLVMValueRef prim_cnt)
+{
+	LLVMBuilderRef builder = ctx->ac.builder;
+	LLVMValueRef tmp;
+
+	tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+	ac_build_ifcc(&ctx->ac, tmp, 5020);
+
+	tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false), "");
+	tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
+	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp);
+
+	ac_build_endif(&ctx->ac, 5020);
+}
+
+struct ngg_prim {
+	unsigned num_vertices;
+	LLVMValueRef isnull;
+	LLVMValueRef index[3];
+	LLVMValueRef edgeflag[3];
+};
+
+static void build_export_prim(struct radv_shader_context *ctx,
+			      const struct ngg_prim *prim)
+{
+	LLVMBuilderRef builder = ctx->ac.builder;
+	struct ac_export_args args;
+	LLVMValueRef tmp;
+
+	tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, "");
+	args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), "");
+
+	for (unsigned i = 0; i < prim->num_vertices; ++i) {
+		tmp = LLVMBuildShl(builder, prim->index[i],
+				   LLVMConstInt(ctx->ac.i32, 10 * i, false), "");
+		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+		tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, "");
+		tmp = LLVMBuildShl(builder, tmp,
+				   LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), "");
+		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+	}
+
+	args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, "");
+	args.out[1] = LLVMGetUndef(ctx->ac.f32);
+	args.out[2] = LLVMGetUndef(ctx->ac.f32);
+	args.out[3] = LLVMGetUndef(ctx->ac.f32);
+
+	args.target = V_008DFC_SQ_EXP_PRIM;
+	args.enabled_channels = 1;
+	args.done = true;
+	args.valid_mask = false;
+	args.compr = false;
+
+	ac_build_export(&ctx->ac, &args);
+}
+
+static void
+handle_ngg_outputs_post(struct radv_shader_context *ctx)
+{
+	LLVMBuilderRef builder = ctx->ac.builder;
+	struct ac_build_if_state if_state;
+	unsigned num_vertices = 3;
+	LLVMValueRef tmp;
+
+	assert(ctx->stage == MESA_SHADER_VERTEX && !ctx->is_gs_copy_shader);
+
+	LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8);
+	LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 0, 8);
+	LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT,
+						  ac_get_thread_id(&ctx->ac), prims_in_wave, "");
+	LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT,
+						  ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
+	LLVMValueRef vtxindex[] = {
+		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 0, 16),
+		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 16, 16),
+		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16),
+	};
+
+	/* TODO: streamout */
+
+	/* TODO: VS primitive ID */
+	if (ctx->options->key.vs.out.export_prim_id)
+		assert(0);
+
+	/* TODO: primitive culling */
+
+	build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
+
+	/* TODO: streamout queries */
+	/* Export primitive data to the index buffer. Format is:
+	 * - bits 0..8: index 0
+	 * - bit 9: edge flag 0
+	 * - bits 10..18: index 1
+	 * - bit 19: edge flag 1
+	 * - bits 20..28: index 2
+	 * - bit 29: edge flag 2
+	 * - bit 31: null primitive (skip)
+	 *
+	 * For the first version, we will always build up all three indices
+	 * independent of the primitive type. The additional garbage data
+	 * shouldn't hurt.
+	 *
+	 * TODO: culling depends on the primitive type, so can have some
+	 * interaction here.
+	 */
+	ac_nir_build_if(&if_state, ctx, is_gs_thread);
+	{
+		struct ngg_prim prim = {};
+
+		prim.num_vertices = num_vertices;
+		prim.isnull = ctx->ac.i1false;
+		memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
+
+		for (unsigned i = 0; i < num_vertices; ++i) {
+			tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id,
+					    LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
+			prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+		}
+
+		build_export_prim(ctx, &prim);
+	}
+	ac_nir_build_endif(&if_state);
+
+	/* Export per-vertex data (positions and parameters). */
+	ac_nir_build_if(&if_state, ctx, is_es_thread);
+	{
+		handle_vs_outputs_post(ctx, ctx->options->key.vs.out.export_prim_id,
+				       ctx->options->key.vs.out.export_layer_id,
+				       ctx->options->key.vs.out.export_clip_dists,
+				       &ctx->shader_info->vs.outinfo);
+	}
+	ac_nir_build_endif(&if_state);
+}
+
 static void
 write_tess_factors(struct radv_shader_context *ctx)
 {
@@ -3452,6 +3649,8 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
 			handle_ls_outputs_post(ctx);
 		else if (ctx->options->key.vs.out.as_es)
 			handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
+		else if (ctx->options->key.vs.out.as_ngg)
+			handle_ngg_outputs_post(ctx);
 		else
 			handle_vs_outputs_post(ctx, ctx->options->key.vs.out.export_prim_id,
 					       ctx->options->key.vs.out.export_layer_id,
@@ -3703,6 +3902,13 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 							    shaders[i]));
 	}
 
+	if (ctx.ac.chip_class >= GFX10) {
+		if (shaders[0]->info.stage == MESA_SHADER_VERTEX &&
+		    options->key.vs.out.as_ngg) {
+			ctx.max_workgroup_size = 128;
+		}
+	}
+
 	create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2,
 			shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX);
@@ -3722,7 +3928,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 	 */
 	ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900;
 
-	if (shader_count >= 2)
+	bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && ctx.options->key.vs.out.as_ngg;
+	if (shader_count >= 2 || is_ngg)
 		ac_init_exec_full_mask(&ctx.ac);
 
 	if ((ctx.ac.family == CHIP_VEGA10 ||
@@ -3788,7 +3995,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 	ac_setup_rings(&ctx);
 
 	LLVMBasicBlockRef merge_block;
-	if (shader_count >= 2) {
+	if (shader_count >= 2 || is_ngg) {
 		LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
 		LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
 		merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
@@ -3811,7 +4018,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 
 		ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i]);
 
-		if (shader_count >= 2) {
+		if (shader_count >= 2 || is_ngg) {
 			LLVMBuildBr(ctx.ac.builder, merge_block);
 			LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
 		}
@@ -3955,6 +4162,7 @@ ac_fill_shader_info(struct radv_shader_variant_info *shader_info, struct nir_sha
 		shader_info->vs.as_es = options->key.vs.out.as_es;
 		shader_info->vs.as_ls = options->key.vs.out.as_ls;
 		shader_info->vs.export_prim_id = options->key.vs.out.export_prim_id;
+		shader_info->is_ngg = options->key.vs.out.as_ngg;
 		break;
 	default:
 		break;
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index ae08d57677f..cc5f339f34f 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -96,6 +96,30 @@ struct radv_gs_state {
 	uint32_t lds_size;
 };
 
+struct radv_ngg_state {
+	uint16_t ngg_emit_size; /* in dwords */
+	uint32_t hw_max_esverts;
+	uint32_t max_gsprims;
+	uint32_t max_out_verts;
+	uint32_t prim_amp_factor;
+	uint32_t vgt_esgs_ring_itemsize;
+	bool max_vert_out_per_gs_instance;
+};
+
+bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline)
+{
+	struct radv_shader_variant *variant = NULL;
+	if (pipeline->shaders[MESA_SHADER_GEOMETRY])
+		variant = pipeline->shaders[MESA_SHADER_GEOMETRY];
+	else if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
+		variant = pipeline->shaders[MESA_SHADER_TESS_EVAL];
+	else if (pipeline->shaders[MESA_SHADER_VERTEX])
+		variant = pipeline->shaders[MESA_SHADER_VERTEX];
+	else
+		return false;
+	return variant->info.is_ngg;
+}
+
 static void
 radv_pipeline_destroy(struct radv_device *device,
 		      struct radv_pipeline *pipeline,
@@ -1583,6 +1607,203 @@ calculate_gs_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
 	return gs;
 }
 
+static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
+				     unsigned min_verts_per_prim, bool use_adjacency)
+{
+	unsigned max_reuse = max_esverts - min_verts_per_prim;
+	if (use_adjacency)
+		max_reuse /= 2;
+	*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
+}
+
+static struct radv_ngg_state
+calculate_ngg_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
+		   struct radv_pipeline *pipeline)
+{
+	struct radv_ngg_state ngg = {0};
+	struct radv_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
+	struct radv_es_output_info *es_info =
+		radv_pipeline_has_tess(pipeline) ? &gs_info->tes.es_info : &gs_info->vs.es_info;
+	unsigned gs_type = MESA_SHADER_VERTEX;
+	unsigned max_verts_per_prim = 3; // triangles
+	unsigned min_verts_per_prim =
+		gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
+	unsigned gs_num_invocations = 1; //MAX2(gs_info->gs.invocations, 1);
+	bool uses_adjacency;
+	switch(pCreateInfo->pInputAssemblyState->topology) {
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+		uses_adjacency = true;
+		break;
+	default:
+		uses_adjacency = false;
+		break;
+	}
+
+	/* All these are in dwords: */
+	/* We can't allow using the whole LDS, because GS waves compete with
+	 * other shader stages for LDS space.
+	 *
+	 * Streamout can increase the ESGS buffer size later on, so be more
+	 * conservative with streamout and use 4K dwords. This may be suboptimal.
+	 *
+	 * Otherwise, use the limit of 7K dwords. The reason is that we need
+	 * to leave some headroom for the max_esverts increase at the end.
+	 *
+	 * TODO: We should really take the shader's internal LDS use into
+	 * account. The linker will fail if the size is greater than
+	 * 8K dwords.
+	 */
+	const unsigned max_lds_size = (0 /*gs_info->info.so.num_outputs*/ ? 4 : 7) * 1024 - 128;
+	const unsigned target_lds_size = max_lds_size;
+	unsigned esvert_lds_size = 0;
+	unsigned gsprim_lds_size = 0;
+
+	/* All these are per subgroup: */
+	bool max_vert_out_per_gs_instance = false;
+	unsigned max_esverts_base = 256;
+	unsigned max_gsprims_base = 128; /* default prim group size clamp */
+
+	/* Hardware has the following non-natural restrictions on the value
+	 * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of the draw:
+	 * - at most 252 for any line input primitive type
+	 * - at most 251 for any quad input primitive type
+	 * - at most 251 for triangle strips with adjacency (this happens to
+	 *   be the natural limit for triangle *lists* with adjacency)
+	 */
+	max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
+
+	if (gs_type == MESA_SHADER_GEOMETRY) {
+		unsigned max_out_verts_per_gsprim =
+			gs_info->gs.vertices_out * gs_num_invocations;
+
+		if (max_out_verts_per_gsprim <= 256) {
+			if (max_out_verts_per_gsprim) {
+				max_gsprims_base = MIN2(max_gsprims_base,
+							256 / max_out_verts_per_gsprim);
+			}
+		} else {
+			/* Use special multi-cycling mode in which each GS
+			 * instance gets its own subgroup. Does not work with
+			 * tessellation. */
+			max_vert_out_per_gs_instance = true;
+			max_gsprims_base = 1;
+			max_out_verts_per_gsprim = gs_info->gs.vertices_out;
+		}
+
+		esvert_lds_size = es_info->esgs_itemsize / 4;
+		gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
+	} else {
+		/* TODO: This needs to be adjusted once LDS use for compaction
+		 * after culling is implemented.
+		 */
+		/*
+		if (es_info->info.so.num_outputs)
+			esvert_lds_size = 4 * es_info->info.so.num_outputs + 1;
+		*/
+	}
+
+	unsigned max_gsprims = max_gsprims_base;
+	unsigned max_esverts = max_esverts_base;
+
+	if (esvert_lds_size)
+		max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
+	if (gsprim_lds_size)
+		max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
+
+	max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+	clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
+	assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+
+	if (esvert_lds_size || gsprim_lds_size) {
+		/* Now that we have a rough proportionality between esverts
+		 * and gsprims based on the primitive type, scale both of them
+		 * down simultaneously based on required LDS space.
+		 *
+		 * We could be smarter about this if we knew how much vertex
+		 * reuse to expect.
+		 */
+		unsigned lds_total = max_esverts * esvert_lds_size +
+				     max_gsprims * gsprim_lds_size;
+		if (lds_total > target_lds_size) {
+			max_esverts = max_esverts * target_lds_size / lds_total;
+			max_gsprims = max_gsprims * target_lds_size / lds_total;
+
+			max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+			clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
+						 min_verts_per_prim, uses_adjacency);
+			assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+		}
+	}
+
+	/* Round up towards full wave sizes for better ALU utilization. */
+	if (!max_vert_out_per_gs_instance) {
+		const unsigned wavesize = 64;
+		unsigned orig_max_esverts;
+		unsigned orig_max_gsprims;
+		do {
+			orig_max_esverts = max_esverts;
+			orig_max_gsprims = max_gsprims;
+
+			max_esverts = align(max_esverts, wavesize);
+			max_esverts = MIN2(max_esverts, max_esverts_base);
+			if (esvert_lds_size)
+				max_esverts = MIN2(max_esverts,
+						   (max_lds_size - max_gsprims * gsprim_lds_size) /
+						   esvert_lds_size);
+			max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+
+			max_gsprims = align(max_gsprims, wavesize);
+			max_gsprims = MIN2(max_gsprims, max_gsprims_base);
+			if (gsprim_lds_size)
+				max_gsprims = MIN2(max_gsprims,
+						   (max_lds_size - max_esverts * esvert_lds_size) /
+						   gsprim_lds_size);
+			clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
						 min_verts_per_prim, uses_adjacency);
+			assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+		} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
+	}
+
+	/* Hardware restriction: minimum value of max_esverts */
+	max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);
+
+	unsigned max_out_vertices =
+		max_vert_out_per_gs_instance ? gs_info->gs.vertices_out :
+		gs_type == MESA_SHADER_GEOMETRY ?
+		max_gsprims * gs_num_invocations * gs_info->gs.vertices_out :
+		max_esverts;
+	assert(max_out_vertices <= 256);
+
+	unsigned prim_amp_factor = 1;
+	if (gs_type == MESA_SHADER_GEOMETRY) {
+		/* Number of output primitives per GS input primitive after
+		 * GS instancing. */
+		prim_amp_factor = gs_info->gs.vertices_out;
+	}
+
+	/* The GE only checks against the maximum number of ES verts after
+	 * allocating a full GS primitive. So we need to ensure that whenever
+	 * this check passes, there is enough space for a full primitive without
+	 * vertex reuse.
+	 */
+	ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
+	ngg.max_gsprims = max_gsprims;
+	ngg.max_out_verts = max_out_vertices;
+	ngg.prim_amp_factor = prim_amp_factor;
+	ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
+	ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
+	ngg.vgt_esgs_ring_itemsize = 1;
+
+	pipeline->graphics.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
+
+	assert(ngg.hw_max_esverts >= 24); /* HW limitation */
+
+	return ngg;
+}
+
 static void
 calculate_gs_ring_sizes(struct radv_pipeline *pipeline, const struct radv_gs_state *gs)
 {
@@ -2000,7 +2221,8 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline,
 }
 
 static void
-radv_fill_shader_keys(struct radv_shader_variant_key *keys,
+radv_fill_shader_keys(struct radv_device *device,
+		      struct radv_shader_variant_key *keys,
 		      const struct radv_pipeline_key *key,
 		      nir_shader **nir)
 {
@@ -2031,6 +2253,10 @@ radv_fill_shader_keys(struct radv_shader_variant_key *keys,
 		keys[MESA_SHADER_VERTEX].vs.out.as_es = true;
 	}
 
+	if (device->physical_device->rad_info.chip_class >= GFX10) {
+		keys[MESA_SHADER_VERTEX].vs.out.as_ngg = true;
+	}
+
 	for(int i = 0; i < MESA_SHADER_STAGES; ++i)
 		keys[i].has_multiview_view_index = key->has_multiview_view_index;
 
@@ -2221,7 +2447,7 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
 			nir_print_shader(nir[i], stderr);
 	}
 
-	radv_fill_shader_keys(keys, key, nir);
+	radv_fill_shader_keys(device, keys, key, nir);
 
 	if (nir[MESA_SHADER_FRAGMENT]) {
 		if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) {
@@ -2356,6 +2582,8 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
 {
 	bool has_gs = radv_pipeline_has_gs(pipeline);
 	bool has_tess = radv_pipeline_has_tess(pipeline);
+	bool has_ngg = radv_pipeline_has_ngg(pipeline);
+
 	switch (stage) {
 	case MESA_SHADER_FRAGMENT:
 		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
@@ -2379,6 +2607,9 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
 			}
 		}
 
+		if (has_ngg)
+			return R_00B230_SPI_SHADER_USER_DATA_GS_0;
+
 		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
 	case MESA_SHADER_GEOMETRY:
 		return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 :
@@ -2968,8 +3199,7 @@ radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
 				   struct radv_pipeline *pipeline)
 {
 	const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
-
-	uint32_t vgt_primitiveid_en = false;
+	unsigned vgt_primitiveid_en = 0;
 	uint32_t vgt_gs_mode = 0;
 
 	if (radv_pipeline_has_gs(pipeline)) {
@@ -2978,9 +3208,17 @@ radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
 		vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out,
 					     pipeline->device->physical_device->rad_info.chip_class);
+	} else if (radv_pipeline_has_ngg(pipeline)) {
+		const struct radv_shader_variant *vs =
+			pipeline->shaders[MESA_SHADER_VERTEX];
+		bool enable_prim_id =
+			outinfo->export_prim_id || vs->info.info.uses_prim_id;
+
+		vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(enable_prim_id) |
+				      S_028A84_NGG_DISABLE_PROVOK_REUSE(enable_prim_id);
 	} else if (outinfo->export_prim_id) {
 		vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
-		vgt_primitiveid_en = true;
+		vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
 	}
 
 	radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
@@ -3084,6 +3322,105 @@ radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs,
 	radeon_emit(cs, rsrc2);
 }
 
+static void
+radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs,
+			      struct radeon_cmdbuf *cs,
+			      struct radv_pipeline *pipeline,
+			      struct radv_shader_variant *shader,
+			      const struct radv_ngg_state *ngg_state)
+{
+	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
+
+	radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2);
+	radeon_emit(cs, va >> 8);
+	radeon_emit(cs, va >> 40);
+	radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
+	radeon_emit(cs, shader->config.rsrc1);
+	radeon_emit(cs, shader->config.rsrc2);
+
+	const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
+	unsigned clip_dist_mask, cull_dist_mask, total_mask;
+	clip_dist_mask = outinfo->clip_dist_mask;
+	cull_dist_mask = outinfo->cull_dist_mask;
+	total_mask = clip_dist_mask | cull_dist_mask;
+	bool misc_vec_ena = outinfo->writes_pointsize ||
+			    outinfo->writes_layer ||
+			    outinfo->writes_viewport_index;
+	bool break_wave_at_eoi = false;
+
+	radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
+			       S_0286C4_VS_EXPORT_COUNT(MAX2(1, outinfo->param_exports) - 1));
+	radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT,
+			       S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP));
+	radeon_set_context_reg(ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
+			       S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+			       S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ?
+							   V_02870C_SPI_SHADER_4COMP :
+							   V_02870C_SPI_SHADER_NONE) |
+			       S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ?
+							   V_02870C_SPI_SHADER_4COMP :
+							   V_02870C_SPI_SHADER_NONE) |
+			       S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ?
+							   V_02870C_SPI_SHADER_4COMP :
+							   V_02870C_SPI_SHADER_NONE));
+
+	radeon_set_context_reg(ctx_cs, R_028818_PA_CL_VTE_CNTL,
+			       S_028818_VTX_W0_FMT(1) |
+			       S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
+			       S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
+			       S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
+	radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
+			       S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
+			       S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
+			       S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
+			       S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
+			       S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
+			       S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
+			       S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
+			       cull_dist_mask << 8 |
+			       clip_dist_mask);
+
+	/* TODO: Correctly set REUSE_OFF */
+	radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF,
+			       S_028AB4_REUSE_OFF(0));
+	radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+			       ngg_state->vgt_esgs_ring_itemsize);
+
+	/* NGG specific registers. */
+	struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
+	uint32_t gs_num_invocations = gs ? gs->info.gs.invocations : 1;
+
+	radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
+			       S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) |
+			       S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
+			       S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations));
+	radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+			       S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts));
+	radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL,
+			       S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) |
+			       S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */
+	radeon_set_context_reg(ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
+			       S_028B90_CNT(gs_num_invocations) |
+			       S_028B90_ENABLE(gs_num_invocations > 1) |
+			       S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance));
+
+	/* User edge flags are set by the pos exports. If user edge flags are
+	 * not used, we must use hw-generated edge flags and pass them via
+	 * the prim export to prevent drawing lines on internal edges of
+	 * decomposed primitives (such as quads) with polygon mode = lines.
+	 *
+	 * TODO: We should combine hw-generated edge flags with user edge
+	 * flags in the shader.
+	 */
+	radeon_set_context_reg(ctx_cs, R_028838_PA_CL_NGG_CNTL,
+			       S_028838_INDEX_BUF_EDGE_FLAG_ENA(1));
+
+	radeon_set_context_reg(ctx_cs, R_03096C_GE_CNTL,
+			       S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) |
+			       S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts) |
+			       S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi));
+}
+
 static void
 radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs,
 			     struct radv_pipeline *pipeline,
@@ -3127,7 +3464,8 @@ static void
 radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
 				     struct radeon_cmdbuf *cs,
 				     struct radv_pipeline *pipeline,
-				     const struct radv_tessellation_state *tess)
+				     const struct radv_tessellation_state *tess,
+				     const struct radv_ngg_state *ngg)
 {
 	struct radv_shader_variant *vs;
 
@@ -3140,6 +3478,8 @@ radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
 		radv_pipeline_generate_hw_ls(cs, pipeline, vs, tess);
 	else if (vs->info.vs.as_es)
 		radv_pipeline_generate_hw_es(cs, pipeline, vs);
+	else if (vs->info.is_ngg)
+		radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs, ngg);
 	else
 		radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs);
 }
@@ -3468,13 +3808,20 @@ radv_compute_vgt_shader_stages_en(const struct radv_pipeline *pipeline)
 			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
 				  S_028B54_GS_EN(1) |
 				  S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+		else if (radv_pipeline_has_ngg(pipeline))
+			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
+				  S_028B54_PRIMGEN_EN(1);
 		else
 			stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
 
-	} else if (radv_pipeline_has_gs(pipeline))
+	} else if (radv_pipeline_has_gs(pipeline)) {
 		stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
 			  S_028B54_GS_EN(1) |
 			  S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+	} else if (radv_pipeline_has_ngg(pipeline)) {
+		stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
+			  S_028B54_PRIMGEN_EN(1);
+	}
 
 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
 		stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
@@ -3555,6 +3902,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
                            const struct radv_blend_state *blend,
                            const struct radv_tessellation_state *tess,
                            const struct radv_gs_state *gs,
+                           const struct radv_ngg_state *ngg,
                            unsigned prim, unsigned gs_out)
 {
 	struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs;
@@ -3570,7 +3918,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
 	radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo);
 	radv_pipeline_generate_multisample_state(ctx_cs, pipeline);
 	radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline);
-	radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess);
+	radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess, ngg);
 	radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline, tess);
 	radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline, gs);
 	radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline);
@@ -3578,7 +3926,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
 	radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline);
 	radv_pipeline_generate_binning_state(ctx_cs, pipeline, pCreateInfo);
 
-	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10)
+	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 && !radv_pipeline_has_ngg(pipeline))
 		gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline, tess, gs);
 
 	radeon_set_context_reg(ctx_cs, R_0286E8_SPI_TMPRING_SIZE,
@@ -3848,8 +4196,12 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 		}
 	}
 
+	struct radv_ngg_state ngg = {0};
 	struct radv_gs_state gs = {0};
-	if (radv_pipeline_has_gs(pipeline)) {
+
+	if (radv_pipeline_has_ngg(pipeline)) {
+		ngg = calculate_ngg_info(pCreateInfo, pipeline);
+	} else if (radv_pipeline_has_gs(pipeline)) {
 		gs = calculate_gs_info(pCreateInfo, pipeline);
 		calculate_gs_ring_sizes(pipeline, &gs);
 	}
@@ -3885,7 +4237,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 	pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
 
 	result = radv_pipeline_scratch_init(device, pipeline);
-	radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, prim, gs_out);
+	radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, &ngg, prim, gs_out);
 
 	return result;
 }
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 3fa71905adf..fd1f8972adc 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1510,6 +1510,8 @@ static inline bool radv_pipeline_has_tess(const struct radv_pipeline *pipeline)
 	return pipeline->shaders[MESA_SHADER_TESS_CTRL] ? true : false;
 }
 
+bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline);
+
 struct radv_userdata_info *radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
 						 gl_shader_stage stage,
 						 int idx);
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 315d522b63e..d36dfbdf332 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -583,7 +583,9 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
 		config_out->rsrc1 |= S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
 		break;
 	case MESA_SHADER_VERTEX:
-		if (info->vs.as_ls) {
+		if (info->is_ngg) {
+			config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
+		} else if (info->vs.as_ls) {
 			assert(pdevice->rad_info.chip_class <= GFX8);
 			/* We need at least 2 components for LS.
 			 * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
@@ -632,8 +634,19 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
 		break;
 	}
 
-	if (pdevice->rad_info.chip_class >= GFX9 &&
-	    stage == MESA_SHADER_GEOMETRY) {
+	if (pdevice->rad_info.chip_class >= GFX10 &&
+	    stage == MESA_SHADER_VERTEX) {
+		unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
+
+		/* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */
+		es_vgpr_comp_cnt = info->info.vs.needs_instance_id ? 3 : 0;
+		gs_vgpr_comp_cnt = 3;
+
+		config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
+		config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+				     S_00B22C_LDS_SIZE(config_in->lds_size);
+	} else if (pdevice->rad_info.chip_class >= GFX9 &&
+		   stage == MESA_SHADER_GEOMETRY) {
 		unsigned es_type = info->gs.es_type;
 		unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 1ee7fea5890..acd417cdb57 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -65,6 +65,7 @@ enum {
 struct radv_vs_out_key {
 	uint32_t as_es:1;
 	uint32_t as_ls:1;
+	uint32_t as_ngg:1;
 	uint32_t export_prim_id:1;
 	uint32_t export_layer_id:1;
 	uint32_t export_clip_dists:1;
@@ -264,6 +265,7 @@ struct radv_shader_variant_info {
 	unsigned num_input_vgprs;
 	unsigned private_mem_vgprs;
 	bool need_indirect_descriptor_sets;
+	bool is_ngg;
 	struct {
 		struct {
 			struct radv_vs_output_info outinfo;
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index b3d12df4575..84e9663963b 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -317,6 +317,17 @@ si_emit_graphics(struct radv_physical_device *physical_device,
 	}
 
 	if (physical_device->rad_info.chip_class >= GFX10) {
+		/* Break up a pixel wave if it contains deallocs for more than
+		 * half the parameter cache.
+		 *
+		 * To avoid a deadlock where pixel waves aren't launched
+		 * because they're waiting for more pixels while the frontend
+		 * is stuck waiting for PC space, the maximum allowed value is
+		 * the size of the PC minus the largest possible allocation for
+		 * a single primitive shader subgroup.
+		 */
+		radeon_set_context_reg(cs, R_028C50_PA_SC_NGG_MODE_CNTL,
+				       S_028C50_MAX_DEALLOCS_IN_WAVE(512));
 		radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
 		radeon_set_context_reg(cs, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
 				       physical_device->rad_info.pa_sc_tile_steering_override);
-- 
2.30.2
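
For reference, the index-buffer primitive dword and the GS_ALLOC_REQ payload
built by build_export_prim() and build_sendmsg_gs_alloc_req() above are plain
bit-packed values. Below is a minimal host-side C sketch of the same packing;
it is illustrative only and not part of the patch (the shader builds these
values in LLVM IR, and the helper names here are hypothetical):

#include <stdbool.h>
#include <stdint.h>

/* Packs one NGG primitive as build_export_prim() does:
 * bits 0..8 / 10..18 / 20..28 hold the three vertex indices,
 * bits 9 / 19 / 29 the per-vertex edge flags, and bit 31 marks
 * a null (skipped) primitive.
 */
static uint32_t pack_ngg_prim(const uint32_t index[3],
			      const bool edgeflag[3], bool isnull)
{
	uint32_t dw = (uint32_t)isnull << 31;
	for (unsigned i = 0; i < 3; ++i) {
		dw |= (index[i] & 0x1ff) << (10 * i);
		dw |= (uint32_t)edgeflag[i] << (10 * i + 9);
	}
	return dw;
}

/* Payload of the GS_ALLOC_REQ message sent by build_sendmsg_gs_alloc_req():
 * bits 0..10 = vertices in group, bits 12..22 = primitives in group.
 */
static uint32_t pack_gs_alloc_req(uint32_t vtx_cnt, uint32_t prim_cnt)
{
	return (prim_cnt << 12) | vtx_cnt;
}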