radv/gfx10: implement NGG support (VS only)
authorSamuel Pitoiset <samuel.pitoiset@gmail.com>
Fri, 5 Jul 2019 06:33:06 +0000 (08:33 +0200)
committerBas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Sun, 7 Jul 2019 15:51:32 +0000 (17:51 +0200)
This needs to be cleaned up a bit, and it probably contains
missing stuff and/or bugs.

This doesn't fix the "half of the triangles" issue.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
src/amd/vulkan/radv_nir_to_llvm.c
src/amd/vulkan/radv_pipeline.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/radv_shader.c
src/amd/vulkan/radv_shader.h
src/amd/vulkan/si_cmd_buffer.c

index 7c0275be7cf35f9e9352471b1510e79375595e13..51414be2304a9a298f678be276ea3f08f125c570 100644 (file)
@@ -70,6 +70,13 @@ struct radv_shader_context {
        LLVMValueRef tes_u;
        LLVMValueRef tes_v;
 
+       /* HW GS */
+       /* On gfx10:
+        *  - bits 0..10: ordered_wave_id
+        *  - bits 12..20: number of vertices in group
+        *  - bits 22..30: number of primitives in group
+        */
+       LLVMValueRef gs_tg_info;
        LLVMValueRef gs2vs_offset;
        LLVMValueRef gs_wave_id;
        LLVMValueRef gs_vtx_offset[6];
@@ -823,11 +830,18 @@ declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args)
                if (ctx->options->key.vs.out.as_ls) {
                        add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id);
                        add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+                       add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
                } else {
-                       add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
-                       add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
+                       if (ctx->ac.chip_class >= GFX10) {
+                               add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+                               add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+                               add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+                       } else {
+                               add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+                               add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
+                               add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
+                       }
                }
-               add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
        }
 }
 
@@ -969,6 +983,12 @@ static void set_llvm_calling_convention(LLVMValueRef func,
        LLVMSetFunctionCallConv(func, calling_conv);
 }
 
+/* Returns whether the stage is a stage that can be directly before the GS */
+static bool is_pre_gs_stage(gl_shader_stage stage)
+{
+       return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
+}
+
 static void create_function(struct radv_shader_context *ctx,
                             gl_shader_stage stage,
                             bool has_previous_stage,
@@ -987,6 +1007,15 @@ static void create_function(struct radv_shader_context *ctx,
                        &ctx->ring_offsets);
        }
 
+       if (ctx->ac.chip_class >= GFX10) {
+               if (stage == MESA_SHADER_VERTEX && ctx->options->key.vs.out.as_ngg) {
+                       /* On GFX10, VS is merged into GS for NGG. */
+                       stage = MESA_SHADER_GEOMETRY;
+                       has_previous_stage = true;
+                       previous_stage = MESA_SHADER_VERTEX;
+               }
+       }
+
        switch (stage) {
        case MESA_SHADER_COMPUTE:
                declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
@@ -1101,8 +1130,14 @@ static void create_function(struct radv_shader_context *ctx,
        case MESA_SHADER_GEOMETRY:
                if (has_previous_stage) {
                        // First 6 system regs
-                       add_arg(&args, ARG_SGPR, ctx->ac.i32,
-                               &ctx->gs2vs_offset);
+                       if (ctx->options->key.vs.out.as_ngg) {
+                               add_arg(&args, ARG_SGPR, ctx->ac.i32,
+                                       &ctx->gs_tg_info);
+                       } else {
+                               add_arg(&args, ARG_SGPR, ctx->ac.i32,
+                                       &ctx->gs2vs_offset);
+                       }
+
                        add_arg(&args, ARG_SGPR, ctx->ac.i32,
                                &ctx->merged_wave_info);
                        add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
@@ -3194,6 +3229,168 @@ handle_ls_outputs_post(struct radv_shader_context *ctx)
        }
 }
 
+static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx)
+{
+       return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4);
+}
+
+static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx)
+{
+       return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+                           LLVMConstInt(ctx->ac.i32, 12, false),
+                           LLVMConstInt(ctx->ac.i32, 9, false),
+                           false);
+}
+
+static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx)
+{
+       return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+                           LLVMConstInt(ctx->ac.i32, 22, false),
+                           LLVMConstInt(ctx->ac.i32, 9, false),
+                           false);
+}
+
+/* Send GS Alloc Req message from the first wave of the group to SPI.
+ * Message payload is:
+ * - bits 0..10: vertices in group
+ * - bits 12..22: primitives in group
+ */
+static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx,
+                                      LLVMValueRef vtx_cnt,
+                                      LLVMValueRef prim_cnt)
+{
+       LLVMBuilderRef builder = ctx->ac.builder;
+       LLVMValueRef tmp;
+
+       tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+       ac_build_ifcc(&ctx->ac, tmp, 5020);
+
+       tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),"");
+       tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
+       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp);
+
+       ac_build_endif(&ctx->ac, 5020);
+}
+
+struct ngg_prim {
+       unsigned num_vertices;
+       LLVMValueRef isnull;
+       LLVMValueRef index[3];
+       LLVMValueRef edgeflag[3];
+};
+
+static void build_export_prim(struct radv_shader_context *ctx,
+                             const struct ngg_prim *prim)
+{
+       LLVMBuilderRef builder = ctx->ac.builder;
+       struct ac_export_args args;
+       LLVMValueRef tmp;
+
+       tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, "");
+       args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), "");
+
+       for (unsigned i = 0; i < prim->num_vertices; ++i) {
+               tmp = LLVMBuildShl(builder, prim->index[i],
+                                  LLVMConstInt(ctx->ac.i32, 10 * i, false), "");
+               args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+               tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, "");
+               tmp = LLVMBuildShl(builder, tmp,
+                                  LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), "");
+               args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+       }
+
+       args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, "");
+       args.out[1] = LLVMGetUndef(ctx->ac.f32);
+       args.out[2] = LLVMGetUndef(ctx->ac.f32);
+       args.out[3] = LLVMGetUndef(ctx->ac.f32);
+
+       args.target = V_008DFC_SQ_EXP_PRIM;
+       args.enabled_channels = 1;
+       args.done = true;
+       args.valid_mask = false;
+       args.compr = false;
+
+       ac_build_export(&ctx->ac, &args);
+}
+
+static void
+handle_ngg_outputs_post(struct radv_shader_context *ctx)
+{
+       LLVMBuilderRef builder = ctx->ac.builder;
+       struct ac_build_if_state if_state;
+       unsigned num_vertices = 3;
+       LLVMValueRef tmp;
+
+       assert(ctx->stage == MESA_SHADER_VERTEX && !ctx->is_gs_copy_shader);
+
+       LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8);
+       LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 0, 8);
+       LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT,
+                                                 ac_get_thread_id(&ctx->ac), prims_in_wave, "");
+       LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT,
+                                                 ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
+       LLVMValueRef vtxindex[] = {
+               ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 0, 16),
+               ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 16, 16),
+               ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16),
+       };
+
+       /* TODO: streamout */
+
+       /* TODO: VS primitive ID */
+       if (ctx->options->key.vs.out.export_prim_id)
+               assert(0);
+
+       /* TODO: primitive culling */
+
+       build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
+
+       /* TODO: streamout queries */
+       /* Export primitive data to the index buffer. Format is:
+        *  - bits 0..8: index 0
+        *  - bit 9: edge flag 0
+        *  - bits 10..18: index 1
+        *  - bit 19: edge flag 1
+        *  - bits 20..28: index 2
+        *  - bit 29: edge flag 2
+        *  - bit 31: null primitive (skip)
+        *
+        * For the first version, we will always build up all three indices
+        * independent of the primitive type. The additional garbage data
+        * shouldn't hurt.
+        *
+        * TODO: culling depends on the primitive type, so can have some
+        * interaction here.
+        */
+       ac_nir_build_if(&if_state, ctx, is_gs_thread);
+       {
+               struct ngg_prim prim = {};
+
+               prim.num_vertices = num_vertices;
+               prim.isnull = ctx->ac.i1false;
+               memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
+
+               for (unsigned i = 0; i < num_vertices; ++i) {
+                       tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id,
+                                           LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
+                       prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+               }
+
+               build_export_prim(ctx, &prim);
+       }
+       ac_nir_build_endif(&if_state);
+
+       /* Export per-vertex data (positions and parameters). */
+       ac_nir_build_if(&if_state, ctx, is_es_thread);
+       {
+               handle_vs_outputs_post(ctx, ctx->options->key.vs.out.export_prim_id,
+                                      ctx->options->key.vs.out.export_layer_id,
+                                      ctx->options->key.vs.out.export_clip_dists,
+                                      &ctx->shader_info->vs.outinfo);
+       }
+       ac_nir_build_endif(&if_state);
+}
+
 static void
 write_tess_factors(struct radv_shader_context *ctx)
 {
@@ -3452,6 +3649,8 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
                        handle_ls_outputs_post(ctx);
                else if (ctx->options->key.vs.out.as_es)
                        handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
+               else if (ctx->options->key.vs.out.as_ngg)
+                       handle_ngg_outputs_post(ctx);
                else
                        handle_vs_outputs_post(ctx, ctx->options->key.vs.out.export_prim_id,
                                               ctx->options->key.vs.out.export_layer_id,
@@ -3703,6 +3902,13 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                                                                            shaders[i]));
        }
 
+       if (ctx.ac.chip_class >= GFX10) {
+               if (shaders[0]->info.stage == MESA_SHADER_VERTEX &&
+                   options->key.vs.out.as_ngg) {
+                       ctx.max_workgroup_size = 128;
+               }
+       }
+
        create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2,
                        shader_count >= 2 ? shaders[shader_count - 2]->info.stage  : MESA_SHADER_VERTEX);
 
@@ -3722,7 +3928,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
         */
        ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900;
 
-       if (shader_count >= 2)
+       bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) &&  ctx.options->key.vs.out.as_ngg;
+       if (shader_count >= 2 || is_ngg)
                ac_init_exec_full_mask(&ctx.ac);
 
        if ((ctx.ac.family == CHIP_VEGA10 ||
@@ -3788,7 +3995,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                ac_setup_rings(&ctx);
 
                LLVMBasicBlockRef merge_block;
-               if (shader_count >= 2) {
+               if (shader_count >= 2 || is_ngg) {
                        LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
                        LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
                        merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
@@ -3811,7 +4018,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 
                ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i]);
 
-               if (shader_count >= 2) {
+               if (shader_count >= 2 || is_ngg) {
                        LLVMBuildBr(ctx.ac.builder, merge_block);
                        LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
                }
@@ -3955,6 +4162,7 @@ ac_fill_shader_info(struct radv_shader_variant_info *shader_info, struct nir_sha
                 shader_info->vs.as_es = options->key.vs.out.as_es;
                 shader_info->vs.as_ls = options->key.vs.out.as_ls;
                 shader_info->vs.export_prim_id = options->key.vs.out.export_prim_id;
+                shader_info->is_ngg = options->key.vs.out.as_ngg;
                 break;
         default:
                 break;
index ae08d57677fbfce5b7d5998cf49a3e9d8dfaa041..cc5f339f34fb44cdc653781208010f776067ecbd 100644 (file)
@@ -96,6 +96,30 @@ struct radv_gs_state {
        uint32_t lds_size;
 };
 
+struct radv_ngg_state {
+       uint16_t ngg_emit_size; /* in dwords */
+       uint32_t hw_max_esverts;
+       uint32_t max_gsprims;
+       uint32_t max_out_verts;
+       uint32_t prim_amp_factor;
+       uint32_t vgt_esgs_ring_itemsize;
+       bool max_vert_out_per_gs_instance;
+};
+
+bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline)
+{
+       struct radv_shader_variant *variant = NULL;
+       if (pipeline->shaders[MESA_SHADER_GEOMETRY])
+               variant = pipeline->shaders[MESA_SHADER_GEOMETRY];
+       else if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
+               variant = pipeline->shaders[MESA_SHADER_TESS_EVAL];
+       else if (pipeline->shaders[MESA_SHADER_VERTEX])
+               variant = pipeline->shaders[MESA_SHADER_VERTEX];
+       else
+               return false;
+       return variant->info.is_ngg;
+}
+
 static void
 radv_pipeline_destroy(struct radv_device *device,
                       struct radv_pipeline *pipeline,
@@ -1583,6 +1607,203 @@ calculate_gs_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
        return gs;
 }
 
+static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
+                                    unsigned min_verts_per_prim, bool use_adjacency)
+{
+       unsigned max_reuse = max_esverts - min_verts_per_prim;
+       if (use_adjacency)
+               max_reuse /= 2;
+       *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
+}
+
+static struct radv_ngg_state
+calculate_ngg_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
+                  struct radv_pipeline *pipeline)
+{
+       struct radv_ngg_state ngg = {0};
+       struct radv_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
+       struct radv_es_output_info *es_info =
+               radv_pipeline_has_tess(pipeline) ? &gs_info->tes.es_info : &gs_info->vs.es_info;
+       unsigned gs_type = MESA_SHADER_VERTEX;
+       unsigned max_verts_per_prim = 3; // triangles
+       unsigned min_verts_per_prim =
+               gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
+       unsigned gs_num_invocations = 1;//MAX2(gs_info->gs.invocations, 1);
+       bool uses_adjacency;
+       switch(pCreateInfo->pInputAssemblyState->topology) {
+       case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+       case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+       case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+       case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+               uses_adjacency = true;
+               break;
+       default:
+               uses_adjacency = false;
+               break;
+       }
+
+       /* All these are in dwords: */
+       /* We can't allow using the whole LDS, because GS waves compete with
+        * other shader stages for LDS space.
+        *
+        * Streamout can increase the ESGS buffer size later on, so be more
+        * conservative with streamout and use 4K dwords. This may be suboptimal.
+        *
+        * Otherwise, use the limit of 7K dwords. The reason is that we need
+        * to leave some headroom for the max_esverts increase at the end.
+        *
+        * TODO: We should really take the shader's internal LDS use into
+        *       account. The linker will fail if the size is greater than
+        *       8K dwords.
+        */
+       const unsigned max_lds_size = (0 /*gs_info->info.so.num_outputs*/ ? 4 : 7) * 1024 - 128;
+       const unsigned target_lds_size = max_lds_size;
+       unsigned esvert_lds_size = 0;
+       unsigned gsprim_lds_size = 0;
+
+       /* All these are per subgroup: */
+       bool max_vert_out_per_gs_instance = false;
+       unsigned max_esverts_base = 256;
+       unsigned max_gsprims_base = 128; /* default prim group size clamp */
+
+       /* Hardware has the following non-natural restrictions on the value
+        * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
+        * the draw:
+        *  - at most 252 for any line input primitive type
+        *  - at most 251 for any quad input primitive type
+        *  - at most 251 for triangle strips with adjacency (this happens to
+        *    be the natural limit for triangle *lists* with adjacency)
+        */
+       max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
+
+       if (gs_type == MESA_SHADER_GEOMETRY) {
+               unsigned max_out_verts_per_gsprim =
+                       gs_info->gs.vertices_out * gs_num_invocations;
+
+               if (max_out_verts_per_gsprim <= 256) {
+                       if (max_out_verts_per_gsprim) {
+                               max_gsprims_base = MIN2(max_gsprims_base,
+                                                       256 / max_out_verts_per_gsprim);
+                       }
+               } else {
+                       /* Use special multi-cycling mode in which each GS
+                        * instance gets its own subgroup. Does not work with
+                        * tessellation. */
+                       max_vert_out_per_gs_instance = true;
+                       max_gsprims_base = 1;
+                       max_out_verts_per_gsprim = gs_info->gs.vertices_out;
+               }
+
+               esvert_lds_size = es_info->esgs_itemsize / 4;
+               gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
+       } else {
+               /* TODO: This needs to be adjusted once LDS use for compaction
+                * after culling is implemented. */
+               /*
+               if (es_info->info.so.num_outputs)
+                       esvert_lds_size = 4 * es_info->info.so.num_outputs + 1;
+               */
+       }
+
+       unsigned max_gsprims = max_gsprims_base;
+       unsigned max_esverts = max_esverts_base;
+
+       if (esvert_lds_size)
+               max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
+       if (gsprim_lds_size)
+               max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
+
+       max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+       clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
+       assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+
+       if (esvert_lds_size || gsprim_lds_size) {
+               /* Now that we have a rough proportionality between esverts
+                * and gsprims based on the primitive type, scale both of them
+                * down simultaneously based on required LDS space.
+                *
+                * We could be smarter about this if we knew how much vertex
+                * reuse to expect.
+                */
+               unsigned lds_total = max_esverts * esvert_lds_size +
+                                    max_gsprims * gsprim_lds_size;
+               if (lds_total > target_lds_size) {
+                       max_esverts = max_esverts * target_lds_size / lds_total;
+                       max_gsprims = max_gsprims * target_lds_size / lds_total;
+
+                       max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+                       clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
+                                                min_verts_per_prim, uses_adjacency);
+                       assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+               }
+       }
+
+       /* Round up towards full wave sizes for better ALU utilization. */
+       if (!max_vert_out_per_gs_instance) {
+               const unsigned wavesize = 64;
+               unsigned orig_max_esverts;
+               unsigned orig_max_gsprims;
+               do {
+                       orig_max_esverts = max_esverts;
+                       orig_max_gsprims = max_gsprims;
+
+                       max_esverts = align(max_esverts, wavesize);
+                       max_esverts = MIN2(max_esverts, max_esverts_base);
+                       if (esvert_lds_size)
+                               max_esverts = MIN2(max_esverts,
+                                                  (max_lds_size - max_gsprims * gsprim_lds_size) /
+                                                  esvert_lds_size);
+                       max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+
+                       max_gsprims = align(max_gsprims, wavesize);
+                       max_gsprims = MIN2(max_gsprims, max_gsprims_base);
+                       if (gsprim_lds_size)
+                               max_gsprims = MIN2(max_gsprims,
+                                                  (max_lds_size - max_esverts * esvert_lds_size) /
+                                                  gsprim_lds_size);
+                       clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
+                                                min_verts_per_prim, uses_adjacency);
+                       assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+               } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
+       }
+
+       /* Hardware restriction: minimum value of max_esverts */
+       max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);
+
+       unsigned max_out_vertices =
+               max_vert_out_per_gs_instance ? gs_info->gs.vertices_out :
+               gs_type == MESA_SHADER_GEOMETRY ?
+               max_gsprims * gs_num_invocations * gs_info->gs.vertices_out :
+               max_esverts;
+       assert(max_out_vertices <= 256);
+
+       unsigned prim_amp_factor = 1;
+       if (gs_type == MESA_SHADER_GEOMETRY) {
+               /* Number of output primitives per GS input primitive after
+                * GS instancing. */
+               prim_amp_factor = gs_info->gs.vertices_out;
+       }
+
+       /* The GE only checks against the maximum number of ES verts after
+        * allocating a full GS primitive. So we need to ensure that whenever
+        * this check passes, there is enough space for a full primitive without
+        * vertex reuse.
+        */
+       ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
+       ngg.max_gsprims = max_gsprims;
+       ngg.max_out_verts = max_out_vertices;
+       ngg.prim_amp_factor = prim_amp_factor;
+       ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
+       ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
+       ngg.vgt_esgs_ring_itemsize = 1;
+
+       pipeline->graphics.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
+
+       assert(ngg.hw_max_esverts >= 24); /* HW limitation */
+
+       return ngg;
+}
+
 static void
 calculate_gs_ring_sizes(struct radv_pipeline *pipeline, const struct radv_gs_state *gs)
 {
@@ -2000,7 +2221,8 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline,
 }
 
 static void
-radv_fill_shader_keys(struct radv_shader_variant_key *keys,
+radv_fill_shader_keys(struct radv_device *device,
+                     struct radv_shader_variant_key *keys,
                       const struct radv_pipeline_key *key,
                       nir_shader **nir)
 {
@@ -2031,6 +2253,10 @@ radv_fill_shader_keys(struct radv_shader_variant_key *keys,
                        keys[MESA_SHADER_VERTEX].vs.out.as_es = true;
        }
 
+       if (device->physical_device->rad_info.chip_class >= GFX10) {
+               keys[MESA_SHADER_VERTEX].vs.out.as_ngg = true;
+       }
+
        for(int i = 0; i < MESA_SHADER_STAGES; ++i)
                keys[i].has_multiview_view_index = key->has_multiview_view_index;
 
@@ -2221,7 +2447,7 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
                        nir_print_shader(nir[i], stderr);
        }
 
-       radv_fill_shader_keys(keys, key, nir);
+       radv_fill_shader_keys(device, keys, key, nir);
 
        if (nir[MESA_SHADER_FRAGMENT]) {
                if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) {
@@ -2356,6 +2582,8 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
 {
        bool has_gs = radv_pipeline_has_gs(pipeline);
        bool has_tess = radv_pipeline_has_tess(pipeline);
+       bool has_ngg = radv_pipeline_has_ngg(pipeline);
+
        switch (stage) {
        case MESA_SHADER_FRAGMENT:
                return R_00B030_SPI_SHADER_USER_DATA_PS_0;
@@ -2379,6 +2607,9 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
                        }
                }
 
+               if (has_ngg)
+                       return R_00B230_SPI_SHADER_USER_DATA_GS_0;
+
                return R_00B130_SPI_SHADER_USER_DATA_VS_0;
        case MESA_SHADER_GEOMETRY:
                return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 :
@@ -2968,8 +3199,7 @@ radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
                                    struct radv_pipeline *pipeline)
 {
        const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
-
-       uint32_t vgt_primitiveid_en = false;
+       unsigned vgt_primitiveid_en = 0;
        uint32_t vgt_gs_mode = 0;
 
        if (radv_pipeline_has_gs(pipeline)) {
@@ -2978,9 +3208,17 @@ radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
 
                vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out,
                                             pipeline->device->physical_device->rad_info.chip_class);
+       } else if (radv_pipeline_has_ngg(pipeline)) {
+               const struct radv_shader_variant *vs =
+                       pipeline->shaders[MESA_SHADER_VERTEX];
+               bool enable_prim_id =
+                       outinfo->export_prim_id || vs->info.info.uses_prim_id;
+
+               vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(enable_prim_id) |
+                                     S_028A84_NGG_DISABLE_PROVOK_REUSE(enable_prim_id);
        } else if (outinfo->export_prim_id) {
                vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
-               vgt_primitiveid_en = true;
+               vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
        }
 
        radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
@@ -3084,6 +3322,105 @@ radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs,
        radeon_emit(cs, rsrc2);
 }
 
+static void
+radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs,
+                             struct radeon_cmdbuf *cs,
+                             struct radv_pipeline *pipeline,
+                             struct radv_shader_variant *shader,
+                             const struct radv_ngg_state *ngg_state)
+{
+       uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
+
+       radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2);
+       radeon_emit(cs, va >> 8);
+       radeon_emit(cs, va >> 40);
+       radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
+       radeon_emit(cs, shader->config.rsrc1);
+       radeon_emit(cs, shader->config.rsrc2);
+
+       const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
+       unsigned clip_dist_mask, cull_dist_mask, total_mask;
+       clip_dist_mask = outinfo->clip_dist_mask;
+       cull_dist_mask = outinfo->cull_dist_mask;
+       total_mask = clip_dist_mask | cull_dist_mask;
+       bool misc_vec_ena = outinfo->writes_pointsize ||
+               outinfo->writes_layer ||
+               outinfo->writes_viewport_index;
+       bool break_wave_at_eoi = false;
+
+       radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
+                              S_0286C4_VS_EXPORT_COUNT(MAX2(1, outinfo->param_exports) - 1));
+       radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT,
+                              S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP));
+       radeon_set_context_reg(ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
+                              S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+                              S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ?
+                                                          V_02870C_SPI_SHADER_4COMP :
+                                                          V_02870C_SPI_SHADER_NONE) |
+                              S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ?
+                                                          V_02870C_SPI_SHADER_4COMP :
+                                                          V_02870C_SPI_SHADER_NONE) |
+                              S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ?
+                                                          V_02870C_SPI_SHADER_4COMP :
+                                                          V_02870C_SPI_SHADER_NONE));
+
+       radeon_set_context_reg(ctx_cs, R_028818_PA_CL_VTE_CNTL,
+                              S_028818_VTX_W0_FMT(1) |
+                              S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
+                              S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
+                              S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
+       radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
+                              S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
+                              S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
+                              S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
+                              S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
+                              S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
+                              S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
+                              S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
+                              cull_dist_mask << 8 |
+                              clip_dist_mask);
+
+       /* TODO: Correctly set REUSE_OFF */
+       radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF,
+                              S_028AB4_REUSE_OFF(0));
+       radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+                              ngg_state->vgt_esgs_ring_itemsize);
+
+       /* NGG specific registers. */
+       struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
+       uint32_t gs_num_invocations = gs ? gs->info.gs.invocations : 1;
+
+       radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
+                              S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) |
+                              S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
+                              S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations));
+       radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+                              S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts));
+       radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL,
+                              S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) |
+                              S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */
+       radeon_set_context_reg(ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
+                              S_028B90_CNT(gs_num_invocations) |
+                              S_028B90_ENABLE(gs_num_invocations > 1) |
+                              S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance));
+
+       /* User edge flags are set by the pos exports. If user edge flags are
+        * not used, we must use hw-generated edge flags and pass them via
+        * the prim export to prevent drawing lines on internal edges of
+        * decomposed primitives (such as quads) with polygon mode = lines.
+        *
+        * TODO: We should combine hw-generated edge flags with user edge
+        *       flags in the shader.
+        */
+       radeon_set_context_reg(ctx_cs, R_028838_PA_CL_NGG_CNTL,
+                              S_028838_INDEX_BUF_EDGE_FLAG_ENA(1));
+
+       radeon_set_context_reg(ctx_cs, R_03096C_GE_CNTL,
+                              S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) |
+                              S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts) |
+                              S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi));
+}
+
 static void
 radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs,
                             struct radv_pipeline *pipeline,
@@ -3127,7 +3464,8 @@ static void
 radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
                                     struct radeon_cmdbuf *cs,
                                     struct radv_pipeline *pipeline,
-                                    const struct radv_tessellation_state *tess)
+                                    const struct radv_tessellation_state *tess,
+                                    const struct radv_ngg_state *ngg)
 {
        struct radv_shader_variant *vs;
 
@@ -3140,6 +3478,8 @@ radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
                radv_pipeline_generate_hw_ls(cs, pipeline, vs, tess);
        else if (vs->info.vs.as_es)
                radv_pipeline_generate_hw_es(cs, pipeline, vs);
+       else if (vs->info.is_ngg)
+               radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs, ngg);
        else
                radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs);
 }
@@ -3468,13 +3808,20 @@ radv_compute_vgt_shader_stages_en(const struct radv_pipeline *pipeline)
                        stages |=  S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
                                S_028B54_GS_EN(1) |
                                S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+               else if (radv_pipeline_has_ngg(pipeline))
+                       stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
+                                 S_028B54_PRIMGEN_EN(1);
                else
                        stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
 
-       } else if (radv_pipeline_has_gs(pipeline))
+       } else if (radv_pipeline_has_gs(pipeline)) {
                stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
                        S_028B54_GS_EN(1) |
                        S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+       } else if (radv_pipeline_has_ngg(pipeline)) {
+               stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
+                         S_028B54_PRIMGEN_EN(1);
+       }
 
        if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
                stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
@@ -3555,6 +3902,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
                            const struct radv_blend_state *blend,
                            const struct radv_tessellation_state *tess,
                            const struct radv_gs_state *gs,
+                           const struct radv_ngg_state *ngg,
                            unsigned prim, unsigned gs_out)
 {
        struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs;
@@ -3570,7 +3918,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
        radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo);
        radv_pipeline_generate_multisample_state(ctx_cs, pipeline);
        radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline);
-       radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess);
+       radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess, ngg);
        radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline, tess);
        radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline, gs);
        radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline);
@@ -3578,7 +3926,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
        radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline);
        radv_pipeline_generate_binning_state(ctx_cs, pipeline, pCreateInfo);
 
-       if (pipeline->device->physical_device->rad_info.chip_class >= GFX10)
+       if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 && !radv_pipeline_has_ngg(pipeline))
                gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline, tess, gs);
 
        radeon_set_context_reg(ctx_cs, R_0286E8_SPI_TMPRING_SIZE,
@@ -3848,8 +4196,12 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
                }
        }
 
+       struct radv_ngg_state ngg = {0};
        struct radv_gs_state gs = {0};
-       if (radv_pipeline_has_gs(pipeline)) {
+
+       if (radv_pipeline_has_ngg(pipeline)) {
+               ngg = calculate_ngg_info(pCreateInfo, pipeline);
+       } else if (radv_pipeline_has_gs(pipeline)) {
                gs = calculate_gs_info(pCreateInfo, pipeline);
                calculate_gs_ring_sizes(pipeline, &gs);
        }
@@ -3885,7 +4237,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
        pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
 
        result = radv_pipeline_scratch_init(device, pipeline);
-       radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, prim, gs_out);
+       radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, &ngg, prim, gs_out);
 
        return result;
 }
index 3fa71905adfe917a30781140c60e86edc6335bff..fd1f8972adc87f7cb1c5e899ab469a44e108fa1b 100644 (file)
@@ -1510,6 +1510,8 @@ static inline bool radv_pipeline_has_tess(const struct radv_pipeline *pipeline)
        return pipeline->shaders[MESA_SHADER_TESS_CTRL] ? true : false;
 }
 
+bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline);
+
 struct radv_userdata_info *radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
                                                 gl_shader_stage stage,
                                                 int idx);
index 315d522b63ea72ddbe9a889e21aea5ff29788411..d36dfbdf332c6cec921167c3f21ff78f2186ab27 100644 (file)
@@ -583,7 +583,9 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
                config_out->rsrc1 |= S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
                break;
        case MESA_SHADER_VERTEX:
-               if (info->vs.as_ls) {
+               if (info->is_ngg) {
+                       config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
+               } else if (info->vs.as_ls) {
                        assert(pdevice->rad_info.chip_class <= GFX8);
                        /* We need at least 2 components for LS.
                         * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
@@ -632,8 +634,19 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
                break;
        }
 
-       if (pdevice->rad_info.chip_class >= GFX9 &&
-           stage == MESA_SHADER_GEOMETRY) {
+       if (pdevice->rad_info.chip_class >= GFX10 &&
+           stage == MESA_SHADER_VERTEX) {
+               unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
+
+               /* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */
+               es_vgpr_comp_cnt = info->info.vs.needs_instance_id ? 3 : 0;
+               gs_vgpr_comp_cnt = 3;
+
+               config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
+               config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+                                    S_00B22C_LDS_SIZE(config_in->lds_size);
+       } else if (pdevice->rad_info.chip_class >= GFX9 &&
+                  stage == MESA_SHADER_GEOMETRY) {
                unsigned es_type = info->gs.es_type;
                unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
 
index 1ee7fea5890ad4d627c46a712375f31dfccb7afc..acd417cdb571367f6d564b91906f861ca1e77f5b 100644 (file)
@@ -65,6 +65,7 @@ enum {
 struct radv_vs_out_key {
        uint32_t as_es:1;
        uint32_t as_ls:1;
+       uint32_t as_ngg:1;
        uint32_t export_prim_id:1;
        uint32_t export_layer_id:1;
        uint32_t export_clip_dists:1;
@@ -264,6 +265,7 @@ struct radv_shader_variant_info {
        unsigned num_input_vgprs;
        unsigned private_mem_vgprs;
        bool need_indirect_descriptor_sets;
+       bool is_ngg;
        struct {
                struct {
                        struct radv_vs_output_info outinfo;
index b3d12df45755f1b6b9c301fe04ba20840ff732b8..84e9663963bc7a723826c1d03c86555b8c7c6a73 100644 (file)
@@ -317,6 +317,17 @@ si_emit_graphics(struct radv_physical_device *physical_device,
        }
 
        if (physical_device->rad_info.chip_class >= GFX10) {
+               /* Break up a pixel wave if it contains deallocs for more than
+                * half the parameter cache.
+                *
+                * To avoid a deadlock where pixel waves aren't launched
+                * because they're waiting for more pixels while the frontend
+                * is stuck waiting for PC space, the maximum allowed value is
+                * the size of the PC minus the largest possible allocation for
+                * a single primitive shader subgroup.
+                */
+               radeon_set_context_reg(cs, R_028C50_PA_SC_NGG_MODE_CNTL,
+                                      S_028C50_MAX_DEALLOCS_IN_WAVE(512));
                radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
                radeon_set_context_reg(cs, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
                                       physical_device->rad_info.pa_sc_tile_steering_override);