radeonsi/gfx10: implement streamout-related queries
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index ba79af3817e2fb513dd19e4286aa70eac04dad7b..68506b7a92cb105e061a578224071af006ec6b4b 100644 (file)
@@ -83,7 +83,8 @@ static bool llvm_type_is_64bit(struct si_shader_context *ctx,
        return false;
 }
 
-static bool is_merged_shader(struct si_shader_context *ctx)
+/** Whether the shader runs as a combination of multiple API shaders */
+static bool is_multi_part_shader(struct si_shader_context *ctx)
 {
        if (ctx->screen->info.chip_class <= GFX8)
                return false;
@@ -94,6 +95,12 @@ static bool is_merged_shader(struct si_shader_context *ctx)
               ctx->type == PIPE_SHADER_GEOMETRY;
 }
 
+/** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
+static bool is_merged_shader(struct si_shader_context *ctx)
+{
+       return ctx->shader->key.as_ngg || is_multi_part_shader(ctx);
+}
+
 void si_init_function_info(struct si_function_info *fninfo)
 {
        fninfo->num_params = 0;
@@ -1067,18 +1074,25 @@ static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
                                    LLVMConstInt(ctx->i32, tf_offset, 0), "");
        }
 
+       uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                        S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                        S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+                        S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+       if (ctx->screen->info.chip_class >= GFX10)
+               rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                        S_008F0C_OOB_SELECT(3) |
+                        S_008F0C_RESOURCE_LEVEL(1);
+       else
+               rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
        LLVMValueRef desc[4];
        desc[0] = addr;
        desc[1] = LLVMConstInt(ctx->i32,
                               S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
        desc[2] = LLVMConstInt(ctx->i32, 0xffffffff, 0);
-       desc[3] = LLVMConstInt(ctx->i32,
-                              S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
-                              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0);
+       desc[3] = LLVMConstInt(ctx->i32, rsrc3, false);
 
        return ac_build_gather_values(&ctx->ac, desc, 4);
 }
@@ -2242,17 +2256,24 @@ static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *c
        desc1 = LLVMConstInt(ctx->i32,
                             S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
 
+       uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                        S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                        S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+                        S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+       if (ctx->screen->info.chip_class >= GFX10)
+               rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                        S_008F0C_OOB_SELECT(3) |
+                        S_008F0C_RESOURCE_LEVEL(1);
+       else
+               rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
        LLVMValueRef desc_elems[] = {
                desc0,
                desc1,
                LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
-               LLVMConstInt(ctx->i32,
-                       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                       S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                       S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
-                       S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
+               LLVMConstInt(ctx->i32, rsrc3, false)
        };
 
        return ac_build_gather_values(&ctx->ac, desc_elems, 4);
@@ -3394,11 +3415,15 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
 /* Pass GS inputs from ES to GS on GFX9. */
 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
 {
+       LLVMBuilderRef builder = ctx->ac.builder;
        LLVMValueRef ret = ctx->return_value;
 
        ret = si_insert_input_ptr(ctx, ret, 0, 0);
        ret = si_insert_input_ptr(ctx, ret, 1, 1);
-       ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
+       if (ctx->shader->key.as_ngg)
+               ret = LLVMBuildInsertValue(builder, ret, ctx->gs_tg_info, 2, "");
+       else
+               ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
        ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
        ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
 
@@ -3548,6 +3573,11 @@ static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
 
 static void emit_gs_epilogue(struct si_shader_context *ctx)
 {
+       if (ctx->shader->key.as_ngg) {
+               gfx10_ngg_gs_emit_epilogue(ctx);
+               return;
+       }
+
        ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
                         si_get_gs_wave_id(ctx));
 
@@ -4185,6 +4215,12 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
                                LLVMValueRef *addrs)
 {
        struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+
+       if (ctx->shader->key.as_ngg) {
+               gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
+               return;
+       }
+
        struct tgsi_shader_info *info = &ctx->shader->selector->info;
        struct si_shader *shader = ctx->shader;
        struct lp_build_if_state if_state;
@@ -4277,6 +4313,11 @@ static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
 {
        struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 
+       if (ctx->shader->key.as_ngg) {
+               LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
+               return;
+       }
+
        /* Signal primitive cut */
        ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
                         si_get_gs_wave_id(ctx));
@@ -4405,6 +4446,10 @@ static void declare_streamout_params(struct si_shader_context *ctx,
 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
 {
        switch (shader->selector->type) {
+       case PIPE_SHADER_VERTEX:
+       case PIPE_SHADER_TESS_EVAL:
+               return shader->key.as_ngg ? 128 : 0;
+
        case PIPE_SHADER_TESS_CTRL:
                /* Return this so that LLVM doesn't remove s_barrier
                 * instructions on chips where we use s_barrier. */
@@ -4502,12 +4547,23 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx,
        add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
        if (shader->key.as_ls) {
                ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
+               if (ctx->screen->info.chip_class >= GFX10) {
+                       add_arg(fninfo, ARG_VGPR, ctx->i32); /* user VGPR */
+                       add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
+               } else {
+                       add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
+                       add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
+               }
+       } else if (ctx->screen->info.chip_class == GFX10 &&
+                  !shader->is_gs_copy_shader) {
+               add_arg(fninfo, ARG_VGPR, ctx->i32); /* user vgpr */
+               add_arg(fninfo, ARG_VGPR, ctx->i32); /* user vgpr */
                add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
        } else {
                add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
                ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
+               add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
        }
-       add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
 
        if (!shader->is_gs_copy_shader) {
                /* Vertex load indices. */
@@ -4575,7 +4631,7 @@ static void create_function(struct si_shader_context *ctx)
        if (ctx->screen->info.chip_class >= GFX9) {
                if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
                        type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
-               else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
+               else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY)
                        type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
        }
 
@@ -4701,7 +4757,12 @@ static void create_function(struct si_shader_context *ctx)
                /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
                declare_per_stage_desc_pointers(ctx, &fninfo,
                                                ctx->type == PIPE_SHADER_GEOMETRY);
-               ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+
+               if (ctx->shader->key.as_ngg)
+                       add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ctx->gs_tg_info);
+               else
+                       ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+
                ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -4709,11 +4770,17 @@ static void create_function(struct si_shader_context *ctx)
                add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
 
                declare_global_desc_pointers(ctx, &fninfo);
-               declare_per_stage_desc_pointers(ctx, &fninfo,
-                                               (ctx->type == PIPE_SHADER_VERTEX ||
-                                                ctx->type == PIPE_SHADER_TESS_EVAL));
+               if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
+                       declare_per_stage_desc_pointers(ctx, &fninfo,
+                                                       (ctx->type == PIPE_SHADER_VERTEX ||
+                                                        ctx->type == PIPE_SHADER_TESS_EVAL));
+               }
+
                if (ctx->type == PIPE_SHADER_VERTEX) {
-                       declare_vs_specific_input_sgprs(ctx, &fninfo);
+                       if (vs_blit_property)
+                               declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property);
+                       else
+                               declare_vs_specific_input_sgprs(ctx, &fninfo);
                } else {
                        ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                        ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -4740,8 +4807,9 @@ static void create_function(struct si_shader_context *ctx)
                        declare_tes_input_vgprs(ctx, &fninfo);
                }
 
-               if (ctx->type == PIPE_SHADER_VERTEX ||
-                   ctx->type == PIPE_SHADER_TESS_EVAL) {
+               if (ctx->shader->key.as_es &&
+                   (ctx->type == PIPE_SHADER_VERTEX ||
+                    ctx->type == PIPE_SHADER_TESS_EVAL)) {
                        unsigned num_user_sgprs;
 
                        if (ctx->type == PIPE_SHADER_VERTEX)
@@ -5035,18 +5103,27 @@ static void preload_ring_buffers(struct si_shader_context *ctx)
                        ring = LLVMBuildInsertElement(builder, ring,
                                        LLVMConstInt(ctx->i32, num_records, 0),
                                        LLVMConstInt(ctx->i32, 2, 0), "");
+
+                       uint32_t rsrc3 =
+                                       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                                       S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                                       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+                                       S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+                                       S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
+                                       S_008F0C_ADD_TID_ENABLE(1);
+
+                       if (ctx->ac.chip_class >= GFX10) {
+                               rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                                        S_008F0C_OOB_SELECT(2) |
+                                        S_008F0C_RESOURCE_LEVEL(1);
+                       } else {
+                               rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+                                        S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
+                       }
+
                        ring = LLVMBuildInsertElement(builder, ring,
-                               LLVMConstInt(ctx->i32,
-                                            S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                                            S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                                            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                                            S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
-                                            S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                                            S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
-                                            S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
-                                            S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
-                                            S_008F0C_ADD_TID_ENABLE(1),
-                                            0),
+                               LLVMConstInt(ctx->i32, rsrc3, false),
                                LLVMConstInt(ctx->i32, 3, 0), "");
 
                        ctx->gsvs_ring[stream] = ring;
@@ -5113,7 +5190,7 @@ static bool si_shader_binary_open(struct si_screen *screen,
 
 #undef add_part
 
-       struct ac_rtld_symbol lds_symbols[1];
+       struct ac_rtld_symbol lds_symbols[2];
        unsigned num_lds_symbols = 0;
 
        if (sel && screen->info.chip_class >= GFX9 &&
@@ -5127,6 +5204,13 @@ static bool si_shader_binary_open(struct si_screen *screen,
                sym->align = 64 * 1024;
        }
 
+       if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) {
+               struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
+               sym->name = "ngg_emit";
+               sym->size = shader->ngg.ngg_emit_size * 4;
+               sym->align = 4;
+       }
+
        bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){
                        .info = &screen->info,
                        .options = {
@@ -5155,7 +5239,6 @@ static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_sh
        return rtld.rx_size;
 }
 
-
 static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
 {
        uint64_t *scratch_va = data;
@@ -5399,6 +5482,8 @@ const char *si_get_shader_name(const struct si_shader *shader, unsigned processo
                        return "Vertex Shader as LS";
                else if (shader->key.opt.vs_as_prim_discard_cs)
                        return "Vertex Shader as Primitive Discard CS";
+               else if (shader->key.as_ngg)
+                       return "Vertex Shader as ESGS";
                else
                        return "Vertex Shader as VS";
        case PIPE_SHADER_TESS_CTRL:
@@ -5406,6 +5491,8 @@ const char *si_get_shader_name(const struct si_shader *shader, unsigned processo
        case PIPE_SHADER_TESS_EVAL:
                if (shader->key.as_es)
                        return "Tessellation Evaluation Shader as ES";
+               else if (shader->key.as_ngg)
+                       return "Tessellation Evaluation Shader as ESGS";
                else
                        return "Tessellation Evaluation Shader as VS";
        case PIPE_SHADER_GEOMETRY:
@@ -5914,6 +6001,8 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
                        ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
                else if (shader->key.opt.vs_as_prim_discard_cs)
                        ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
+               else if (shader->key.as_ngg)
+                       ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
                else
                        ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
                bld_base->emit_epilogue = si_tgsi_emit_epilogue;
@@ -5937,8 +6026,12 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
                ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
                if (shader->key.as_es)
                        ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
-               else
-                       ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+               else {
+                       if (shader->key.as_ngg)
+                               ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
+                       else
+                               ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+               }
                bld_base->emit_epilogue = si_tgsi_emit_epilogue;
                break;
        case PIPE_SHADER_GEOMETRY:
@@ -5972,6 +6065,40 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
        create_function(ctx);
        preload_ring_buffers(ctx);
 
+       if (ctx->type == PIPE_SHADER_TESS_CTRL &&
+           sel->tcs_info.tessfactors_are_def_in_all_invocs) {
+               for (unsigned i = 0; i < 6; i++) {
+                       ctx->invoc0_tess_factors[i] =
+                               ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
+               }
+       }
+
+       if (ctx->type == PIPE_SHADER_GEOMETRY) {
+               for (unsigned i = 0; i < 4; i++) {
+                       ctx->gs_next_vertex[i] =
+                               ac_build_alloca(&ctx->ac, ctx->i32, "");
+               }
+               if (shader->key.as_ngg) {
+                       for (unsigned i = 0; i < 4; ++i) {
+                               ctx->gs_curprim_verts[i] =
+                                       lp_build_alloca(&ctx->gallivm, ctx->ac.i32, "");
+                               ctx->gs_generated_prims[i] =
+                                       lp_build_alloca(&ctx->gallivm, ctx->ac.i32, "");
+                       }
+
+                       LLVMTypeRef a8i32 = LLVMArrayType(ctx->i32, 8);
+                       ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
+                               a8i32, "ngg_scratch", AC_ADDR_SPACE_LDS);
+                       LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(a8i32));
+                       LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
+
+                       ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
+                               LLVMArrayType(ctx->i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
+                       LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
+                       LLVMSetAlignment(ctx->gs_ngg_emit, 4);
+               }
+       }
+
        /* For GFX9 merged shaders:
         * - Set EXEC for the first shader. If the prolog is present, set
         *   EXEC there instead.
@@ -5983,6 +6110,10 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
         *
         * For monolithic merged shaders, the first shader is wrapped in an
         * if-block together with its prolog in si_build_wrapper_function.
+        *
+        * NGG vertex and tess eval shaders running as the last
+        * vertex/geometry stage handle execution explicitly using
+        * if-statements.
         */
        if (ctx->screen->info.chip_class >= GFX9) {
                if (!shader->is_monolithic &&
@@ -5994,44 +6125,58 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
                        si_init_exec_from_input(ctx,
                                                ctx->param_merged_wave_info, 0);
                } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
-                          ctx->type == PIPE_SHADER_GEOMETRY) {
-                       if (!shader->is_monolithic)
+                          ctx->type == PIPE_SHADER_GEOMETRY ||
+                          shader->key.as_ngg) {
+                       LLVMValueRef num_threads;
+                       bool nested_barrier;
+
+                       if (!shader->is_monolithic ||
+                           (ctx->type == PIPE_SHADER_TESS_EVAL &&
+                            shader->key.as_ngg))
                                ac_init_exec_full_mask(&ctx->ac);
 
-                       LLVMValueRef num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
+                       if (ctx->type == PIPE_SHADER_TESS_CTRL ||
+                           ctx->type == PIPE_SHADER_GEOMETRY) {
+                               if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
+                                       gfx10_ngg_gs_emit_prologue(ctx);
+                                       nested_barrier = false;
+                               } else {
+                                       nested_barrier = true;
+                               }
+
+                               /* Number of patches / primitives */
+                               num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
+                       } else {
+                               /* Number of vertices */
+                               num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8);
+                               nested_barrier = false;
+                       }
+
                        LLVMValueRef ena =
                                LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
                                            ac_get_thread_id(&ctx->ac), num_threads, "");
                        lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
 
-                       /* The barrier must execute for all shaders in a
-                        * threadgroup.
-                        *
-                        * Execute the barrier inside the conditional block,
-                        * so that empty waves can jump directly to s_endpgm,
-                        * which will also signal the barrier.
-                        *
-                        * If the shader is TCS and the TCS epilog is present
-                        * and contains a barrier, it will wait there and then
-                        * reach s_endpgm.
-                        */
-                       si_llvm_emit_barrier(NULL, bld_base, NULL);
-               }
-       }
-
-       if (ctx->type == PIPE_SHADER_TESS_CTRL &&
-           sel->tcs_info.tessfactors_are_def_in_all_invocs) {
-               for (unsigned i = 0; i < 6; i++) {
-                       ctx->invoc0_tess_factors[i] =
-                               ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
-               }
-       }
-
-       if (ctx->type == PIPE_SHADER_GEOMETRY) {
-               int i;
-               for (i = 0; i < 4; i++) {
-                       ctx->gs_next_vertex[i] =
-                               ac_build_alloca(&ctx->ac, ctx->i32, "");
+                       if (nested_barrier) {
+                               /* Execute a barrier before the second shader in
+                                * a merged shader.
+                                *
+                                * Execute the barrier inside the conditional block,
+                                * so that empty waves can jump directly to s_endpgm,
+                                * which will also signal the barrier.
+                                *
+                                * This is possible in gfx9, because an empty wave
+                                * for the second shader does not participate in
+                                * the epilogue. With NGG, empty waves may still
+                                * be required to export data (e.g. GS output vertices),
+                                * so we cannot let them exit early.
+                                *
+                                * If the shader is TCS and the TCS epilog is present
+                                * and contains a barrier, it will wait there and then
+                                * reach s_endpgm.
+                                */
+                               si_llvm_emit_barrier(NULL, bld_base, NULL);
+                       }
                }
        }
 
@@ -6088,6 +6233,8 @@ static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
        } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
                key->vs_prolog.as_es = 1;
                key->vs_prolog.num_merged_next_stage_vgprs = 5;
+       } else if (shader_out->key.as_ngg) {
+               key->vs_prolog.num_merged_next_stage_vgprs = 5;
        }
 
        /* Enable loading the InstanceID VGPR. */
@@ -6539,7 +6686,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 
                /* Merged shaders are executed conditionally depending
                 * on the number of enabled threads passed in the input SGPRs. */
-               if (is_merged_shader(ctx) && part == 0) {
+               if (is_multi_part_shader(ctx) && part == 0) {
                        LLVMValueRef ena, count = initial[3];
 
                        count = LLVMBuildAnd(builder, count,
@@ -6601,7 +6748,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 
                ret = ac_build_call(&ctx->ac, parts[part], in, num_params);
 
-               if (is_merged_shader(ctx) &&
+               if (is_multi_part_shader(ctx) &&
                    part + 1 == next_shader_first_part) {
                        lp_build_endif(&if_state);
 
@@ -7180,7 +7327,10 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
        }
 
        unsigned vertex_id_vgpr = first_vs_vgpr;
-       unsigned instance_id_vgpr = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
+       unsigned instance_id_vgpr =
+               ctx->screen->info.chip_class >= GFX10 ?
+                       first_vs_vgpr + 3 :
+                       first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
 
        ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
        ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
@@ -7216,6 +7366,21 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                           key->vs_prolog.num_input_sgprs + i, "");
        }
 
+       struct lp_build_if_state wrap_if_state;
+       LLVMValueRef original_ret = ret;
+       bool wrapped = false;
+
+       if (key->vs_prolog.is_monolithic && key->vs_prolog.as_ngg) {
+               LLVMValueRef num_threads;
+               LLVMValueRef ena;
+
+               num_threads = si_unpack_param(ctx, 3, 0, 8);
+               ena = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+                                       ac_get_thread_id(&ctx->ac), num_threads, "");
+               lp_build_if(&wrap_if_state, &ctx->gallivm, ena);
+               wrapped = true;
+       }
+
        /* Compute vertex load indices from instance divisors. */
        LLVMValueRef instance_divisor_constbuf = NULL;
 
@@ -7271,6 +7436,20 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                           fninfo.num_params + i, "");
        }
 
+       if (wrapped) {
+               lp_build_endif(&wrap_if_state);
+
+               LLVMValueRef values[2] = {
+                       ret,
+                       original_ret
+               };
+               LLVMBasicBlockRef bbs[2] = {
+                       wrap_if_state.true_block,
+                       wrap_if_state.entry_block
+               };
+               ret = ac_build_phi(&ctx->ac, LLVMTypeOf(ret), 2, values, bbs);
+       }
+
        si_llvm_build_ret(ctx, ret);
 }
 
@@ -8093,8 +8272,12 @@ bool si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compil
                si_calculate_max_simd_waves(shader);
        }
 
-       if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY)
+       if (shader->key.as_ngg) {
+               assert(!shader->key.as_es && !shader->key.as_ls);
+               gfx10_ngg_calculate_subgroup_info(shader);
+       } else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) {
                gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
+       }
 
        si_fix_resource_usage(sscreen, shader);
        si_shader_dump(sscreen, shader, debug, sel->info.processor,