radeonsi: move non-LLVM code out of si_shader_llvm.c
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index 24f744ba5cd99ab388a283922dfcfe9491cb8beb..9f8be2b72142be8248cdd497485631720bdc3c0e 100644 (file)
@@ -160,12 +160,12 @@ static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
 
        if (rshift)
                value = LLVMBuildLShr(ctx->ac.builder, value,
-                                     LLVMConstInt(ctx->i32, rshift, 0), "");
+                                     LLVMConstInt(ctx->ac.i32, rshift, 0), "");
 
        if (rshift + bitwidth < 32) {
                unsigned mask = (1 << bitwidth) - 1;
                value = LLVMBuildAnd(ctx->ac.builder, value,
-                                    LLVMConstInt(ctx->i32, mask, 0), "");
+                                    LLVMConstInt(ctx->ac.i32, mask, 0), "");
        }
 
        return value;
@@ -187,12 +187,12 @@ static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
 
        if (index == 1)
                return LLVMBuildAShr(ctx->ac.builder, i32,
-                                    LLVMConstInt(ctx->i32, 16, 0), "");
+                                    LLVMConstInt(ctx->ac.i32, 16, 0), "");
 
        return LLVMBuildSExt(ctx->ac.builder,
                             LLVMBuildTrunc(ctx->ac.builder, i32,
                                            ctx->ac.i16, ""),
-                            ctx->i32, "");
+                            ctx->ac.i32, "");
 }
 
 void si_llvm_load_input_vs(
@@ -207,13 +207,13 @@ void si_llvm_load_input_vs(
                LLVMValueRef vertex_id = ctx->abi.vertex_id;
                LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
                                                    LLVMIntULE, vertex_id,
-                                                   ctx->i32_1, "");
+                                                   ctx->ac.i32_1, "");
                /* Use LLVMIntNE, because we have 3 vertices and only
                 * the middle one should use y2.
                 */
                LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
                                                    LLVMIntNE, vertex_id,
-                                                   ctx->i32_1, "");
+                                                   ctx->ac.i32_1, "");
 
                unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
                if (input_index == 0) {
@@ -233,8 +233,8 @@ void si_llvm_load_input_vs(
                        LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
                                                         y1, y2, "");
 
-                       out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
-                       out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
+                       out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
+                       out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
                        out[2] = LLVMGetParam(ctx->main_fn,
                                              param_vs_blit_inputs + 2);
                        out[3] = ctx->ac.f32_1;
@@ -284,7 +284,7 @@ void si_llvm_load_input_vs(
                unsigned index= input_index - num_vbos_in_user_sgprs;
                vb_desc = ac_build_load_to_sgpr(&ctx->ac,
                                                ac_get_arg(&ctx->ac, ctx->vertex_buffers),
-                                               LLVMConstInt(ctx->i32, index, 0));
+                                               LLVMConstInt(ctx->ac.i32, index, 0));
        }
 
        vertex_index = LLVMGetParam(ctx->main_fn,
@@ -308,7 +308,7 @@ void si_llvm_load_input_vs(
                                fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
                                vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
                for (unsigned i = 0; i < 4; ++i)
-                       out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), "");
+                       out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
                return;
        }
 
@@ -330,7 +330,7 @@ void si_llvm_load_input_vs(
        }
 
        for (unsigned i = 0; i < num_fetches; ++i) {
-               LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
+               LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
                fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
                                                         channels_per_fetch, 0, true);
        }
@@ -338,7 +338,7 @@ void si_llvm_load_input_vs(
        if (num_fetches == 1 && channels_per_fetch > 1) {
                LLVMValueRef fetch = fetches[0];
                for (unsigned i = 0; i < channels_per_fetch; ++i) {
-                       tmp = LLVMConstInt(ctx->i32, i, false);
+                       tmp = LLVMConstInt(ctx->ac.i32, i, false);
                        fetches[i] = LLVMBuildExtractElement(
                                ctx->ac.builder, fetch, tmp, "");
                }
@@ -347,7 +347,7 @@ void si_llvm_load_input_vs(
        }
 
        for (unsigned i = num_fetches; i < 4; ++i)
-               fetches[i] = LLVMGetUndef(ctx->f32);
+               fetches[i] = LLVMGetUndef(ctx->ac.f32);
 
        if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
            required_channels == 4) {
@@ -364,11 +364,11 @@ void si_llvm_load_input_vs(
                 * convert it to a signed one.
                 */
                LLVMValueRef tmp = fetches[3];
-               LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
+               LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
 
                /* First, recover the sign-extended signed integer value. */
                if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
-                       tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
+                       tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
                else
                        tmp = ac_to_integer(&ctx->ac, tmp);
 
@@ -380,18 +380,18 @@ void si_llvm_load_input_vs(
                 */
                tmp = LLVMBuildShl(ctx->ac.builder, tmp,
                                   fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
-                                  LLVMConstInt(ctx->i32, 7, 0) : c30, "");
+                                  LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
                tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
 
                /* Convert back to the right type. */
                if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
                        LLVMValueRef clamp;
-                       LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
-                       tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
+                       LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
+                       tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
                        clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
                        tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
                } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
-                       tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
+                       tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
                }
 
                fetches[3] = tmp;
@@ -405,7 +405,7 @@ LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
                                 unsigned swizzle)
 {
        if (swizzle > 0)
-               return ctx->i32_0;
+               return ctx->ac.i32_0;
 
        switch (ctx->type) {
        case PIPE_SHADER_VERTEX:
@@ -418,7 +418,7 @@ LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
                return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
        default:
                assert(0);
-               return ctx->i32_0;
+               return ctx->ac.i32_0;
        }
 }
 
@@ -434,12 +434,12 @@ static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
                                           ctx->vs_state_bits);
        LLVMValueRef indexed;
 
-       indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, "");
-       indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, "");
+       indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
+       indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
 
        return LLVMBuildSelect(ctx->ac.builder, indexed,
                               ac_get_arg(&ctx->ac, ctx->args.base_vertex),
-                              ctx->i32_0, "");
+                              ctx->ac.i32_0, "");
 }
 
 static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
@@ -459,7 +459,7 @@ static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
                };
 
                for (i = 0; i < 3; ++i)
-                       values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
+                       values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0);
 
                result = ac_build_gather_values(&ctx->ac, values, 3);
        } else {
@@ -474,13 +474,13 @@ void si_declare_compute_memory(struct si_shader_context *ctx)
        struct si_shader_selector *sel = ctx->shader->selector;
        unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
 
-       LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS);
+       LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
        LLVMValueRef var;
 
        assert(!ctx->ac.lds);
 
        var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
-                                         LLVMArrayType(ctx->i8, lds_size),
+                                         LLVMArrayType(ctx->ac.i8, lds_size),
                                          "compute_lds",
                                          AC_ADDR_SPACE_LDS);
        LLVMSetAlignment(var, 64 * 1024);
@@ -511,7 +511,7 @@ static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
        unsigned const_chan;
        LLVMValueRef base_elt;
        LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-       LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
+       LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32,
                                                   SI_VS_CONST_CLIP_PLANES, 0);
        LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
 
@@ -521,13 +521,13 @@ static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
                args->out[0] =
                args->out[1] =
                args->out[2] =
-               args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
+               args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);
 
                /* Compute dot products of position and user clip plane vectors */
                for (chan = 0; chan < 4; chan++) {
                        for (const_chan = 0; const_chan < 4; const_chan++) {
                                LLVMValueRef addr =
-                                       LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
+                                       LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 +
                                                                const_chan) * 4, 0);
                                base_elt = si_buffer_load_const(ctx, const_resource,
                                                                addr);
@@ -601,7 +601,7 @@ void si_emit_streamout_output(struct si_shader_context *ctx,
                        break;
                }
                /* as v4i32 (aligned to 4) */
-               out[3] = LLVMGetUndef(ctx->i32);
+               out[3] = LLVMGetUndef(ctx->ac.i32);
                /* fall through */
        case 4: /* as v4i32 */
                vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
@@ -611,7 +611,7 @@ void si_emit_streamout_output(struct si_shader_context *ctx,
        ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
                                    vdata, num_comps,
                                    so_write_offsets[buf_idx],
-                                   ctx->i32_0,
+                                   ctx->ac.i32_0,
                                    stream_out->dst_offset * 4, ac_glc | ac_slc);
 }
 
@@ -667,17 +667,17 @@ void si_llvm_emit_streamout(struct si_shader_context *ctx,
                        if (!so->stride[i])
                                continue;
 
-                       LLVMValueRef offset = LLVMConstInt(ctx->i32,
+                       LLVMValueRef offset = LLVMConstInt(ctx->ac.i32,
                                                           SI_VS_STREAMOUT_BUF0 + i, 0);
 
                        so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
 
                        LLVMValueRef so_offset = ac_get_arg(&ctx->ac,
                                                            ctx->streamout_offset[i]);
-                       so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
+                       so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
 
                        so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
-                                                          LLVMConstInt(ctx->i32, so->stride[i]*4, 0),
+                                                          LLVMConstInt(ctx->ac.i32, so->stride[i]*4, 0),
                                                           so_offset);
                }
 
@@ -777,7 +777,7 @@ static void si_vertex_color_clamping(struct si_shader_context *ctx,
                        continue;
 
                for (unsigned j = 0; j < 4; j++) {
-                       addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->f32, "");
+                       addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
                        LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
                }
                has_colors = true;
@@ -788,7 +788,7 @@ static void si_vertex_color_clamping(struct si_shader_context *ctx,
 
        /* The state is in the first bit of the user SGPR. */
        LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
-       cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->i1, "");
+       cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
 
        ac_build_ifcc(&ctx->ac, cond, 6502);
 
@@ -912,10 +912,10 @@ void si_llvm_export_vs(struct si_shader_context *ctx,
                         * with the first bit containing the edge flag. */
                        edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
                                                         edgeflag_value,
-                                                        ctx->i32, "");
+                                                        ctx->ac.i32, "");
                        edgeflag_value = ac_build_umin(&ctx->ac,
                                                      edgeflag_value,
-                                                     ctx->i32_1);
+                                                     ctx->ac.i32_1);
 
                        /* The LLVM intrinsic expects a float. */
                        pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
@@ -933,7 +933,7 @@ void si_llvm_export_vs(struct si_shader_context *ctx,
 
                                v = ac_to_integer(&ctx->ac, v);
                                v = LLVMBuildShl(ctx->ac.builder, v,
-                                                LLVMConstInt(ctx->i32, 16, 0), "");
+                                                LLVMConstInt(ctx->ac.i32, 16, 0), "");
                                v = LLVMBuildOr(ctx->ac.builder, v,
                                                ac_to_integer(&ctx->ac,  pos_args[1].out[2]), "");
                                pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
@@ -1019,7 +1019,7 @@ static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
                outputs[i].semantic_index = 0;
                outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
                for (j = 1; j < 4; j++)
-                       outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
+                       outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
 
                memset(outputs[i].vertex_stream, 0,
                       sizeof(outputs[i].vertex_stream));
@@ -1192,7 +1192,8 @@ static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
 }
 
 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
-                                  unsigned *num_prolog_vgprs)
+                                  unsigned *num_prolog_vgprs,
+                                  bool ngg_cull_shader)
 {
        struct si_shader *shader = ctx->shader;
 
@@ -1218,6 +1219,11 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx,
        }
 
        if (!shader->is_gs_copy_shader) {
+               if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
+                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
+                                  &ctx->ngg_old_thread_id);
+               }
+
                /* Vertex load indices. */
                if (shader->selector->info.num_inputs) {
                        ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
@@ -1252,12 +1258,17 @@ static void declare_vs_blit_inputs(struct si_shader_context *ctx,
        }
 }
 
-static void declare_tes_input_vgprs(struct si_shader_context *ctx)
+static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
 {
        ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
        ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
        ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
        ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
+
+       if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
+               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
+                          &ctx->ngg_old_thread_id);
+       }
 }
 
 enum {
@@ -1276,7 +1287,7 @@ void si_add_arg_checked(struct ac_shader_args *args,
        ac_add_arg(args, file, registers, type, arg);
 }
 
-void si_create_function(struct si_shader_context *ctx)
+void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
 {
        struct si_shader *shader = ctx->shader;
        LLVMTypeRef returns[AC_MAX_ARGS];
@@ -1305,7 +1316,7 @@ void si_create_function(struct si_shader_context *ctx)
                        declare_vs_blit_inputs(ctx, vs_blit_property);
 
                        /* VGPRs */
-                       declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+                       declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
                        break;
                }
 
@@ -1325,12 +1336,12 @@ void si_create_function(struct si_shader_context *ctx)
                }
 
                /* VGPRs */
-               declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+               declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
 
                /* Return values */
                if (shader->key.opt.vs_as_prim_discard_cs) {
                        for (i = 0; i < 4; i++)
-                               returns[num_returns++] = ctx->f32; /* VGPRs */
+                               returns[num_returns++] = ctx->ac.f32; /* VGPRs */
                }
                break;
 
@@ -1352,9 +1363,9 @@ void si_create_function(struct si_shader_context *ctx)
                 * placed after the user SGPRs.
                 */
                for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
-                       returns[num_returns++] = ctx->i32; /* SGPRs */
+                       returns[num_returns++] = ctx->ac.i32; /* SGPRs */
                for (i = 0; i < 11; i++)
-                       returns[num_returns++] = ctx->f32; /* VGPRs */
+                       returns[num_returns++] = ctx->ac.f32; /* VGPRs */
                break;
 
        case SI_SHADER_MERGED_VERTEX_TESSCTRL:
@@ -1384,13 +1395,13 @@ void si_create_function(struct si_shader_context *ctx)
                ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
 
                if (ctx->type == PIPE_SHADER_VERTEX) {
-                       declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+                       declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
 
                        /* LS return values are inputs to the TCS main shader part. */
                        for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
-                               returns[num_returns++] = ctx->i32; /* SGPRs */
+                               returns[num_returns++] = ctx->ac.i32; /* SGPRs */
                        for (i = 0; i < 2; i++)
-                               returns[num_returns++] = ctx->f32; /* VGPRs */
+                               returns[num_returns++] = ctx->ac.f32; /* VGPRs */
                } else {
                        /* TCS return values are inputs to the TCS epilog.
                         *
@@ -1399,9 +1410,9 @@ void si_create_function(struct si_shader_context *ctx)
                         * should be passed to the epilog.
                         */
                        for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
-                               returns[num_returns++] = ctx->i32; /* SGPRs */
+                               returns[num_returns++] = ctx->ac.i32; /* SGPRs */
                        for (i = 0; i < 11; i++)
-                               returns[num_returns++] = ctx->f32; /* VGPRs */
+                               returns[num_returns++] = ctx->ac.f32; /* VGPRs */
                }
                break;
 
@@ -1419,7 +1430,8 @@ void si_create_function(struct si_shader_context *ctx)
                ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
                ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
                ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
+               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
+                          &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
                ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
 
                declare_global_desc_pointers(ctx);
@@ -1452,26 +1464,43 @@ void si_create_function(struct si_shader_context *ctx)
                ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
 
                if (ctx->type == PIPE_SHADER_VERTEX) {
-                       declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+                       declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
                } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
-                       declare_tes_input_vgprs(ctx);
+                       declare_tes_input_vgprs(ctx, ngg_cull_shader);
                }
 
-               if (ctx->shader->key.as_es &&
+               if ((ctx->shader->key.as_es || ngg_cull_shader) &&
                    (ctx->type == PIPE_SHADER_VERTEX ||
                     ctx->type == PIPE_SHADER_TESS_EVAL)) {
-                       unsigned num_user_sgprs;
+                       unsigned num_user_sgprs, num_vgprs;
 
-                       if (ctx->type == PIPE_SHADER_VERTEX)
-                               num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
-                       else
+                       if (ctx->type == PIPE_SHADER_VERTEX) {
+                               /* For the NGG cull shader, add 1 SGPR to hold
+                                * the vertex buffer pointer.
+                                */
+                               num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
+
+                               if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
+                                       assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
+                                       num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
+                                                        shader->selector->num_vbos_in_user_sgprs * 4;
+                               }
+                       } else {
                                num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+                       }
+
+                       /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
+                        *
+                        * The normal merged ESGS shader only has to return the 5 VGPRs
+                        * for the GS stage.
+                        */
+                       num_vgprs = ngg_cull_shader ? 10 : 5;
 
                        /* ES return values are inputs to GS. */
                        for (i = 0; i < 8 + num_user_sgprs; i++)
-                               returns[num_returns++] = ctx->i32; /* SGPRs */
-                       for (i = 0; i < 5; i++)
-                               returns[num_returns++] = ctx->f32; /* VGPRs */
+                               returns[num_returns++] = ctx->ac.i32; /* SGPRs */
+                       for (i = 0; i < num_vgprs; i++)
+                               returns[num_returns++] = ctx->ac.f32; /* VGPRs */
                }
                break;
 
@@ -1492,7 +1521,7 @@ void si_create_function(struct si_shader_context *ctx)
                }
 
                /* VGPRs */
-               declare_tes_input_vgprs(ctx);
+               declare_tes_input_vgprs(ctx, ngg_cull_shader);
                break;
 
        case PIPE_SHADER_GEOMETRY:
@@ -1581,9 +1610,9 @@ void si_create_function(struct si_shader_context *ctx)
                                   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
 
                for (i = 0; i < num_return_sgprs; i++)
-                       returns[i] = ctx->i32;
+                       returns[i] = ctx->ac.i32;
                for (; i < num_returns; i++)
-                       returns[i] = ctx->f32;
+                       returns[i] = ctx->ac.f32;
                break;
 
        case PIPE_SHADER_COMPUTE:
@@ -1622,8 +1651,8 @@ void si_create_function(struct si_shader_context *ctx)
                return;
        }
 
-       si_llvm_create_func(ctx, "main", returns, num_returns,
-                           si_get_max_workgroup_size(shader));
+       si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main",
+                           returns, num_returns, si_get_max_workgroup_size(shader));
 
        /* Reserve register locations for VGPR inputs the PS prolog may need. */
        if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
@@ -1654,7 +1683,7 @@ void si_create_function(struct si_shader_context *ctx)
                         * own LDS-based lowering).
                         */
                        ctx->ac.lds = LLVMAddGlobalInAddressSpace(
-                               ctx->ac.module, LLVMArrayType(ctx->i32, 0),
+                               ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
                                "__lds_end", AC_ADDR_SPACE_LDS);
                        LLVMSetAlignment(ctx->ac.lds, 256);
                } else {
@@ -2222,6 +2251,8 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
            !key->as_es && !key->as_ls) {
                fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
                fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
+               if (shader_type != PIPE_SHADER_GEOMETRY)
+                       fprintf(f, "  opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
        }
 }
 
@@ -2248,25 +2279,29 @@ static void si_init_exec_from_input(struct si_shader_context *ctx,
 {
        LLVMValueRef args[] = {
                ac_get_arg(&ctx->ac, param),
-               LLVMConstInt(ctx->i32, bitoffset, 0),
+               LLVMConstInt(ctx->ac.i32, bitoffset, 0),
        };
        ac_build_intrinsic(&ctx->ac,
                           "llvm.amdgcn.init.exec.from.input",
-                          ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
+                          ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
 }
 
 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
-                              const struct si_vs_prolog_bits *key)
+                              const struct si_vs_prolog_bits *prolog_key,
+                              const struct si_shader_key *key,
+                              bool ngg_cull_shader)
 {
        /* VGPR initialization fixup for Vega10 and Raven is always done in the
         * VS prolog. */
        return sel->vs_needs_prolog ||
-              key->ls_vgpr_fix ||
-              key->unpack_instance_id_from_vertex_id;
+              prolog_key->ls_vgpr_fix ||
+              prolog_key->unpack_instance_id_from_vertex_id ||
+              (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
 }
 
 static bool si_build_main_function(struct si_shader_context *ctx,
-                                  struct nir_shader *nir, bool free_nir)
+                                  struct nir_shader *nir, bool free_nir,
+                                  bool ngg_cull_shader)
 {
        struct si_shader *shader = ctx->shader;
        struct si_shader_selector *sel = shader->selector;
@@ -2281,6 +2316,8 @@ static bool si_build_main_function(struct si_shader_context *ctx,
                        ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
                else if (shader->key.opt.vs_as_prim_discard_cs)
                        ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
+               else if (ngg_cull_shader)
+                       ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
                else if (shader->key.as_ngg)
                        ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
                else
@@ -2295,6 +2332,8 @@ static bool si_build_main_function(struct si_shader_context *ctx,
 
                if (shader->key.as_es)
                        ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
+               else if (ngg_cull_shader)
+                       ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
                else if (shader->key.as_ngg)
                        ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
                else
@@ -2314,7 +2353,7 @@ static bool si_build_main_function(struct si_shader_context *ctx,
                return false;
        }
 
-       si_create_function(ctx);
+       si_create_function(ctx, ngg_cull_shader);
 
        if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
                si_preload_esgs_ring(ctx);
@@ -2328,14 +2367,14 @@ static bool si_build_main_function(struct si_shader_context *ctx,
            sel->info.tessfactors_are_def_in_all_invocs) {
                for (unsigned i = 0; i < 6; i++) {
                        ctx->invoc0_tess_factors[i] =
-                               ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
+                               ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
                }
        }
 
        if (ctx->type == PIPE_SHADER_GEOMETRY) {
                for (unsigned i = 0; i < 4; i++) {
                        ctx->gs_next_vertex[i] =
-                               ac_build_alloca(&ctx->ac, ctx->i32, "");
+                               ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
                }
                if (shader->key.as_ngg) {
                        for (unsigned i = 0; i < 4; ++i) {
@@ -2349,14 +2388,15 @@ static bool si_build_main_function(struct si_shader_context *ctx,
                        if (sel->so.num_outputs)
                                scratch_size = 44;
 
-                       LLVMTypeRef ai32 = LLVMArrayType(ctx->i32, scratch_size);
+                       assert(!ctx->gs_ngg_scratch);
+                       LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size);
                        ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
                                ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
                        LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
                        LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
 
                        ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
-                               LLVMArrayType(ctx->i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
+                               LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
                        LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
                        LLVMSetAlignment(ctx->gs_ngg_emit, 4);
                }
@@ -2377,8 +2417,9 @@ static bool si_build_main_function(struct si_shader_context *ctx,
                /* This is really only needed when streamout and / or vertex
                 * compaction is enabled.
                 */
-               if (sel->so.num_outputs && !ctx->gs_ngg_scratch) {
-                       LLVMTypeRef asi32 = LLVMArrayType(ctx->i32, 8);
+               if (!ctx->gs_ngg_scratch &&
+                   (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
+                       LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8);
                        ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
                                asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
                        LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
@@ -2407,7 +2448,8 @@ static bool si_build_main_function(struct si_shader_context *ctx,
                    (shader->key.as_es || shader->key.as_ls) &&
                    (ctx->type == PIPE_SHADER_TESS_EVAL ||
                     (ctx->type == PIPE_SHADER_VERTEX &&
-                     !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
+                     !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+                                         &shader->key, ngg_cull_shader)))) {
                        si_init_exec_from_input(ctx,
                                                ctx->merged_wave_info, 0);
                } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
@@ -2418,19 +2460,21 @@ static bool si_build_main_function(struct si_shader_context *ctx,
 
                        if (!shader->is_monolithic ||
                            (ctx->type == PIPE_SHADER_TESS_EVAL &&
-                            (shader->key.as_ngg && !shader->key.as_es)))
+                            shader->key.as_ngg && !shader->key.as_es &&
+                            !shader->key.opt.ngg_culling))
                                ac_init_exec_full_mask(&ctx->ac);
 
                        if ((ctx->type == PIPE_SHADER_VERTEX ||
                             ctx->type == PIPE_SHADER_TESS_EVAL) &&
-                           shader->key.as_ngg && !shader->key.as_es) {
+                           shader->key.as_ngg && !shader->key.as_es &&
+                           !shader->key.opt.ngg_culling) {
                                gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
 
                                /* Build the primitive export at the beginning
                                 * of the shader if possible.
                                 */
                                if (gfx10_ngg_export_prim_early(shader))
-                                       gfx10_ngg_build_export_prim(ctx, NULL);
+                                       gfx10_ngg_build_export_prim(ctx, NULL, NULL);
                        }
 
                        if (ctx->type == PIPE_SHADER_TESS_CTRL ||
@@ -2476,9 +2520,9 @@ static bool si_build_main_function(struct si_shader_context *ctx,
        }
 
        if (sel->force_correct_derivs_after_kill) {
-               ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
+               ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, "");
                /* true = don't kill. */
-               LLVMBuildStore(ctx->ac.builder, ctx->i1true,
+               LLVMBuildStore(ctx->ac.builder, ctx->ac.i1true,
                               ctx->postponed_kill);
        }
 
@@ -2500,12 +2544,14 @@ static bool si_build_main_function(struct si_shader_context *ctx,
  *
  * \param info             Shader info of the vertex shader.
  * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
+ * \param has_old_  Whether the preceding shader part is the NGG cull shader.
  * \param prolog_key       Key of the VS prolog
  * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
  * \param key              Output shader part key.
  */
 static void si_get_vs_prolog_key(const struct si_shader_info *info,
                                 unsigned num_input_sgprs,
+                                bool ngg_cull_shader,
                                 const struct si_vs_prolog_bits *prolog_key,
                                 struct si_shader *shader_out,
                                 union si_shader_part_key *key)
@@ -2518,6 +2564,15 @@ static void si_get_vs_prolog_key(const struct si_shader_info *info,
        key->vs_prolog.as_es = shader_out->key.as_es;
        key->vs_prolog.as_ngg = shader_out->key.as_ngg;
 
+       if (ngg_cull_shader) {
+               key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling &
+                                                           SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
+               key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling &
+                                                            SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
+       } else {
+               key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
+       }
+
        if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
                key->vs_prolog.as_ls = 1;
                key->vs_prolog.num_merged_next_stage_vgprs = 2;
@@ -2608,7 +2663,7 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
                                        arg_type = AC_ARG_CONST_IMAGE_PTR;
                                else
                                        assert(0);
-                       } else if (type == ctx->f32) {
+                       } else if (type == ctx->ac.f32) {
                                arg_type = AC_ARG_CONST_FLOAT_PTR;
                        } else {
                                assert(0);
@@ -2659,13 +2714,13 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
        for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
                LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
                LLVMTypeRef param_type = LLVMTypeOf(param);
-               LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->i32 : ctx->f32;
+               LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32;
                unsigned size = ac_get_type_size(param_type) / 4;
 
                if (size == 1) {
                        if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
-                               param = LLVMBuildPtrToInt(builder, param, ctx->i32, "");
-                               param_type = ctx->i32;
+                               param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, "");
+                               param_type = ctx->ac.i32;
                        }
 
                        if (param_type != out_type)
@@ -2675,8 +2730,8 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
                        LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
 
                        if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
-                               param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
-                               param_type = ctx->i64;
+                               param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, "");
+                               param_type = ctx->ac.i64;
                        }
 
                        if (param_type != vector_type)
@@ -2684,7 +2739,7 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
 
                        for (unsigned j = 0; j < size; ++j)
                                out[num_out++] = LLVMBuildExtractElement(
-                                       builder, param, LLVMConstInt(ctx->i32, j, 0), "");
+                                       builder, param, LLVMConstInt(ctx->ac.i32, j, 0), "");
                }
 
                if (ctx->args.args[i].file == AC_ARG_SGPR)
@@ -2709,7 +2764,7 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
                        LLVMValueRef ena, count = initial[3];
 
                        count = LLVMBuildAnd(builder, count,
-                                            LLVMConstInt(ctx->i32, 0x7f, 0), "");
+                                            LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
                        ena = LLVMBuildICmp(builder, LLVMIntULT,
                                            ac_get_thread_id(&ctx->ac), count, "");
                        ac_build_ifcc(&ctx->ac, ena, 6506);
@@ -2750,10 +2805,10 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
                                if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
                                        if (LLVMGetPointerAddressSpace(param_type) ==
                                            AC_ADDR_SPACE_CONST_32BIT) {
-                                               arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
+                                               arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, "");
                                                arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
                                        } else {
-                                               arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
+                                               arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, "");
                                                arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
                                        }
                                } else {
@@ -2802,7 +2857,7 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
                                assert(num_out < ARRAY_SIZE(out));
                                out[num_out++] = val;
 
-                               if (LLVMTypeOf(val) == ctx->i32) {
+                               if (LLVMTypeOf(val) == ctx->ac.i32) {
                                        assert(num_out_sgpr + 1 == num_out);
                                        num_out_sgpr = num_out;
                                }
@@ -2854,6 +2909,24 @@ static struct nir_shader *get_nir_shader(struct si_shader_selector *sel,
        return NULL;
 }
 
+/* Set the context to a certain shader. Can be called repeatedly
+ * to change the shader. */
+static void si_shader_context_set_ir(struct si_shader_context *ctx,
+                                    struct si_shader *shader)
+{
+       struct si_shader_selector *sel = shader->selector;
+       const struct si_shader_info *info = &sel->info;
+
+       ctx->shader = shader;
+       ctx->type = sel->type;
+
+       ctx->num_const_buffers = util_last_bit(info->const_buffers_declared);
+       ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared);
+
+       ctx->num_samplers = util_last_bit(info->samplers_declared);
+       ctx->num_images = util_last_bit(info->images_declared);
+}
+
 int si_compile_shader(struct si_screen *sscreen,
                      struct ac_llvm_compiler *compiler,
                      struct si_shader *shader,
@@ -2874,46 +2947,88 @@ int si_compile_shader(struct si_screen *sscreen,
        }
 
        si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
-       si_llvm_context_set_ir(&ctx, shader);
+       si_shader_context_set_ir(&ctx, shader);
 
        memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
               sizeof(shader->info.vs_output_param_offset));
 
        shader->info.uses_instanceid = sel->info.uses_instanceid;
 
-       if (!si_build_main_function(&ctx, nir, free_nir)) {
+       LLVMValueRef ngg_cull_main_fn = NULL;
+       if (ctx.shader->key.opt.ngg_culling) {
+               if (!si_build_main_function(&ctx, nir, false, true)) {
+                       si_llvm_dispose(&ctx);
+                       return -1;
+               }
+               ngg_cull_main_fn = ctx.main_fn;
+               ctx.main_fn = NULL;
+               /* Re-set the IR. */
+               si_shader_context_set_ir(&ctx, shader);
+       }
+
+       if (!si_build_main_function(&ctx, nir, free_nir, false)) {
                si_llvm_dispose(&ctx);
                return -1;
        }
 
        if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
-               LLVMValueRef parts[2];
-               bool need_prolog = si_vs_needs_prolog(sel, &shader->key.part.vs.prolog);
-
-               parts[1] = ctx.main_fn;
+               LLVMValueRef parts[4];
+               unsigned num_parts = 0;
+               bool has_prolog = false;
+               LLVMValueRef main_fn = ctx.main_fn;
+
+               if (ngg_cull_main_fn) {
+                       if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+                                              &shader->key, true)) {
+                               union si_shader_part_key prolog_key;
+                               si_get_vs_prolog_key(&sel->info,
+                                                    shader->info.num_input_sgprs,
+                                                    true,
+                                                    &shader->key.part.vs.prolog,
+                                                    shader, &prolog_key);
+                               prolog_key.vs_prolog.is_monolithic = true;
+                               si_build_vs_prolog_function(&ctx, &prolog_key);
+                               parts[num_parts++] = ctx.main_fn;
+                               has_prolog = true;
+                       }
+                       parts[num_parts++] = ngg_cull_main_fn;
+               }
 
-               if (need_prolog) {
+               if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+                                      &shader->key, false)) {
                        union si_shader_part_key prolog_key;
                        si_get_vs_prolog_key(&sel->info,
                                             shader->info.num_input_sgprs,
+                                            false,
                                             &shader->key.part.vs.prolog,
                                             shader, &prolog_key);
                        prolog_key.vs_prolog.is_monolithic = true;
                        si_build_vs_prolog_function(&ctx, &prolog_key);
-                       parts[0] = ctx.main_fn;
+                       parts[num_parts++] = ctx.main_fn;
+                       has_prolog = true;
                }
+               parts[num_parts++] = main_fn;
 
-               si_build_wrapper_function(&ctx, parts + !need_prolog,
-                                         1 + need_prolog, need_prolog, 0);
+               si_build_wrapper_function(&ctx, parts, num_parts,
+                                         has_prolog ? 1 : 0, 0);
 
                if (ctx.shader->key.opt.vs_as_prim_discard_cs)
                        si_build_prim_discard_compute_shader(&ctx);
+       } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
+                  ngg_cull_main_fn) {
+               LLVMValueRef parts[2];
+
+               parts[0] = ngg_cull_main_fn;
+               parts[1] = ctx.main_fn;
+
+               si_build_wrapper_function(&ctx, parts, 2, 0, 0);
        } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
                if (sscreen->info.chip_class >= GFX9) {
                        struct si_shader_selector *ls = shader->key.part.tcs.ls;
                        LLVMValueRef parts[4];
                        bool vs_needs_prolog =
-                               si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
+                               si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog,
+                                                  &shader->key, false);
 
                        /* TCS main part */
                        parts[2] = ctx.main_fn;
@@ -2933,9 +3048,9 @@ int si_compile_shader(struct si_screen *sscreen,
                        shader_ls.key.mono = shader->key.mono;
                        shader_ls.key.opt = shader->key.opt;
                        shader_ls.is_monolithic = true;
-                       si_llvm_context_set_ir(&ctx, &shader_ls);
+                       si_shader_context_set_ir(&ctx, &shader_ls);
 
-                       if (!si_build_main_function(&ctx, nir, free_nir)) {
+                       if (!si_build_main_function(&ctx, nir, free_nir, false)) {
                                si_llvm_dispose(&ctx);
                                return -1;
                        }
@@ -2947,6 +3062,7 @@ int si_compile_shader(struct si_screen *sscreen,
                                union si_shader_part_key vs_prolog_key;
                                si_get_vs_prolog_key(&ls->info,
                                                     shader_ls.info.num_input_sgprs,
+                                                    false,
                                                     &shader->key.part.tcs.ls_prolog,
                                                     shader, &vs_prolog_key);
                                vs_prolog_key.vs_prolog.is_monolithic = true;
@@ -3001,9 +3117,9 @@ int si_compile_shader(struct si_screen *sscreen,
                        shader_es.key.mono = shader->key.mono;
                        shader_es.key.opt = shader->key.opt;
                        shader_es.is_monolithic = true;
-                       si_llvm_context_set_ir(&ctx, &shader_es);
+                       si_shader_context_set_ir(&ctx, &shader_es);
 
-                       if (!si_build_main_function(&ctx, nir, free_nir)) {
+                       if (!si_build_main_function(&ctx, nir, free_nir, false)) {
                                si_llvm_dispose(&ctx);
                                return -1;
                        }
@@ -3012,10 +3128,12 @@ int si_compile_shader(struct si_screen *sscreen,
 
                        /* ES prolog */
                        if (es->type == PIPE_SHADER_VERTEX &&
-                           si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog)) {
+                           si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog,
+                                              &shader->key, false)) {
                                union si_shader_part_key vs_prolog_key;
                                si_get_vs_prolog_key(&es->info,
                                                     shader_es.info.num_input_sgprs,
+                                                    false,
                                                     &shader->key.part.gs.vs_prolog,
                                                     shader, &vs_prolog_key);
                                vs_prolog_key.vs_prolog.is_monolithic = true;
@@ -3249,10 +3367,11 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
        LLVMValueRef ret, func;
        int num_returns, i;
        unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
-       unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
+       unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4 +
+                                  (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0);
        struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
-       struct ac_arg input_vgpr_param[9];
-       LLVMValueRef input_vgprs[9];
+       struct ac_arg input_vgpr_param[13];
+       LLVMValueRef input_vgprs[13];
        unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
                                      num_input_vgprs;
        unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
@@ -3268,7 +3387,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
        for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
                ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
                           &input_sgpr_param[i]);
-               returns[num_returns++] = ctx->i32;
+               returns[num_returns++] = ctx->ac.i32;
        }
 
        struct ac_arg merged_wave_info = input_sgpr_param[3];
@@ -3276,12 +3395,12 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
        /* Preloaded VGPRs (outputs must be floats) */
        for (i = 0; i < num_input_vgprs; i++) {
                ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
-               returns[num_returns++] = ctx->f32;
+               returns[num_returns++] = ctx->ac.f32;
        }
 
        /* Vertex load indices. */
        for (i = 0; i < key->vs_prolog.num_inputs; i++)
-               returns[num_returns++] = ctx->f32;
+               returns[num_returns++] = ctx->ac.f32;
 
        /* Create the function. */
        si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
@@ -3304,7 +3423,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                        LLVMValueRef has_hs_threads =
                                LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
                                    si_unpack_param(ctx, input_sgpr_param[3], 8, 8),
-                                   ctx->i32_0, "");
+                                   ctx->ac.i32_0, "");
 
                        for (i = 4; i > 0; --i) {
                                input_vgprs[i + 1] =
@@ -3315,6 +3434,72 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                }
        }
 
+       if (key->vs_prolog.gs_fast_launch_tri_list ||
+           key->vs_prolog.gs_fast_launch_tri_strip) {
+               LLVMValueRef wave_id, thread_id_in_tg;
+
+               wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
+               thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
+                                               LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
+                                               ac_get_thread_id(&ctx->ac));
+
+               /* The GS fast launch initializes all VGPRs to the value of
+                * the first thread, so we have to add the thread ID.
+                *
+                * Only these are initialized by the hw:
+                *   VGPR2: Base Primitive ID
+                *   VGPR5: Base Vertex ID
+                *   VGPR6: Instance ID
+                */
+
+               /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
+                * The NGG cull shader will read them from there.
+                */
+               if (key->vs_prolog.gs_fast_launch_tri_list) {
+                       input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
+                                                      LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
+                                                      LLVMConstInt(ctx->ac.i32, 0, 0));
+                       input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
+                                                      LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
+                                                      LLVMConstInt(ctx->ac.i32, 1, 0));
+                       input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
+                                                      LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
+                                                      LLVMConstInt(ctx->ac.i32, 2, 0));
+               } else {
+                       assert(key->vs_prolog.gs_fast_launch_tri_strip);
+                       LLVMBuilderRef builder = ctx->ac.builder;
+                       /* Triangle indices: */
+                       LLVMValueRef index[3] = {
+                               thread_id_in_tg,
+                               LLVMBuildAdd(builder, thread_id_in_tg,
+                                            LLVMConstInt(ctx->ac.i32, 1, 0), ""),
+                               LLVMBuildAdd(builder, thread_id_in_tg,
+                                            LLVMConstInt(ctx->ac.i32, 2, 0), ""),
+                       };
+                       LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
+                                                            thread_id_in_tg, ctx->ac.i1, "");
+                       LLVMValueRef flatshade_first =
+                               LLVMBuildICmp(builder, LLVMIntEQ,
+                                             si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
+                                             ctx->ac.i32_0, "");
+
+                       ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
+                                                                   flatshade_first, index);
+                       input_vgprs[0] = index[0];
+                       input_vgprs[1] = index[1];
+                       input_vgprs[4] = index[2];
+               }
+
+               /* Triangles always have all edge flags set initially. */
+               input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);
+
+               input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
+                                             thread_id_in_tg, ""); /* PrimID */
+               input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
+                                             thread_id_in_tg, ""); /* VertexID */
+               input_vgprs[8] = input_vgprs[6]; /* InstanceID */
+       }
+
        unsigned vertex_id_vgpr = first_vs_vgpr;
        unsigned instance_id_vgpr =
                ctx->screen->info.chip_class >= GFX10 ?
@@ -3329,9 +3514,9 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
         */
        if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
                ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id,
-                                                    LLVMConstInt(ctx->i32, 16, 0), "");
+                                                    LLVMConstInt(ctx->ac.i32, 16, 0), "");
                ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
-                                                 LLVMConstInt(ctx->i32, 0xffff, 0), "");
+                                                 LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
        }
 
        /* Copy inputs to outputs. This should be no-op, as the registers match,
@@ -3361,7 +3546,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
        if (key->vs_prolog.states.instance_divisor_is_fetched) {
                LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
                LLVMValueRef buf_index =
-                       LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
+                       LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
                instance_divisor_constbuf =
                        ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
        }
@@ -3381,7 +3566,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                        for (unsigned j = 0; j < 4; j++) {
                                udiv_factors[j] =
                                        si_buffer_load_const(ctx, instance_divisor_constbuf,
-                                                            LLVMConstInt(ctx->i32, i*16 + j*4, 0));
+                                                            LLVMConstInt(ctx->ac.i32, i*16 + j*4, 0));
                                udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
                        }
                        /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
@@ -3422,12 +3607,12 @@ static bool si_get_vs_prolog(struct si_screen *sscreen,
 {
        struct si_shader_selector *vs = main_part->selector;
 
-       if (!si_vs_needs_prolog(vs, key))
+       if (!si_vs_needs_prolog(vs, key, &shader->key, false))
                return true;
 
        /* Get the prolog. */
        union si_shader_part_key prolog_key;
-       si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
+       si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false,
                             key, shader, &prolog_key);
 
        shader->prolog =
@@ -3971,6 +4156,15 @@ bool si_create_shader_variant(struct si_screen *sscreen,
        return true;
 }
 
+void si_shader_binary_clean(struct si_shader_binary *binary)
+{
+       free((void *)binary->elf_buffer);
+       binary->elf_buffer = NULL;
+
+       free(binary->llvm_ir_string);
+       binary->llvm_ir_string = NULL;
+}
+
 void si_shader_destroy(struct si_shader *shader)
 {
        if (shader->scratch_bo)