radeonsi: unify the si_compile_* functions for prologs and epilogs
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index 447293c334c325f804bf426d779fc79d7c092243..58f8c15060dbd451d1a7a2d7ec5ea732cc02b420 100644 (file)
@@ -71,6 +71,14 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
 static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
                               FILE *f);
 
+static void si_build_vs_prolog_function(struct si_shader_context *ctx,
+                                       union si_shader_part_key *key);
+static void si_build_vs_epilog_function(struct si_shader_context *ctx,
+                                       union si_shader_part_key *key);
+static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
+                                        union si_shader_part_key *key);
+static void si_build_ps_prolog_function(struct si_shader_context *ctx,
+                                       union si_shader_part_key *key);
 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
 
@@ -370,8 +378,6 @@ static void declare_input_vs(
        struct gallivm_state *gallivm = base->gallivm;
        struct si_shader_context *ctx =
                si_shader_context(&radeon_bld->soa.bld_base);
-       unsigned divisor =
-               ctx->shader->key.vs.prolog.instance_divisors[input_index];
 
        unsigned chan;
 
@@ -393,24 +399,9 @@ static void declare_input_vs(
        /* Build the attribute offset */
        attribute_offset = lp_build_const_int32(gallivm, 0);
 
-       if (!ctx->no_prolog) {
-               buffer_index = LLVMGetParam(radeon_bld->main_fn,
-                                           ctx->param_vertex_index0 +
-                                           input_index);
-       } else if (divisor) {
-               /* Build index from instance ID, start instance and divisor */
-               ctx->shader->info.uses_instanceid = true;
-               buffer_index = get_instance_index_for_fetch(ctx,
-                                                           SI_PARAM_START_INSTANCE,
-                                                           divisor);
-       } else {
-               /* Load the buffer index for vertices. */
-               LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
-                                                     ctx->param_vertex_id);
-               LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
-                                                       SI_PARAM_BASE_VERTEX);
-               buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
-       }
+       buffer_index = LLVMGetParam(radeon_bld->main_fn,
+                                   ctx->param_vertex_index0 +
+                                   input_index);
 
        args[0] = t_list;
        args[1] = attribute_offset;
@@ -1175,45 +1166,6 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location)
        }
 }
 
-/* This shouldn't be used by explicit INTERP opcodes. */
-static unsigned select_interp_param(struct si_shader_context *ctx,
-                                   unsigned param)
-{
-       if (!ctx->no_prolog)
-               return param;
-
-       if (ctx->shader->key.ps.prolog.force_persp_sample_interp) {
-               switch (param) {
-               case SI_PARAM_PERSP_CENTROID:
-               case SI_PARAM_PERSP_CENTER:
-                       return SI_PARAM_PERSP_SAMPLE;
-               }
-       }
-       if (ctx->shader->key.ps.prolog.force_linear_sample_interp) {
-               switch (param) {
-               case SI_PARAM_LINEAR_CENTROID:
-               case SI_PARAM_LINEAR_CENTER:
-                       return SI_PARAM_LINEAR_SAMPLE;
-               }
-       }
-       if (ctx->shader->key.ps.prolog.force_persp_center_interp) {
-               switch (param) {
-               case SI_PARAM_PERSP_CENTROID:
-               case SI_PARAM_PERSP_SAMPLE:
-                       return SI_PARAM_PERSP_CENTER;
-               }
-       }
-       if (ctx->shader->key.ps.prolog.force_linear_center_interp) {
-               switch (param) {
-               case SI_PARAM_LINEAR_CENTROID:
-               case SI_PARAM_LINEAR_SAMPLE:
-                       return SI_PARAM_LINEAR_CENTER;
-               }
-       }
-
-       return param;
-}
-
 /**
  * Interpolate a fragment shader input.
  *
@@ -1331,56 +1283,6 @@ static void interp_fs_input(struct si_shader_context *ctx,
        }
 }
 
-/* LLVMGetParam with bc_optimize resolved. */
-static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
-                                    int interp_param_idx)
-{
-       LLVMBuilderRef builder = ctx->gallivm.builder;
-       LLVMValueRef main_fn = ctx->main_fn;
-       LLVMValueRef param = NULL;
-
-       /* Handle PRIM_MASK[31] (bc_optimize). */
-       if (ctx->no_prolog &&
-           ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
-             interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
-            (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
-             interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
-               /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
-                * The hw doesn't compute CENTROID if the whole wave only
-                * contains fully-covered quads.
-                */
-               LLVMValueRef bc_optimize =
-                       LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
-               bc_optimize = LLVMBuildLShr(builder,
-                                           bc_optimize,
-                                           LLVMConstInt(ctx->i32, 31, 0), "");
-               bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");
-
-               if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
-                   interp_param_idx == SI_PARAM_PERSP_CENTROID) {
-                       param = LLVMBuildSelect(builder, bc_optimize,
-                                               LLVMGetParam(main_fn,
-                                                            SI_PARAM_PERSP_CENTER),
-                                               LLVMGetParam(main_fn,
-                                                            SI_PARAM_PERSP_CENTROID),
-                                               "");
-               }
-               if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
-                   interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
-                       param = LLVMBuildSelect(builder, bc_optimize,
-                                               LLVMGetParam(main_fn,
-                                                            SI_PARAM_LINEAR_CENTER),
-                                               LLVMGetParam(main_fn,
-                                                            SI_PARAM_LINEAR_CENTROID),
-                                               "");
-               }
-       }
-
-       if (!param)
-               param = LLVMGetParam(main_fn, interp_param_idx);
-       return param;
-}
-
 static void declare_input_fs(
        struct si_shader_context *radeon_bld,
        unsigned input_index,
@@ -1396,8 +1298,7 @@ static void declare_input_fs(
        int interp_param_idx;
 
        /* Get colors from input VGPRs (set by the prolog). */
-       if (!ctx->no_prolog &&
-           decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
+       if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
                unsigned i = decl->Semantic.Index;
                unsigned colors_read = shader->selector->info.colors_read;
                unsigned mask = colors_read >> (i * 4);
@@ -1416,9 +1317,7 @@ static void declare_input_fs(
        if (interp_param_idx == -1)
                return;
        else if (interp_param_idx) {
-               interp_param_idx = select_interp_param(ctx,
-                                                      interp_param_idx);
-               interp_param = get_interp_param(ctx, interp_param_idx);
+               interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
        }
 
        if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
@@ -2477,6 +2376,10 @@ handle_semantic:
        }
 }
 
+/**
+ * Forward all outputs from the vertex shader to the TES. This is only used
+ * for the fixed function TCS.
+ */
 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
@@ -2631,50 +2534,46 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
        struct si_shader_context *ctx = si_shader_context(bld_base);
        LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
 
+       si_copy_tcs_inputs(bld_base);
+
        rel_patch_id = get_rel_patch_id(ctx);
        invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
        tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
 
-       if (!ctx->no_epilog) {
-               /* Return epilog parameters from this function. */
-               LLVMBuilderRef builder = bld_base->base.gallivm->builder;
-               LLVMValueRef ret = ctx->return_value;
-               LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
-               unsigned vgpr;
-
-               /* RW_BUFFERS pointer */
-               rw_buffers = LLVMGetParam(ctx->main_fn,
-                                         SI_PARAM_RW_BUFFERS);
-               rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
-               rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
-               rw0 = LLVMBuildExtractElement(builder, rw_buffers,
-                                             bld_base->uint_bld.zero, "");
-               rw1 = LLVMBuildExtractElement(builder, rw_buffers,
-                                             bld_base->uint_bld.one, "");
-               ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
-               ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
-
-               /* Tess factor buffer soffset is after user SGPRs. */
-               tf_soffset = LLVMGetParam(ctx->main_fn,
-                                         SI_PARAM_TESS_FACTOR_OFFSET);
-               ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
-                                          SI_TCS_NUM_USER_SGPR + 1, "");
-
-               /* VGPRs */
-               rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
-               invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
-               tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
-
-               vgpr = SI_TCS_NUM_USER_SGPR + 2;
-               ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
-               ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
-               ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
-               ctx->return_value = ret;
-               return;
-       }
+       /* Return epilog parameters from this function. */
+       LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+       LLVMValueRef ret = ctx->return_value;
+       LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
+       unsigned vgpr;
 
-       si_copy_tcs_inputs(bld_base);
-       si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
+       /* RW_BUFFERS pointer */
+       rw_buffers = LLVMGetParam(ctx->main_fn,
+                                 SI_PARAM_RW_BUFFERS);
+       rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
+       rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
+       rw0 = LLVMBuildExtractElement(builder, rw_buffers,
+                                     bld_base->uint_bld.zero, "");
+       rw1 = LLVMBuildExtractElement(builder, rw_buffers,
+                                     bld_base->uint_bld.one, "");
+       ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
+       ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
+
+       /* Tess factor buffer soffset is after user SGPRs. */
+       tf_soffset = LLVMGetParam(ctx->main_fn,
+                                 SI_PARAM_TESS_FACTOR_OFFSET);
+       ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
+                                  SI_TCS_NUM_USER_SGPR + 1, "");
+
+       /* VGPRs */
+       rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
+       invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
+       tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
+
+       vgpr = SI_TCS_NUM_USER_SGPR + 2;
+       ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
+       ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
+       ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+       ctx->return_value = ret;
 }
 
 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
@@ -2820,27 +2719,13 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
                                              "");
        }
 
-       if (ctx->no_epilog) {
-               /* Export PrimitiveID when PS needs it. */
-               if (si_vs_exports_prim_id(ctx->shader)) {
-                       outputs[i].name = TGSI_SEMANTIC_PRIMID;
-                       outputs[i].sid = 0;
-                       outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
-                                                      get_primitive_id(bld_base, 0));
-                       outputs[i].values[1] = bld_base->base.undef;
-                       outputs[i].values[2] = bld_base->base.undef;
-                       outputs[i].values[3] = bld_base->base.undef;
-                       i++;
-               }
-       } else {
-               /* Return the primitive ID from the LLVM function. */
-               ctx->return_value =
-                       LLVMBuildInsertValue(gallivm->builder,
-                                            ctx->return_value,
-                                            bitcast(bld_base, TGSI_TYPE_FLOAT,
-                                                    get_primitive_id(bld_base, 0)),
-                                            VS_EPILOG_PRIMID_LOC, "");
-       }
+       /* Return the primitive ID from the LLVM function. */
+       ctx->return_value =
+               LLVMBuildInsertValue(gallivm->builder,
+                                    ctx->return_value,
+                                    bitcast(bld_base, TGSI_TYPE_FLOAT,
+                                            get_primitive_id(bld_base, 0)),
+                                    VS_EPILOG_PRIMID_LOC, "");
 
        si_llvm_export_vs(bld_base, outputs, i);
        FREE(outputs);
@@ -3038,98 +2923,6 @@ static void si_export_null(struct lp_build_tgsi_context *bld_base)
                           ctx->voidt, args, 9, 0);
 }
 
-static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct si_shader *shader = ctx->shader;
-       struct lp_build_context *base = &bld_base->base;
-       struct tgsi_shader_info *info = &shader->selector->info;
-       LLVMBuilderRef builder = base->gallivm->builder;
-       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
-       int last_color_export = -1;
-       int i;
-       struct si_ps_exports exp = {};
-
-       /* Determine the last export. If MRTZ is present, it's always last.
-        * Otherwise, find the last color export.
-        */
-       if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
-               unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;
-
-               /* Don't export NULL and return if alpha-test is enabled. */
-               if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
-                   shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
-                   (spi_format & 0xf) == 0)
-                       spi_format |= V_028714_SPI_SHADER_32_AR;
-
-               for (i = 0; i < info->num_outputs; i++) {
-                       unsigned index = info->output_semantic_index[i];
-
-                       if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
-                               continue;
-
-                       /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-                       if (shader->key.ps.epilog.last_cbuf > 0) {
-                               /* Just set this if any of the colorbuffers are enabled. */
-                               if (spi_format &
-                                   ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
-                                       last_color_export = i;
-                               continue;
-                       }
-
-                       if ((spi_format >> (index * 4)) & 0xf)
-                               last_color_export = i;
-               }
-
-               /* If there are no outputs, export NULL. */
-               if (last_color_export == -1) {
-                       si_export_null(bld_base);
-                       return;
-               }
-       }
-
-       for (i = 0; i < info->num_outputs; i++) {
-               unsigned semantic_name = info->output_semantic_name[i];
-               unsigned semantic_index = info->output_semantic_index[i];
-               unsigned j;
-               LLVMValueRef color[4] = {};
-
-               /* Select the correct target */
-               switch (semantic_name) {
-               case TGSI_SEMANTIC_POSITION:
-                       depth = LLVMBuildLoad(builder,
-                                             ctx->soa.outputs[i][2], "");
-                       break;
-               case TGSI_SEMANTIC_STENCIL:
-                       stencil = LLVMBuildLoad(builder,
-                                               ctx->soa.outputs[i][1], "");
-                       break;
-               case TGSI_SEMANTIC_SAMPLEMASK:
-                       samplemask = LLVMBuildLoad(builder,
-                                                  ctx->soa.outputs[i][0], "");
-                       break;
-               case TGSI_SEMANTIC_COLOR:
-                       for (j = 0; j < 4; j++)
-                               color[j] = LLVMBuildLoad(builder,
-                                                        ctx->soa.outputs[i][j], "");
-
-                       si_export_mrt_color(bld_base, color, semantic_index,
-                                           SI_PARAM_SAMPLE_COVERAGE,
-                                           last_color_export == i, &exp);
-                       break;
-               default:
-                       fprintf(stderr,
-                               "Warning: SI unhandled fs output type:%d\n",
-                               semantic_name);
-               }
-       }
-
-       if (depth || stencil || samplemask)
-               si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
-
-       si_emit_ps_exports(ctx, &exp);
-}
-
 /**
  * Return PS outputs in this order:
  *
@@ -5159,7 +4952,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
        if (interp_param_idx == -1)
                return;
        else if (interp_param_idx)
-               interp_param = get_interp_param(ctx, interp_param_idx);
+               interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
        else
                interp_param = NULL;
 
@@ -5495,6 +5288,7 @@ static void create_function(struct si_shader_context *ctx)
        LLVMTypeRef returns[16+32*4];
        unsigned i, last_sgpr, num_params, num_return_sgprs;
        unsigned num_returns = 0;
+       unsigned num_prolog_vgprs = 0;
 
        v3i32 = LLVMVectorType(ctx->i32, 3);
 
@@ -5538,17 +5332,17 @@ static void create_function(struct si_shader_context *ctx)
                params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
                params[ctx->param_instance_id = num_params++] = ctx->i32;
 
-               if (!ctx->no_prolog &&
-                   !ctx->is_gs_copy_shader) {
+               if (!ctx->is_gs_copy_shader) {
                        /* Vertex load indices. */
                        ctx->param_vertex_index0 = num_params;
 
                        for (i = 0; i < shader->selector->info.num_inputs; i++)
                                params[num_params++] = ctx->i32;
+
+                       num_prolog_vgprs += shader->selector->info.num_inputs;
                }
 
-               if (!ctx->no_epilog &&
-                   !ctx->is_gs_copy_shader) {
+               if (!ctx->is_gs_copy_shader) {
                        /* PrimitiveID output. */
                        if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
                                for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
@@ -5570,16 +5364,14 @@ static void create_function(struct si_shader_context *ctx)
                params[SI_PARAM_REL_IDS] = ctx->i32;
                num_params = SI_PARAM_REL_IDS+1;
 
-               if (!ctx->no_epilog) {
-                       /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
-                        * placed after the user SGPRs.
-                        */
-                       for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
-                               returns[num_returns++] = ctx->i32; /* SGPRs */
+               /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
+                * placed after the user SGPRs.
+                */
+               for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
+                       returns[num_returns++] = ctx->i32; /* SGPRs */
 
-                       for (i = 0; i < 3; i++)
-                               returns[num_returns++] = ctx->f32; /* VGPRs */
-               }
+               for (i = 0; i < 3; i++)
+                       returns[num_returns++] = ctx->f32; /* VGPRs */
                break;
 
        case PIPE_SHADER_TESS_EVAL:
@@ -5605,7 +5397,7 @@ static void create_function(struct si_shader_context *ctx)
                params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
 
                /* PrimitiveID output. */
-               if (!ctx->no_epilog && !shader->key.tes.as_es)
+               if (!shader->key.tes.as_es)
                        for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
                                returns[num_returns++] = ctx->f32;
                break;
@@ -5644,43 +5436,42 @@ static void create_function(struct si_shader_context *ctx)
                params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
                params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
                params[SI_PARAM_FRONT_FACE] = ctx->i32;
+               shader->info.face_vgpr_index = 20;
                params[SI_PARAM_ANCILLARY] = ctx->i32;
                params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
                params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
                num_params = SI_PARAM_POS_FIXED_PT+1;
 
-               if (!ctx->no_prolog) {
-                       /* Color inputs from the prolog. */
-                       if (shader->selector->info.colors_read) {
-                               unsigned num_color_elements =
-                                       util_bitcount(shader->selector->info.colors_read);
+               /* Color inputs from the prolog. */
+               if (shader->selector->info.colors_read) {
+                       unsigned num_color_elements =
+                               util_bitcount(shader->selector->info.colors_read);
 
-                               assert(num_params + num_color_elements <= ARRAY_SIZE(params));
-                               for (i = 0; i < num_color_elements; i++)
-                                       params[num_params++] = ctx->f32;
-                       }
-               }
+                       assert(num_params + num_color_elements <= ARRAY_SIZE(params));
+                       for (i = 0; i < num_color_elements; i++)
+                               params[num_params++] = ctx->f32;
 
-               if (!ctx->no_epilog) {
-                       /* Outputs for the epilog. */
-                       num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
-                       num_returns =
-                               num_return_sgprs +
-                               util_bitcount(shader->selector->info.colors_written) * 4 +
-                               shader->selector->info.writes_z +
-                               shader->selector->info.writes_stencil +
-                               shader->selector->info.writes_samplemask +
-                               1 /* SampleMaskIn */;
-
-                       num_returns = MAX2(num_returns,
-                                          num_return_sgprs +
-                                          PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
-
-                       for (i = 0; i < num_return_sgprs; i++)
-                               returns[i] = ctx->i32;
-                       for (; i < num_returns; i++)
-                               returns[i] = ctx->f32;
+                       num_prolog_vgprs += num_color_elements;
                }
+
+               /* Outputs for the epilog. */
+               num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
+               num_returns =
+                       num_return_sgprs +
+                       util_bitcount(shader->selector->info.colors_written) * 4 +
+                       shader->selector->info.writes_z +
+                       shader->selector->info.writes_stencil +
+                       shader->selector->info.writes_samplemask +
+                       1 /* SampleMaskIn */;
+
+               num_returns = MAX2(num_returns,
+                                  num_return_sgprs +
+                                  PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+               for (i = 0; i < num_return_sgprs; i++)
+                       returns[i] = ctx->i32;
+               for (; i < num_returns; i++)
+                       returns[i] = ctx->f32;
                break;
 
        case PIPE_SHADER_COMPUTE:
@@ -5740,12 +5531,11 @@ static void create_function(struct si_shader_context *ctx)
        for (i = 0; i <= last_sgpr; ++i)
                shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
 
-       /* Unused fragment shader inputs are eliminated by the compiler,
-        * so we don't know yet how many there will be.
-        */
-       if (ctx->type != PIPE_SHADER_FRAGMENT)
-               for (; i < num_params; ++i)
-                       shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
+       for (; i < num_params; ++i)
+               shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
+
+       assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
+       shader->info.num_input_vgprs -= num_prolog_vgprs;
 
        if (!ctx->screen->has_ds_bpermute &&
            bld_base->info &&
@@ -6719,10 +6509,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                break;
        case PIPE_SHADER_FRAGMENT:
                ctx->load_input = declare_input_fs;
-               if (ctx->no_epilog)
-                       bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
-               else
-                       bld_base->emit_epilogue = si_llvm_return_fs_outputs;
+               bld_base->emit_epilogue = si_llvm_return_fs_outputs;
                break;
        case PIPE_SHADER_COMPUTE:
                ctx->declare_memory_region = declare_compute_memory;
@@ -6736,14 +6523,6 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
        create_function(ctx);
        preload_ring_buffers(ctx);
 
-       if (ctx->no_prolog && sel->type == PIPE_SHADER_FRAGMENT &&
-           shader->key.ps.prolog.poly_stipple) {
-               LLVMValueRef list = LLVMGetParam(ctx->main_fn,
-                                                SI_PARAM_RW_BUFFERS);
-               si_llvm_emit_polygon_stipple(ctx, list,
-                                            SI_PARAM_POS_FIXED_PT);
-       }
-
        if (ctx->type == PIPE_SHADER_GEOMETRY) {
                int i;
                for (i = 0; i < 4; i++) {
@@ -6762,12 +6541,55 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
        return true;
 }
 
+/**
+ * Compute the VS prolog key, which contains all the information needed to
+ * build the VS prolog function, and set shader->info bits where needed.
+ */
+static void si_get_vs_prolog_key(struct si_shader *shader,
+                                union si_shader_part_key *key)
+{
+       struct tgsi_shader_info *info = &shader->selector->info;
+
+       memset(key, 0, sizeof(*key));
+       key->vs_prolog.states = shader->key.vs.prolog;
+       key->vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+       key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
+
+       /* Set the instanceID flag. */
+       for (unsigned i = 0; i < info->num_inputs; i++)
+               if (key->vs_prolog.states.instance_divisors[i])
+                       shader->info.uses_instanceid = true;
+}
+
+/**
+ * Compute the VS epilog key, which contains all the information needed to
+ * build the VS epilog function, and set the PrimitiveID output offset.
+ */
+static void si_get_vs_epilog_key(struct si_shader *shader,
+                                struct si_vs_epilog_bits *states,
+                                union si_shader_part_key *key)
+{
+       memset(key, 0, sizeof(*key));
+       key->vs_epilog.states = *states;
+
+       /* Set up the PrimitiveID output. */
+       if (shader->key.vs.epilog.export_prim_id) {
+               unsigned index = shader->selector->info.num_outputs;
+               unsigned offset = shader->info.nr_param_exports++;
+
+               key->vs_epilog.prim_id_param_offset = offset;
+               assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
+               shader->info.vs_output_param_offset[index] = offset;
+       }
+}
+
 /**
  * Compute the PS prolog key, which contains all the information needed to
  * build the PS prolog function, and set related bits in shader->config.
  */
 static void si_get_ps_prolog_key(struct si_shader *shader,
-                                union si_shader_part_key *key)
+                                union si_shader_part_key *key,
+                                bool separate_prolog)
 {
        struct tgsi_shader_info *info = &shader->selector->info;
 
@@ -6847,19 +6669,26 @@ static void si_get_ps_prolog_key(struct si_shader *shader,
                                if (shader->key.ps.prolog.force_linear_center_interp)
                                        location = TGSI_INTERPOLATE_LOC_CENTER;
 
+                               /* The VGPR assignment for non-monolithic shaders
+                                * works because InitialPSInputAddr is set on the
+                                * main shader and PERSP_PULL_MODEL is never used.
+                                */
                                switch (location) {
                                case TGSI_INTERPOLATE_LOC_SAMPLE:
-                                       key->ps_prolog.color_interp_vgpr_index[i] = 6;
+                                       key->ps_prolog.color_interp_vgpr_index[i] =
+                                               separate_prolog ? 6 : 9;
                                        shader->config.spi_ps_input_ena |=
                                                S_0286CC_LINEAR_SAMPLE_ENA(1);
                                        break;
                                case TGSI_INTERPOLATE_LOC_CENTER:
-                                       key->ps_prolog.color_interp_vgpr_index[i] = 8;
+                                       key->ps_prolog.color_interp_vgpr_index[i] =
+                                               separate_prolog ? 8 : 11;
                                        shader->config.spi_ps_input_ena |=
                                                S_0286CC_LINEAR_CENTER_ENA(1);
                                        break;
                                case TGSI_INTERPOLATE_LOC_CENTROID:
-                                       key->ps_prolog.color_interp_vgpr_index[i] = 10;
+                                       key->ps_prolog.color_interp_vgpr_index[i] =
+                                               separate_prolog ? 10 : 13;
                                        shader->config.spi_ps_input_ena |=
                                                S_0286CC_LINEAR_CENTROID_ENA(1);
                                        break;
@@ -7113,13 +6942,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
        }
 
        si_init_shader_ctx(&ctx, sscreen, shader, tm);
-       ctx.no_prolog = is_monolithic;
-       ctx.no_epilog = is_monolithic;
        ctx.separate_prolog = !is_monolithic;
 
-       if (ctx.type == PIPE_SHADER_FRAGMENT)
-               ctx.no_epilog = false;
-
        memset(shader->info.vs_output_param_offset, 0xff,
               sizeof(shader->info.vs_output_param_offset));
 
@@ -7133,17 +6957,77 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                return -1;
        }
 
-       if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
+       if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
+               LLVMValueRef parts[3];
+               bool need_prolog;
+               bool need_epilog;
+
+               need_prolog = sel->info.num_inputs;
+               need_epilog = !shader->key.vs.as_es && !shader->key.vs.as_ls;
+
+               parts[need_prolog ? 1 : 0] = ctx.main_fn;
+
+               if (need_prolog) {
+                       union si_shader_part_key prolog_key;
+                       si_get_vs_prolog_key(shader, &prolog_key);
+                       si_build_vs_prolog_function(&ctx, &prolog_key);
+                       parts[0] = ctx.main_fn;
+               }
+
+               if (need_epilog) {
+                       union si_shader_part_key epilog_key;
+                       si_get_vs_epilog_key(shader, &shader->key.vs.epilog, &epilog_key);
+                       si_build_vs_epilog_function(&ctx, &epilog_key);
+                       parts[need_prolog ? 2 : 1] = ctx.main_fn;
+               }
+
+               si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
+                                         need_prolog ? 1 : 0);
+       } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
                LLVMValueRef parts[2];
                union si_shader_part_key epilog_key;
 
                parts[0] = ctx.main_fn;
 
-               si_get_ps_epilog_key(shader, &epilog_key);
-               si_build_ps_epilog_function(&ctx, &epilog_key);
+               memset(&epilog_key, 0, sizeof(epilog_key));
+               epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
+               si_build_tcs_epilog_function(&ctx, &epilog_key);
                parts[1] = ctx.main_fn;
 
                si_build_wrapper_function(&ctx, parts, 2, 0);
+       } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
+                  !shader->key.tes.as_es) {
+               LLVMValueRef parts[2];
+               union si_shader_part_key epilog_key;
+
+               parts[0] = ctx.main_fn;
+
+               si_get_vs_epilog_key(shader, &shader->key.tes.epilog, &epilog_key);
+               si_build_vs_epilog_function(&ctx, &epilog_key);
+               parts[1] = ctx.main_fn;
+
+               si_build_wrapper_function(&ctx, parts, 2, 0);
+       } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
+               LLVMValueRef parts[3];
+               union si_shader_part_key prolog_key;
+               union si_shader_part_key epilog_key;
+               bool need_prolog;
+
+               si_get_ps_prolog_key(shader, &prolog_key, false);
+               need_prolog = si_need_ps_prolog(&prolog_key);
+
+               parts[need_prolog ? 1 : 0] = ctx.main_fn;
+
+               if (need_prolog) {
+                       si_build_ps_prolog_function(&ctx, &prolog_key);
+                       parts[0] = ctx.main_fn;
+               }
+
+               si_get_ps_epilog_key(shader, &epilog_key);
+               si_build_ps_epilog_function(&ctx, &epilog_key);
+               parts[need_prolog ? 2 : 1] = ctx.main_fn;
+
+               si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0);
        }
 
        mod = bld_base->base.gallivm->module;
@@ -7274,22 +7158,25 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
  *
  * \param sscreen      screen
  * \param list         list of shader parts of the same category
+ * \param type         shader type
  * \param key          shader part key
+ * \param prolog       whether the part being requested is a prolog
  * \param tm           LLVM target machine
  * \param debug                debug callback
- * \param compile      the callback responsible for compilation
+ * \param build                the callback responsible for building the main function
  * \return             non-NULL on success
  */
 static struct si_shader_part *
 si_get_shader_part(struct si_screen *sscreen,
                   struct si_shader_part **list,
+                  enum pipe_shader_type type,
+                  bool prolog,
                   union si_shader_part_key *key,
                   LLVMTargetMachineRef tm,
                   struct pipe_debug_callback *debug,
-                  bool (*compile)(struct si_screen *,
-                                  LLVMTargetMachineRef,
-                                  struct pipe_debug_callback *,
-                                  struct si_shader_part *))
+                  void (*build)(struct si_shader_context *,
+                                union si_shader_part_key *),
+                  const char *name)
 {
        struct si_shader_part *result;
 
@@ -7306,24 +7193,59 @@ si_get_shader_part(struct si_screen *sscreen,
        /* Compile a new one. */
        result = CALLOC_STRUCT(si_shader_part);
        result->key = *key;
-       if (!compile(sscreen, tm, debug, result)) {
+
+       struct si_shader shader = {};
+       struct si_shader_context ctx;
+       struct gallivm_state *gallivm = &ctx.gallivm;
+
+       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+       ctx.type = type;
+
+       switch (type) {
+       case PIPE_SHADER_VERTEX:
+               break;
+       case PIPE_SHADER_TESS_CTRL:
+               assert(!prolog);
+               shader.key.tcs.epilog = key->tcs_epilog.states;
+               break;
+       case PIPE_SHADER_FRAGMENT:
+               if (prolog)
+                       shader.key.ps.prolog = key->ps_prolog.states;
+               else
+                       shader.key.ps.epilog = key->ps_epilog.states;
+               break;
+       default:
+               unreachable("bad shader part");
+       }
+
+       build(&ctx, key);
+
+       /* Compile. */
+       si_llvm_finalize_module(&ctx,
+               r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
+
+       if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
+                           gallivm->module, debug, ctx.type, name)) {
                FREE(result);
-               pipe_mutex_unlock(sscreen->shader_parts_mutex);
-               return NULL;
+               result = NULL;
+               goto out;
        }
 
        result->next = *list;
        *list = result;
+
+out:
+       si_llvm_dispose(&ctx);
        pipe_mutex_unlock(sscreen->shader_parts_mutex);
        return result;
 }
 
 /**
- * Create a vertex shader prolog.
+ * Build the vertex shader prolog function.
  *
  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
  * All inputs are returned unmodified. The vertex load indices are
- * stored after them, which will used by the API VS for fetching inputs.
+ * stored after them, which will be used by the API VS for fetching inputs.
  *
  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
  *   input_v0,
@@ -7334,24 +7256,16 @@ si_get_shader_part(struct si_screen *sscreen,
  *   (InstanceID + StartInstance),
  *   (InstanceID / 2 + StartInstance)
  */
-static bool si_compile_vs_prolog(struct si_screen *sscreen,
-                                LLVMTargetMachineRef tm,
-                                struct pipe_debug_callback *debug,
-                                struct si_shader_part *out)
+static void si_build_vs_prolog_function(struct si_shader_context *ctx,
+                                       union si_shader_part_key *key)
 {
-       union si_shader_part_key *key = &out->key;
-       struct si_shader shader = {};
-       struct si_shader_context ctx;
-       struct gallivm_state *gallivm = &ctx.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
        LLVMTypeRef *params, *returns;
        LLVMValueRef ret, func;
        int last_sgpr, num_params, num_returns, i;
-       bool status = true;
 
-       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
-       ctx.type = PIPE_SHADER_VERTEX;
-       ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
-       ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
+       ctx->param_vertex_id = key->vs_prolog.num_input_sgprs;
+       ctx->param_instance_id = key->vs_prolog.num_input_sgprs + 3;
 
        /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
        params = alloca((key->vs_prolog.num_input_sgprs + 4) *
@@ -7365,37 +7279,37 @@ static bool si_compile_vs_prolog(struct si_screen *sscreen,
        /* Declare input and output SGPRs. */
        num_params = 0;
        for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
-               params[num_params++] = ctx.i32;
-               returns[num_returns++] = ctx.i32;
+               params[num_params++] = ctx->i32;
+               returns[num_returns++] = ctx->i32;
        }
        last_sgpr = num_params - 1;
 
        /* 4 preloaded VGPRs (outputs must be floats) */
        for (i = 0; i < 4; i++) {
-               params[num_params++] = ctx.i32;
-               returns[num_returns++] = ctx.f32;
+               params[num_params++] = ctx->i32;
+               returns[num_returns++] = ctx->f32;
        }
 
        /* Vertex load indices. */
        for (i = 0; i <= key->vs_prolog.last_input; i++)
-               returns[num_returns++] = ctx.f32;
+               returns[num_returns++] = ctx->f32;
 
        /* Create the function. */
-       si_create_function(&ctx, "vs_prolog", returns, num_returns, params,
+       si_create_function(ctx, "vs_prolog", returns, num_returns, params,
                           num_params, last_sgpr);
-       func = ctx.main_fn;
+       func = ctx->main_fn;
 
        /* Copy inputs to outputs. This should be no-op, as the registers match,
         * but it will prevent the compiler from overwriting them unintentionally.
         */
-       ret = ctx.return_value;
+       ret = ctx->return_value;
        for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
                LLVMValueRef p = LLVMGetParam(func, i);
                ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
        }
        for (i = num_params - 4; i < num_params; i++) {
                LLVMValueRef p = LLVMGetParam(func, i);
-               p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
+               p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
                ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
        }
 
@@ -7406,37 +7320,26 @@ static bool si_compile_vs_prolog(struct si_screen *sscreen,
 
                if (divisor) {
                        /* InstanceID / Divisor + StartInstance */
-                       index = get_instance_index_for_fetch(&ctx,
+                       index = get_instance_index_for_fetch(ctx,
                                                             SI_SGPR_START_INSTANCE,
                                                             divisor);
                } else {
                        /* VertexID + BaseVertex */
                        index = LLVMBuildAdd(gallivm->builder,
-                                            LLVMGetParam(func, ctx.param_vertex_id),
+                                            LLVMGetParam(func, ctx->param_vertex_id),
                                             LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
                }
 
-               index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
+               index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
                ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
                                           num_params++, "");
        }
 
-       /* Compile. */
-       si_llvm_build_ret(&ctx, ret);
-       si_llvm_finalize_module(&ctx,
-               r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_VERTEX));
-
-       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
-                           gallivm->module, debug, ctx.type,
-                           "Vertex Shader Prolog"))
-               status = false;
-
-       si_llvm_dispose(&ctx);
-       return status;
+       si_llvm_build_ret(ctx, ret);
 }
 
 /**
- * Compile the vertex shader epilog. This is also used by the tessellation
+ * Build the vertex shader epilog function. This is also used by the tessellation
  * evaluation shader compiled as VS.
  *
  * The input is PrimitiveID.
@@ -7444,21 +7347,13 @@ static bool si_compile_vs_prolog(struct si_screen *sscreen,
  * If PrimitiveID is required by the pixel shader, export it.
  * Otherwise, do nothing.
  */
-static bool si_compile_vs_epilog(struct si_screen *sscreen,
-                                LLVMTargetMachineRef tm,
-                                struct pipe_debug_callback *debug,
-                                struct si_shader_part *out)
+static void si_build_vs_epilog_function(struct si_shader_context *ctx,
+                                       union si_shader_part_key *key)
 {
-       union si_shader_part_key *key = &out->key;
-       struct si_shader_context ctx;
-       struct gallivm_state *gallivm = &ctx.gallivm;
-       struct lp_build_tgsi_context *bld_base = &ctx.soa.bld_base;
+       struct gallivm_state *gallivm = &ctx->gallivm;
+       struct lp_build_tgsi_context *bld_base = &ctx->soa.bld_base;
        LLVMTypeRef params[5];
        int num_params, i;
-       bool status = true;
-
-       si_init_shader_ctx(&ctx, sscreen, NULL, tm);
-       ctx.type = PIPE_SHADER_VERTEX;
 
        /* Declare input VGPRs. */
        num_params = key->vs_epilog.states.export_prim_id ?
@@ -7466,10 +7361,10 @@ static bool si_compile_vs_epilog(struct si_screen *sscreen,
        assert(num_params <= ARRAY_SIZE(params));
 
        for (i = 0; i < num_params; i++)
-               params[i] = ctx.f32;
+               params[i] = ctx->f32;
 
        /* Create the function. */
-       si_create_function(&ctx, "vs_epilog", NULL, 0, params, num_params, -1);
+       si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1);
 
        /* Emit exports. */
        if (key->vs_epilog.states.export_prim_id) {
@@ -7483,7 +7378,7 @@ static bool si_compile_vs_epilog(struct si_screen *sscreen,
                args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
                                               key->vs_epilog.prim_id_param_offset);
                args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
-               args[5] = LLVMGetParam(ctx.main_fn,
+               args[5] = LLVMGetParam(ctx->main_fn,
                                       VS_EPILOG_PRIMID_LOC); /* X */
                args[6] = base->undef; /* Y */
                args[7] = base->undef; /* Z */
@@ -7494,18 +7389,7 @@ static bool si_compile_vs_epilog(struct si_screen *sscreen,
                                   args, 9, 0);
        }
 
-       /* Compile. */
        LLVMBuildRetVoid(gallivm->builder);
-       si_llvm_finalize_module(&ctx,
-               r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_VERTEX));
-
-       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
-                           gallivm->module, debug, ctx.type,
-                           "Vertex Shader Epilog"))
-               status = false;
-
-       si_llvm_dispose(&ctx);
-       return status;
 }
 
 /**
@@ -7519,22 +7403,13 @@ static bool si_get_vs_epilog(struct si_screen *sscreen,
 {
        union si_shader_part_key epilog_key;
 
-       memset(&epilog_key, 0, sizeof(epilog_key));
-       epilog_key.vs_epilog.states = *states;
-
-       /* Set up the PrimitiveID output. */
-       if (shader->key.vs.epilog.export_prim_id) {
-               unsigned index = shader->selector->info.num_outputs;
-               unsigned offset = shader->info.nr_param_exports++;
-
-               epilog_key.vs_epilog.prim_id_param_offset = offset;
-               assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
-               shader->info.vs_output_param_offset[index] = offset;
-       }
+       si_get_vs_epilog_key(shader, states, &epilog_key);
 
        shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
+                                           PIPE_SHADER_VERTEX, true,
                                            &epilog_key, tm, debug,
-                                           si_compile_vs_epilog);
+                                           si_build_vs_epilog_function,
+                                           "Vertex Shader Epilog");
        return shader->epilog != NULL;
 }
 
@@ -7548,20 +7423,18 @@ static bool si_shader_select_vs_parts(struct si_screen *sscreen,
 {
        struct tgsi_shader_info *info = &shader->selector->info;
        union si_shader_part_key prolog_key;
-       unsigned i;
 
        /* Get the prolog. */
-       memset(&prolog_key, 0, sizeof(prolog_key));
-       prolog_key.vs_prolog.states = shader->key.vs.prolog;
-       prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
-       prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
+       si_get_vs_prolog_key(shader, &prolog_key);
 
        /* The prolog is a no-op if there are no inputs. */
        if (info->num_inputs) {
                shader->prolog =
                        si_get_shader_part(sscreen, &sscreen->vs_prologs,
+                                          PIPE_SHADER_VERTEX, true,
                                           &prolog_key, tm, debug,
-                                          si_compile_vs_prolog);
+                                          si_build_vs_prolog_function,
+                                          "Vertex Shader Prolog");
                if (!shader->prolog)
                        return false;
        }
@@ -7572,11 +7445,6 @@ static bool si_shader_select_vs_parts(struct si_screen *sscreen,
                              &shader->key.vs.epilog))
                return false;
 
-       /* Set the instanceID flag. */
-       for (i = 0; i < info->num_inputs; i++)
-               if (prolog_key.vs_prolog.states.instance_divisors[i])
-                       shader->info.uses_instanceid = true;
-
        return true;
 }
 
@@ -7597,69 +7465,48 @@ static bool si_shader_select_tes_parts(struct si_screen *sscreen,
 }
 
 /**
- * Compile the TCS epilog. This writes tesselation factors to memory based on
- * the output primitive type of the tesselator (determined by TES).
+ * Compile the TCS epilog function. This writes tesselation factors to memory
+ * based on the output primitive type of the tesselator (determined by TES).
  */
-static bool si_compile_tcs_epilog(struct si_screen *sscreen,
-                                 LLVMTargetMachineRef tm,
-                                 struct pipe_debug_callback *debug,
-                                 struct si_shader_part *out)
+static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
+                                        union si_shader_part_key *key)
 {
-       union si_shader_part_key *key = &out->key;
-       struct si_shader shader = {};
-       struct si_shader_context ctx;
-       struct gallivm_state *gallivm = &ctx.gallivm;
-       struct lp_build_tgsi_context *bld_base = &ctx.soa.bld_base;
+       struct gallivm_state *gallivm = &ctx->gallivm;
+       struct lp_build_tgsi_context *bld_base = &ctx->soa.bld_base;
        LLVMTypeRef params[16];
        LLVMValueRef func;
        int last_sgpr, num_params;
-       bool status = true;
-
-       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
-       ctx.type = PIPE_SHADER_TESS_CTRL;
-       shader.key.tcs.epilog = key->tcs_epilog.states;
 
        /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
-       params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
-       params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
-       params[SI_PARAM_SAMPLERS] = ctx.i64;
-       params[SI_PARAM_IMAGES] = ctx.i64;
-       params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
-       params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
-       params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
-       params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
-       params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
-       params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
-       params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
+       params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
+       params[SI_PARAM_CONST_BUFFERS] = ctx->i64;
+       params[SI_PARAM_SAMPLERS] = ctx->i64;
+       params[SI_PARAM_IMAGES] = ctx->i64;
+       params[SI_PARAM_SHADER_BUFFERS] = ctx->i64;
+       params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
+       params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
+       params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
+       params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
+       params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
+       params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
        last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
        num_params = last_sgpr + 1;
 
-       params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
-       params[num_params++] = ctx.i32; /* invocation ID within the patch */
-       params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */
+       params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
+       params[num_params++] = ctx->i32; /* invocation ID within the patch */
+       params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
 
        /* Create the function. */
-       si_create_function(&ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
-       declare_tess_lds(&ctx);
-       func = ctx.main_fn;
+       si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
+       declare_tess_lds(ctx);
+       func = ctx->main_fn;
 
        si_write_tess_factors(bld_base,
                              LLVMGetParam(func, last_sgpr + 1),
                              LLVMGetParam(func, last_sgpr + 2),
                              LLVMGetParam(func, last_sgpr + 3));
 
-       /* Compile. */
        LLVMBuildRetVoid(gallivm->builder);
-       si_llvm_finalize_module(&ctx,
-               r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_TESS_CTRL));
-
-       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
-                           gallivm->module, debug, ctx.type,
-                           "Tessellation Control Shader Epilog"))
-               status = false;
-
-       si_llvm_dispose(&ctx);
-       return status;
 }
 
 /**
@@ -7677,8 +7524,10 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
        epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
 
        shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
+                                           PIPE_SHADER_TESS_CTRL, false,
                                            &epilog_key, tm, debug,
-                                           si_compile_tcs_epilog);
+                                           si_build_tcs_epilog_function,
+                                           "Tessellation Control Shader Epilog");
        return shader->epilog != NULL;
 }
 
@@ -7928,39 +7777,6 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
        si_llvm_build_ret(ctx, ret);
 }
 
-/**
- * Compile the pixel shader prolog.
- */
-static bool si_compile_ps_prolog(struct si_screen *sscreen,
-                                LLVMTargetMachineRef tm,
-                                struct pipe_debug_callback *debug,
-                                struct si_shader_part *out)
-{
-       union si_shader_part_key *key = &out->key;
-       struct si_shader shader = {};
-       struct si_shader_context ctx;
-       struct gallivm_state *gallivm = &ctx.gallivm;
-       bool status = true;
-
-       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
-       ctx.type = PIPE_SHADER_FRAGMENT;
-       shader.key.ps.prolog = key->ps_prolog.states;
-
-       si_build_ps_prolog_function(&ctx, key);
-
-       /* Compile. */
-       si_llvm_finalize_module(&ctx,
-               r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
-
-       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
-                           gallivm->module, debug, ctx.type,
-                           "Fragment Shader Prolog"))
-               status = false;
-
-       si_llvm_dispose(&ctx);
-       return status;
-}
-
 /**
  * Build the pixel shader epilog function. This handles everything that must be
  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
@@ -8062,40 +7878,6 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx,
        LLVMBuildRetVoid(gallivm->builder);
 }
 
-
-/**
- * Compile the pixel shader epilog to a binary for concatenation.
- */
-static bool si_compile_ps_epilog(struct si_screen *sscreen,
-                                LLVMTargetMachineRef tm,
-                                struct pipe_debug_callback *debug,
-                                struct si_shader_part *out)
-{
-       union si_shader_part_key *key = &out->key;
-       struct si_shader shader = {};
-       struct si_shader_context ctx;
-       struct gallivm_state *gallivm = &ctx.gallivm;
-       bool status = true;
-
-       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
-       ctx.type = PIPE_SHADER_FRAGMENT;
-       shader.key.ps.epilog = key->ps_epilog.states;
-
-       si_build_ps_epilog_function(&ctx, key);
-
-       /* Compile. */
-       si_llvm_finalize_module(&ctx,
-               r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
-
-       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
-                           gallivm->module, debug, ctx.type,
-                           "Fragment Shader Epilog"))
-               status = false;
-
-       si_llvm_dispose(&ctx);
-       return status;
-}
-
 /**
  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
  */
@@ -8108,14 +7890,16 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen,
        union si_shader_part_key epilog_key;
 
        /* Get the prolog. */
-       si_get_ps_prolog_key(shader, &prolog_key);
+       si_get_ps_prolog_key(shader, &prolog_key, true);
 
        /* The prolog is a no-op if these aren't set. */
        if (si_need_ps_prolog(&prolog_key)) {
                shader->prolog =
                        si_get_shader_part(sscreen, &sscreen->ps_prologs,
+                                          PIPE_SHADER_FRAGMENT, true,
                                           &prolog_key, tm, debug,
-                                          si_compile_ps_prolog);
+                                          si_build_ps_prolog_function,
+                                          "Fragment Shader Prolog");
                if (!shader->prolog)
                        return false;
        }
@@ -8125,8 +7909,10 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen,
 
        shader->epilog =
                si_get_shader_part(sscreen, &sscreen->ps_epilogs,
+                                  PIPE_SHADER_FRAGMENT, false,
                                   &epilog_key, tm, debug,
-                                  si_compile_ps_epilog);
+                                  si_build_ps_epilog_function,
+                                  "Fragment Shader Epilog");
        if (!shader->epilog)
                return false;