gallium/radeon: eliminate fast color clear before sharing
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index 19c427a1bb9160860a6c855f038dd6626e595b44..8c1151aa493cb6f430fa3b94486ba9985b7e27dc 100644 (file)
@@ -70,6 +70,12 @@ struct si_shader_context
 
        unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
        bool is_gs_copy_shader;
+
+       /* Whether to generate the optimized shader variant compiled as a whole
+        * (without a prolog and epilog)
+        */
+       bool is_monolithic;
+
        int param_streamout_config;
        int param_streamout_write_index;
        int param_streamout_offset[4];
@@ -77,6 +83,7 @@ struct si_shader_context
        int param_rel_auto_id;
        int param_vs_prim_id;
        int param_instance_id;
+       int param_vertex_index0;
        int param_tes_u;
        int param_tes_v;
        int param_tes_rel_patch_id;
@@ -96,14 +103,17 @@ struct si_shader_context
        LLVMValueRef esgs_ring;
        LLVMValueRef gsvs_ring[4];
        LLVMValueRef gs_next_vertex[4];
+       LLVMValueRef return_value;
 
        LLVMTypeRef voidt;
        LLVMTypeRef i1;
        LLVMTypeRef i8;
        LLVMTypeRef i32;
+       LLVMTypeRef i64;
        LLVMTypeRef i128;
        LLVMTypeRef f32;
        LLVMTypeRef v16i8;
+       LLVMTypeRef v2i32;
        LLVMTypeRef v4i32;
        LLVMTypeRef v4f32;
        LLVMTypeRef v8i32;
@@ -118,9 +128,17 @@ static struct si_shader_context *si_shader_context(
 static void si_init_shader_ctx(struct si_shader_context *ctx,
                               struct si_screen *sscreen,
                               struct si_shader *shader,
-                              LLVMTargetMachineRef tm,
-                              struct tgsi_shader_info *info);
+                              LLVMTargetMachineRef tm);
 
+/* Ideally pass the sample mask input to the PS epilog as v13, which
+ * is its usual location, so that the shader doesn't have to add v_mov.
+ */
+#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
+
+/* The VS location of the PrimitiveID input is the same in the epilog,
+ * so that the main shader part doesn't have to move it.
+ */
+#define VS_EPILOG_PRIMID_LOC 2
 
 #define PERSPECTIVE_BASE 0
 #define LINEAR_BASE 9
@@ -196,6 +214,10 @@ static LLVMValueRef unpack_param(struct si_shader_context *ctx,
        LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
                                          param);
 
+       if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
+               value = bitcast(&ctx->radeon_bld.soa.bld_base,
+                               TGSI_TYPE_UNSIGNED, value);
+
        if (rshift)
                value = LLVMBuildLShr(gallivm->builder, value,
                                      lp_build_const_int32(gallivm, rshift), "");
@@ -375,7 +397,7 @@ static LLVMValueRef build_indexed_load_const(
 
 static LLVMValueRef get_instance_index_for_fetch(
        struct radeon_llvm_context *radeon_bld,
-       unsigned divisor)
+       unsigned param_start_instance, unsigned divisor)
 {
        struct si_shader_context *ctx =
                si_shader_context(&radeon_bld->soa.bld_base);
@@ -389,8 +411,8 @@ static LLVMValueRef get_instance_index_for_fetch(
                result = LLVMBuildUDiv(gallivm->builder, result,
                                lp_build_const_int32(gallivm, divisor), "");
 
-       return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam(
-                       radeon_bld->main_fn, SI_PARAM_START_INSTANCE), "");
+       return LLVMBuildAdd(gallivm->builder, result,
+                           LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
 }
 
 static void declare_input_vs(
@@ -402,7 +424,8 @@ static void declare_input_vs(
        struct gallivm_state *gallivm = base->gallivm;
        struct si_shader_context *ctx =
                si_shader_context(&radeon_bld->soa.bld_base);
-       unsigned divisor = ctx->shader->key.vs.instance_divisors[input_index];
+       unsigned divisor =
+               ctx->shader->key.vs.prolog.instance_divisors[input_index];
 
        unsigned chan;
 
@@ -424,10 +447,16 @@ static void declare_input_vs(
        /* Build the attribute offset */
        attribute_offset = lp_build_const_int32(gallivm, 0);
 
-       if (divisor) {
+       if (!ctx->is_monolithic) {
+               buffer_index = LLVMGetParam(radeon_bld->main_fn,
+                                           ctx->param_vertex_index0 +
+                                           input_index);
+       } else if (divisor) {
                /* Build index from instance ID, start instance and divisor */
-               ctx->shader->uses_instanceid = true;
-               buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld, divisor);
+               ctx->shader->info.uses_instanceid = true;
+               buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
+                                                           SI_PARAM_START_INSTANCE,
+                                                           divisor);
        } else {
                /* Load the buffer index for vertices. */
                LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
@@ -853,7 +882,8 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location)
 static unsigned select_interp_param(struct si_shader_context *ctx,
                                    unsigned param)
 {
-       if (!ctx->shader->key.ps.force_persample_interp)
+       if (!ctx->shader->key.ps.prolog.force_persample_interp ||
+           !ctx->is_monolithic)
                return param;
 
        /* If the shader doesn't use center/centroid, just return the parameter.
@@ -923,7 +953,7 @@ static void interp_fs_input(struct si_shader_context *ctx,
        intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
 
        if (semantic_name == TGSI_SEMANTIC_COLOR &&
-           ctx->shader->key.ps.color_two_side) {
+           ctx->shader->key.ps.prolog.color_two_side) {
                LLVMValueRef args[4];
                LLVMValueRef is_face_positive;
                LLVMValueRef back_attr_number;
@@ -997,6 +1027,7 @@ static void declare_input_fs(
        unsigned input_index,
        const struct tgsi_full_declaration *decl)
 {
+       struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
        struct si_shader_context *ctx =
                si_shader_context(&radeon_bld->soa.bld_base);
        struct si_shader *shader = ctx->shader;
@@ -1004,6 +1035,26 @@ static void declare_input_fs(
        LLVMValueRef interp_param = NULL;
        int interp_param_idx;
 
+       /* Get colors from input VGPRs (set by the prolog). */
+       if (!ctx->is_monolithic &&
+           decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
+               unsigned i = decl->Semantic.Index;
+               unsigned colors_read = shader->selector->info.colors_read;
+               unsigned mask = colors_read >> (i * 4);
+               unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
+                                 (i ? util_bitcount(colors_read & 0xf) : 0);
+
+               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
+                       mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
+               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
+                       mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
+               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
+                       mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
+               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
+                       mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
+               return;
+       }
+
        interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
                                                     decl->Interp.Location);
        if (interp_param_idx == -1)
@@ -1330,12 +1381,12 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 
        if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
                const union si_shader_key *key = &ctx->shader->key;
-               unsigned col_formats = key->ps.spi_shader_col_format;
+               unsigned col_formats = key->ps.epilog.spi_shader_col_format;
                int cbuf = target - V_008DFC_SQ_EXP_MRT;
 
                assert(cbuf >= 0 && cbuf < 8);
                spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
-               is_int8 = (key->ps.color_is_int8 >> cbuf) & 0x1;
+               is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
        }
 
        args[4] = uint->zero; /* COMPR flag */
@@ -1488,13 +1539,13 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = bld_base->base.gallivm;
 
-       if (ctx->shader->key.ps.alpha_func != PIPE_FUNC_NEVER) {
+       if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
                LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
                                SI_PARAM_ALPHA_REF);
 
                LLVMValueRef alpha_pass =
                        lp_build_cmp(&bld_base->base,
-                                    ctx->shader->key.ps.alpha_func,
+                                    ctx->shader->key.ps.epilog.alpha_func,
                                     alpha, alpha_ref);
                LLVMValueRef arg =
                        lp_build_select(&bld_base->base,
@@ -1511,7 +1562,8 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 }
 
 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
-                                                 LLVMValueRef alpha)
+                                                 LLVMValueRef alpha,
+                                                 unsigned samplemask_param)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -1519,7 +1571,7 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *
 
        /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
        coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
-                               SI_PARAM_SAMPLE_COVERAGE);
+                               samplemask_param);
        coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
 
        coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
@@ -1841,7 +1893,8 @@ handle_semantic:
                case TGSI_SEMANTIC_COLOR:
                case TGSI_SEMANTIC_BCOLOR:
                        target = V_008DFC_SQ_EXP_PARAM + param_count;
-                       shader->vs_output_param_offset[i] = param_count;
+                       assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+                       shader->info.vs_output_param_offset[i] = param_count;
                        param_count++;
                        break;
                case TGSI_SEMANTIC_CLIPDIST:
@@ -1855,7 +1908,8 @@ handle_semantic:
                case TGSI_SEMANTIC_TEXCOORD:
                case TGSI_SEMANTIC_GENERIC:
                        target = V_008DFC_SQ_EXP_PARAM + param_count;
-                       shader->vs_output_param_offset[i] = param_count;
+                       assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+                       shader->info.vs_output_param_offset[i] = param_count;
                        param_count++;
                        break;
                default:
@@ -1883,7 +1937,7 @@ handle_semantic:
                }
        }
 
-       shader->nr_param_exports = param_count;
+       shader->info.nr_param_exports = param_count;
 
        /* We need to add the position output manually if it's missing. */
        if (!pos_args[0][0]) {
@@ -1945,7 +1999,7 @@ handle_semantic:
 
        for (i = 0; i < 4; i++)
                if (pos_args[i][0])
-                       shader->nr_pos_exports++;
+                       shader->info.nr_pos_exports++;
 
        pos_idx = 0;
        for (i = 0; i < 4; i++) {
@@ -1955,7 +2009,7 @@ handle_semantic:
                /* Specify the target we are exporting */
                pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);
 
-               if (pos_idx == shader->nr_pos_exports)
+               if (pos_idx == shader->info.nr_pos_exports)
                        /* Specify that this is the last export */
                        pos_args[i][2] = uint->one;
 
@@ -1989,7 +2043,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                                  invocation_id, bld_base->uint_bld.zero, ""));
 
        /* Determine the layout of one tess factor element in the buffer. */
-       switch (shader->key.tcs.prim_mode) {
+       switch (shader->key.tcs.epilog.prim_mode) {
        case PIPE_PRIM_LINES:
                stride = 2; /* 2 dwords, 1 vec2 store */
                outer_comps = 2;
@@ -2061,14 +2115,51 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
-       LLVMValueRef invocation_id;
+       LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
 
+       rel_patch_id = get_rel_patch_id(ctx);
        invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
+       tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
 
-       si_write_tess_factors(bld_base,
-                             get_rel_patch_id(ctx),
-                             invocation_id,
-                             get_tcs_out_current_patch_data_offset(ctx));
+       if (!ctx->is_monolithic) {
+               /* Return epilog parameters from this function.
+                *
+                * Layout of the return value as filled in below:
+                *   [0], [1]                 RW_BUFFERS pointer, split into two dwords
+                *   [SI_TCS_NUM_USER_SGPR]   tess factor buffer soffset
+                *   the next three slots     rel_patch_id, invocation_id and
+                *                            tf_lds_offset, bitcast to float
+                *
+                * si_write_tess_factors() is not called on this path.
+                */
+               LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+               LLVMValueRef ret = ctx->return_value;
+               LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
+               unsigned vgpr;
+
+               /* RW_BUFFERS pointer */
+               rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
+                                         SI_PARAM_RW_BUFFERS);
+               rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
+               rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, ""); /* i64 -> two i32 halves */
+               rw0 = LLVMBuildExtractElement(builder, rw_buffers,
+                                             bld_base->uint_bld.zero, "");
+               rw1 = LLVMBuildExtractElement(builder, rw_buffers,
+                                             bld_base->uint_bld.one, "");
+               ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
+               ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
+
+               /* Tess factor buffer soffset is after user SGPRs. */
+               tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
+                                         SI_PARAM_TESS_FACTOR_OFFSET);
+               ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
+                                          SI_TCS_NUM_USER_SGPR, "");
+
+               /* VGPRs */
+               rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
+               invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
+               tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
+
+               vgpr = SI_TCS_NUM_USER_SGPR + 1; /* first slot after the tf_soffset slot */
+               ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
+               ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
+               ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+               ctx->return_value = ret;
+               return;
+       }
+
+       si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset); /* monolithic path only */
 }
 
 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
@@ -2214,16 +2305,26 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
                                              "");
        }
 
-       /* Export PrimitiveID when PS needs it. */
-       if (si_vs_exports_prim_id(ctx->shader)) {
-               outputs[i].name = TGSI_SEMANTIC_PRIMID;
-               outputs[i].sid = 0;
-               outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
-                                              get_primitive_id(bld_base, 0));
-               outputs[i].values[1] = bld_base->base.undef;
-               outputs[i].values[2] = bld_base->base.undef;
-               outputs[i].values[3] = bld_base->base.undef;
-               i++;
+       if (ctx->is_monolithic) {
+               /* Export PrimitiveID when PS needs it. */
+               if (si_vs_exports_prim_id(ctx->shader)) {
+                       outputs[i].name = TGSI_SEMANTIC_PRIMID;
+                       outputs[i].sid = 0;
+                       outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+                                                      get_primitive_id(bld_base, 0));
+                       outputs[i].values[1] = bld_base->base.undef;
+                       outputs[i].values[2] = bld_base->base.undef;
+                       outputs[i].values[3] = bld_base->base.undef;
+                       i++;
+               }
+       } else {
+               /* Return the primitive ID from the LLVM function. */
+               ctx->return_value =
+                       LLVMBuildInsertValue(gallivm->builder,
+                                            ctx->return_value,
+                                            bitcast(bld_base, TGSI_TYPE_FLOAT,
+                                                    get_primitive_id(bld_base, 0)),
+                                            VS_EPILOG_PRIMID_LOC, "");
        }
 
        si_llvm_export_vs(bld_base, outputs, i);
@@ -2284,6 +2385,7 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
 
 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
                                LLVMValueRef *color, unsigned index,
+                               unsigned samplemask_param,
                                bool is_last)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
@@ -2291,30 +2393,31 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
        int i;
 
        /* Clamp color */
-       if (ctx->shader->key.ps.clamp_color)
+       if (ctx->shader->key.ps.epilog.clamp_color)
                for (i = 0; i < 4; i++)
                        color[i] = radeon_llvm_saturate(bld_base, color[i]);
 
        /* Alpha to one */
-       if (ctx->shader->key.ps.alpha_to_one)
+       if (ctx->shader->key.ps.epilog.alpha_to_one)
                color[3] = base->one;
 
        /* Alpha test */
        if (index == 0 &&
-           ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
+           ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
                si_alpha_test(bld_base, color[3]);
 
        /* Line & polygon smoothing */
-       if (ctx->shader->key.ps.poly_line_smoothing)
-               color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]);
+       if (ctx->shader->key.ps.epilog.poly_line_smoothing)
+               color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
+                                                        samplemask_param);
 
        /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-       if (ctx->shader->key.ps.last_cbuf > 0) {
+       if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
                LLVMValueRef args[8][9];
                int c, last = -1;
 
                /* Get the export arguments, also find out what the last one is. */
-               for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) {
+               for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
                        si_llvm_init_export_args(bld_base, color,
                                                 V_008DFC_SQ_EXP_MRT + c, args[c]);
                        if (args[c][0] != bld_base->uint_bld.zero)
@@ -2322,7 +2425,7 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
                }
 
                /* Emit all exports. */
-               for (c = 0; c <= ctx->shader->key.ps.last_cbuf; c++) {
+               for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
                        if (is_last && last == c) {
                                args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
                                args[c][2] = bld_base->uint_bld.one; /* DONE bit */
@@ -2385,11 +2488,11 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
         * Otherwise, find the last color export.
         */
        if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
-               unsigned spi_format = shader->key.ps.spi_shader_col_format;
+               unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;
 
                /* Don't export NULL and return if alpha-test is enabled. */
-               if (shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS &&
-                   shader->key.ps.alpha_func != PIPE_FUNC_NEVER &&
+               if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
+                   shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
                    (spi_format & 0xf) == 0)
                        spi_format |= V_028714_SPI_SHADER_32_AR;
 
@@ -2400,10 +2503,10 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
                                continue;
 
                        /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-                       if (shader->key.ps.last_cbuf > 0) {
+                       if (shader->key.ps.epilog.last_cbuf > 0) {
                                /* Just set this if any of the colorbuffers are enabled. */
                                if (spi_format &
-                                   ((1llu << (4 * (shader->key.ps.last_cbuf + 1))) - 1))
+                                   ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
                                        last_color_export = i;
                                continue;
                        }
@@ -2445,6 +2548,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
                                                         ctx->radeon_bld.soa.outputs[i][j], "");
 
                        si_export_mrt_color(bld_base, color, semantic_index,
+                                           SI_PARAM_SAMPLE_COVERAGE,
                                            last_color_export == i);
                        break;
                default:
@@ -2458,6 +2562,100 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
                si_export_mrt_z(bld_base, depth, stencil, samplemask);
 }
 
+/**
+ * Return PS outputs in this order:
+ *
+ * v[0:3] = color0.xyzw
+ * v[4:7] = color1.xyzw
+ * ...
+ * vN+0 = Depth
+ * vN+1 = Stencil
+ * vN+2 = SampleMask
+ * vN+3 = SampleMaskIn (used for OpenGL smoothing)
+ *
+ * The alpha-ref SGPR is returned via its original location.
+ *
+ * The slots are packed: only color outputs that the shader declares take
+ * up four slots each, and Depth, Stencil and SampleMask are inserted only
+ * when the shader declares them. SampleMaskIn is always inserted last, at
+ * a slot no lower than PS_EPILOG_SAMPLEMASK_MIN_LOC past the first VGPR
+ * slot. The result is stored in ctx->return_value.
+ */
+static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
+{
+       struct si_shader_context *ctx = si_shader_context(bld_base);
+       struct si_shader *shader = ctx->shader;
+       struct lp_build_context *base = &bld_base->base;
+       struct tgsi_shader_info *info = &shader->selector->info;
+       LLVMBuilderRef builder = base->gallivm->builder;
+       unsigned i, j, first_vgpr, vgpr;
+
+       LLVMValueRef color[8][4] = {}; /* entries stay NULL for colors the shader does not output */
+       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+       LLVMValueRef ret;
+
+       /* Read the output values. */
+       for (i = 0; i < info->num_outputs; i++) {
+               unsigned semantic_name = info->output_semantic_name[i];
+               unsigned semantic_index = info->output_semantic_index[i];
+
+               switch (semantic_name) {
+               case TGSI_SEMANTIC_COLOR:
+                       assert(semantic_index < 8);
+                       for (j = 0; j < 4; j++) {
+                               LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
+                               LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+                               color[semantic_index][j] = result;
+                       }
+                       break;
+               case TGSI_SEMANTIC_POSITION: /* depth is taken from channel 2 */
+                       depth = LLVMBuildLoad(builder,
+                                             ctx->radeon_bld.soa.outputs[i][2], "");
+                       break;
+               case TGSI_SEMANTIC_STENCIL: /* stencil is taken from channel 1 */
+                       stencil = LLVMBuildLoad(builder,
+                                               ctx->radeon_bld.soa.outputs[i][1], "");
+                       break;
+               case TGSI_SEMANTIC_SAMPLEMASK: /* sample mask is taken from channel 0 */
+                       samplemask = LLVMBuildLoad(builder,
+                                                  ctx->radeon_bld.soa.outputs[i][0], "");
+                       break;
+               default:
+                       fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
+                               semantic_name);
+               }
+       }
+
+       /* Fill the return structure. */
+       ret = ctx->return_value;
+
+       /* Set SGPRs. */
+       ret = LLVMBuildInsertValue(builder, ret,
+                                  bitcast(bld_base, TGSI_TYPE_SIGNED,
+                                          LLVMGetParam(ctx->radeon_bld.main_fn,
+                                                       SI_PARAM_ALPHA_REF)),
+                                  SI_SGPR_ALPHA_REF, "");
+
+       /* Set VGPRs */
+       first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; /* VGPR slots start right after the alpha-ref slot */
+       for (i = 0; i < ARRAY_SIZE(color); i++) {
+               if (!color[i][0]) /* this color is not a shader output: no slots used */
+                       continue;
+
+               for (j = 0; j < 4; j++)
+                       ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
+       }
+       if (depth)
+               ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
+       if (stencil)
+               ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
+       if (samplemask)
+               ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
+
+       /* Add the input sample mask for smoothing at the end. */
+       if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
+               vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; /* skip ahead so it lands at the minimum location */
+       ret = LLVMBuildInsertValue(builder, ret,
+                                  LLVMGetParam(ctx->radeon_bld.main_fn,
+                                               SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
+
+       ctx->return_value = ret;
+}
+
 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
                                struct lp_build_tgsi_context *bld_base,
                                struct lp_build_emit_data *emit_data);
@@ -2536,13 +2734,12 @@ static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
 /**
  * Load an image view, fmask view, or sampler state descriptor.
  */
-static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
-                                    LLVMValueRef index, enum desc_type type)
+static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
+                                           LLVMValueRef list, LLVMValueRef index,
+                                           enum desc_type type)
 {
        struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
        LLVMBuilderRef builder = gallivm->builder;
-       LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
-                                       SI_PARAM_SAMPLERS);
 
        switch (type) {
        case DESC_IMAGE:
@@ -2558,12 +2755,21 @@ static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
                /* The sampler state is at [12:15]. */
                index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
                index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
-               ptr = LLVMBuildPointerCast(builder, ptr,
-                                          const_array(ctx->v4i32, 0), "");
+               list = LLVMBuildPointerCast(builder, list,
+                                           const_array(ctx->v4i32, 0), "");
                break;
        }
 
-       return build_indexed_load_const(ctx, ptr, index);
+       return build_indexed_load_const(ctx, list, index);
+}
+
+static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
+                                    LLVMValueRef index, enum desc_type type)
+{      /* Wrapper around get_sampler_desc_custom() that uses the list passed
+        * in the SI_PARAM_SAMPLERS parameter of the main function. */
+       LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
+                                        SI_PARAM_SAMPLERS);
+
+       return get_sampler_desc_custom(ctx, list, index, type);
+}
 
 static void tex_fetch_ptrs(
@@ -3546,6 +3752,30 @@ static const struct lp_build_tgsi_action interp_action = {
        .emit = build_interp_intrinsic,
 };
 
+static void si_create_function(struct si_shader_context *ctx,
+                              LLVMTypeRef *returns, unsigned num_returns,
+                              LLVMTypeRef *params, unsigned num_params,
+                              int last_array_pointer, int last_sgpr)
+{      /* Create the main LLVM function with the given return and parameter
+        * types, set its shader type from ctx->type, reset ctx->return_value
+        * to undef, and put an attribute on parameters 0..last_sgpr:
+        * byval up to last_array_pointer, inreg for the rest. */
+       int i;
+
+       radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
+                               params, num_params);
+       radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
+       ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type); /* later filled in via insertvalue */
+
+       for (i = 0; i <= last_sgpr; ++i) { /* parameters after last_sgpr get no attribute */
+               LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
+
+               /* We tell LLVM that array inputs are passed by value to allow the
+                * Sinking pass to move loads. Inputs are constant so this is fine. */
+               if (i <= last_array_pointer)
+                       LLVMAddAttribute(P, LLVMByValAttribute);
+               else
+                       LLVMAddAttribute(P, LLVMInRegAttribute);
+       }
+}
+
 static void create_meta_data(struct si_shader_context *ctx)
 {
        struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
@@ -3579,15 +3809,57 @@ static void declare_streamout_params(struct si_shader_context *ctx,
        }
 }
 
+static unsigned llvm_get_type_size(LLVMTypeRef type)
+{      /* Return the size of an LLVM type in bytes. Only integer, float,
+        * pointer and vector kinds are handled; vectors are sized
+        * recursively from their element type. */
+       LLVMTypeKind kind = LLVMGetTypeKind(type);
+
+       switch (kind) {
+       case LLVMIntegerTypeKind:
+               return LLVMGetIntTypeWidth(type) / 8; /* bit width -> bytes (integer division) */
+       case LLVMFloatTypeKind:
+               return 4;
+       case LLVMPointerTypeKind:
+               return 8; /* every pointer is counted as 8 bytes */
+       case LLVMVectorTypeKind:
+               return LLVMGetVectorSize(type) *
+                      llvm_get_type_size(LLVMGetElementType(type));
+       default:
+               assert(0); /* unsupported type kind */
+               return 0; /* reached only when assertions are compiled out */
+       }
+}
+
+static void declare_tess_lds(struct si_shader_context *ctx)
+{      /* Add the "tess_lds" global, an i32 array in LOCAL_ADDR_SPACE, to the
+        * module and store it in ctx->lds. */
+       struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+       LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
+
+       /* This is the upper bound, maximum is 32 inputs times 32 vertices */
+       unsigned vertex_data_dw_size = 32*32*4; /* 4096 dwords */
+       unsigned patch_data_dw_size = 32*4; /* 128 dwords */
+       /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
+       unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size; /* 8320 dwords */
+       unsigned lds_dwords = patch_dw_size;
+
+       /* The actual size is computed outside of the shader to reduce
+        * the number of shader variants. */
+       ctx->lds =
+               LLVMAddGlobalInAddressSpace(gallivm->module,
+                                           LLVMArrayType(i32, lds_dwords),
+                                           "tess_lds",
+                                           LOCAL_ADDR_SPACE);
+}
+
 static void create_function(struct si_shader_context *ctx)
 {
        struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
        struct gallivm_state *gallivm = bld_base->base.gallivm;
        struct si_shader *shader = ctx->shader;
-       LLVMTypeRef params[SI_NUM_PARAMS], v2i32, v3i32;
-       unsigned i, last_array_pointer, last_sgpr, num_params;
+       LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
+       LLVMTypeRef returns[16+32*4];
+       unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
+       unsigned num_returns = 0;
 
-       v2i32 = LLVMVectorType(ctx->i32, 2);
        v3i32 = LLVMVectorType(ctx->i32, 3);
 
        params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
@@ -3630,6 +3902,20 @@ static void create_function(struct si_shader_context *ctx)
                params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
                params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
                params[ctx->param_instance_id = num_params++] = ctx->i32;
+
+               if (!ctx->is_monolithic &&
+                   !ctx->is_gs_copy_shader) {
+                       /* Vertex load indices. */
+                       ctx->param_vertex_index0 = num_params;
+
+                       for (i = 0; i < shader->selector->info.num_inputs; i++)
+                               params[num_params++] = ctx->i32;
+
+                       /* PrimitiveID output. */
+                       if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
+                               for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
+                                       returns[num_returns++] = ctx->f32;
+               }
                break;
 
        case TGSI_PROCESSOR_TESS_CTRL:
@@ -3643,6 +3929,15 @@ static void create_function(struct si_shader_context *ctx)
                params[SI_PARAM_PATCH_ID] = ctx->i32;
                params[SI_PARAM_REL_IDS] = ctx->i32;
                num_params = SI_PARAM_REL_IDS+1;
+
+               if (!ctx->is_monolithic) {
+                       /* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */
+                       for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++)
+                               returns[num_returns++] = ctx->i32; /* SGPRs */
+
+                       for (i = 0; i < 3; i++)
+                               returns[num_returns++] = ctx->f32; /* VGPRs */
+               }
                break;
 
        case TGSI_PROCESSOR_TESS_EVAL:
@@ -3663,6 +3958,11 @@ static void create_function(struct si_shader_context *ctx)
                params[ctx->param_tes_v = num_params++] = ctx->f32;
                params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
                params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
+
+               /* PrimitiveID output. */
+               if (!ctx->is_monolithic && !shader->key.tes.as_es)
+                       for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
+                               returns[num_returns++] = ctx->f32;
                break;
 
        case TGSI_PROCESSOR_GEOMETRY:
@@ -3686,13 +3986,13 @@ static void create_function(struct si_shader_context *ctx)
                params[SI_PARAM_ALPHA_REF] = ctx->f32;
                params[SI_PARAM_PRIM_MASK] = ctx->i32;
                last_sgpr = SI_PARAM_PRIM_MASK;
-               params[SI_PARAM_PERSP_SAMPLE] = v2i32;
-               params[SI_PARAM_PERSP_CENTER] = v2i32;
-               params[SI_PARAM_PERSP_CENTROID] = v2i32;
+               params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
+               params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
+               params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
                params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
-               params[SI_PARAM_LINEAR_SAMPLE] = v2i32;
-               params[SI_PARAM_LINEAR_CENTER] = v2i32;
-               params[SI_PARAM_LINEAR_CENTROID] = v2i32;
+               params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
+               params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
+               params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
                params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
                params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
                params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
@@ -3701,8 +4001,39 @@ static void create_function(struct si_shader_context *ctx)
                params[SI_PARAM_FRONT_FACE] = ctx->i32;
                params[SI_PARAM_ANCILLARY] = ctx->i32;
                params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
-               params[SI_PARAM_POS_FIXED_PT] = ctx->f32;
+               params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
                num_params = SI_PARAM_POS_FIXED_PT+1;
+
+               if (!ctx->is_monolithic) {
+                       /* Color inputs from the prolog. */
+                       if (shader->selector->info.colors_read) {
+                               unsigned num_color_elements =
+                                       util_bitcount(shader->selector->info.colors_read);
+
+                               assert(num_params + num_color_elements <= ARRAY_SIZE(params));
+                               for (i = 0; i < num_color_elements; i++)
+                                       params[num_params++] = ctx->f32;
+                       }
+
+                       /* Outputs for the epilog. */
+                       num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
+                       num_returns =
+                               num_return_sgprs +
+                               util_bitcount(shader->selector->info.colors_written) * 4 +
+                               shader->selector->info.writes_z +
+                               shader->selector->info.writes_stencil +
+                               shader->selector->info.writes_samplemask +
+                               1 /* SampleMaskIn */;
+
+                       num_returns = MAX2(num_returns,
+                                          num_return_sgprs +
+                                          PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+                       for (i = 0; i < num_return_sgprs; i++)
+                               returns[i] = ctx->i32;
+                       for (; i < num_returns; i++)
+                               returns[i] = ctx->f32;
+               }
                break;
 
        default:
@@ -3711,20 +4042,38 @@ static void create_function(struct si_shader_context *ctx)
        }
 
        assert(num_params <= Elements(params));
-       radeon_llvm_create_func(&ctx->radeon_bld, params, num_params);
-       radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
-
-       for (i = 0; i <= last_sgpr; ++i) {
-               LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
 
-               /* We tell llvm that array inputs are passed by value to allow Sinking pass
-                * to move load. Inputs are constant so this is fine. */
-               if (i <= last_array_pointer)
-                       LLVMAddAttribute(P, LLVMByValAttribute);
-               else
-                       LLVMAddAttribute(P, LLVMInRegAttribute);
+       si_create_function(ctx, returns, num_returns, params,
+                          num_params, last_array_pointer, last_sgpr);
+
+       /* Reserve register locations for VGPR inputs the PS prolog may need. */
+       if (ctx->type == TGSI_PROCESSOR_FRAGMENT &&
+           !ctx->is_monolithic) {
+               radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
+                                         "InitialPSInputAddr",
+                                         S_0286D0_PERSP_SAMPLE_ENA(1) |
+                                         S_0286D0_PERSP_CENTER_ENA(1) |
+                                         S_0286D0_PERSP_CENTROID_ENA(1) |
+                                         S_0286D0_LINEAR_SAMPLE_ENA(1) |
+                                         S_0286D0_LINEAR_CENTER_ENA(1) |
+                                         S_0286D0_LINEAR_CENTROID_ENA(1) |
+                                         S_0286D0_FRONT_FACE_ENA(1) |
+                                         S_0286D0_POS_FIXED_PT_ENA(1));
        }
 
+       shader->info.num_input_sgprs = 0;
+       shader->info.num_input_vgprs = 0;
+
+       for (i = 0; i <= last_sgpr; ++i)
+               shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
+
+       /* Unused fragment shader inputs are eliminated by the compiler,
+        * so we don't know yet how many there will be.
+        */
+       if (ctx->type != TGSI_PROCESSOR_FRAGMENT)
+               for (; i < num_params; ++i)
+                       shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
+
        if (bld_base->info &&
            (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
             bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
@@ -3740,22 +4089,8 @@ static void create_function(struct si_shader_context *ctx)
 
        if ((ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) ||
            ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
-           ctx->type == TGSI_PROCESSOR_TESS_EVAL) {
-               /* This is the upper bound, maximum is 32 inputs times 32 vertices */
-               unsigned vertex_data_dw_size = 32*32*4;
-               unsigned patch_data_dw_size = 32*4;
-               /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
-               unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
-               unsigned lds_dwords = patch_dw_size;
-
-               /* The actual size is computed outside of the shader to reduce
-                * the number of shader variants. */
-               ctx->lds =
-                       LLVMAddGlobalInAddressSpace(gallivm->module,
-                                                   LLVMArrayType(ctx->i32, lds_dwords),
-                                                   "tess_lds",
-                                                   LOCAL_ADDR_SPACE);
-       }
+           ctx->type == TGSI_PROCESSOR_TESS_EVAL)
+               declare_tess_lds(ctx);
 }
 
 static void preload_constants(struct si_shader_context *ctx)
@@ -3887,6 +4222,49 @@ static void preload_ring_buffers(struct si_shader_context *ctx)
        }
 }
 
+/* Emit instructions that fetch one texel of the polygon stipple pattern at
+ * the current fragment position and feed it to llvm.AMDGPU.kill.
+ *
+ * \param ctx                  shader context the instructions are built in
+ * \param param_sampler_views  value passed to get_sampler_desc_custom() to
+ *                             load the SI_POLY_STIPPLE_SAMPLER image
+ *                             descriptor
+ * \param param_pos_fixed_pt   index of the function parameter holding the
+ *                             fixed-point fragment position
+ */
+static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
+                                        LLVMValueRef param_sampler_views,
+                                        unsigned param_pos_fixed_pt)
+{
+       struct lp_build_tgsi_context *bld_base =
+               &ctx->radeon_bld.soa.bld_base;
+       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct lp_build_emit_data result = {};
+       struct tgsi_full_instruction inst = {};
+       LLVMValueRef desc, sampler_index, address[2], pix;
+
+       /* Use the fixed-point gl_FragCoord input.
+        * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
+        * per coordinate to get the repeating effect.
+        * NOTE(review): presumably X starts at bit 0 and Y at bit 16 of the
+        * packed parameter - confirm against unpack_param() and the input
+        * layout.
+        */
+       address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
+       address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
+
+       /* Load the sampler view descriptor. */
+       sampler_index = lp_build_const_int32(gallivm, SI_POLY_STIPPLE_SAMPLER);
+       desc = get_sampler_desc_custom(ctx, param_sampler_views,
+                                      sampler_index, DESC_IMAGE);
+
+       /* Load the texel. A synthetic TXF instruction is filled in locally
+        * because the fetch helpers take their opcode and texture target
+        * from an instruction/emit_data pair. */
+       inst.Instruction.Opcode = TGSI_OPCODE_TXF;
+       inst.Texture.Texture = TGSI_TEXTURE_2D_MSAA; /* = use load, not load_mip */
+       result.inst = &inst;
+       set_tex_fetch_args(ctx, &result, TGSI_OPCODE_TXF,
+                          inst.Texture.Texture,
+                          desc, NULL, address, ARRAY_SIZE(address), 0xf);
+       build_tex_intrinsic(&tex_action, bld_base, &result);
+
+       /* Kill the thread accordingly: take element 3 of the fetched texel,
+        * reinterpret it as float and negate it before passing it on.
+        * NOTE(review): presumably llvm.AMDGPU.kill discards on a negative
+        * operand, so a positive texel value kills the fragment - confirm. */
+       pix = LLVMBuildExtractElement(gallivm->builder, result.output[0],
+                                     lp_build_const_int32(gallivm, 3), "");
+       pix = bitcast(bld_base, TGSI_TYPE_FLOAT, pix);
+       pix = LLVMBuildFNeg(gallivm->builder, pix, "");
+
+       lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
+                          LLVMVoidTypeInContext(gallivm->context),
+                          &pix, 1, 0);
+}
+
 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
                                  struct si_shader_config *conf,
                                  unsigned symbol_offset)
@@ -3972,41 +4350,70 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
        }
 }
 
+/* Return the combined code size, in bytes, of the shader's main binary and
+ * of its prolog and epilog binaries when they are present.
+ *
+ * Only code_size is summed; rodata_size is not part of the result.
+ */
+static unsigned si_get_shader_binary_size(struct si_shader *shader)
+{
+       unsigned size = shader->binary.code_size;
+
+       /* The prolog and epilog are optional parts. */
+       if (shader->prolog)
+               size += shader->prolog->binary.code_size;
+       if (shader->epilog)
+               size += shader->epilog->binary.code_size;
+       return size;
+}
+
+/* Upload the shader's machine code into a newly created buffer object.
+ *
+ * Any buffer previously held in shader->bo is released first.  The new
+ * buffer receives, back to back, the prolog code (if any), the main code,
+ * and then either the epilog code or, when there is no epilog, the main
+ * binary's rodata.
+ *
+ * Returns 0 on success, or -ENOMEM if the buffer cannot be created or
+ * mapped; when mapping fails shader->bo is released again so that no
+ * unfilled buffer is left behind.
+ */
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 {
-       const struct radeon_shader_binary *binary = &shader->binary;
-       unsigned code_size = binary->code_size + binary->rodata_size;
+       const struct radeon_shader_binary *prolog =
+               shader->prolog ? &shader->prolog->binary : NULL;
+       const struct radeon_shader_binary *epilog =
+               shader->epilog ? &shader->epilog->binary : NULL;
+       const struct radeon_shader_binary *mainb = &shader->binary;
+       unsigned bo_size = si_get_shader_binary_size(shader) +
+                          (!epilog ? mainb->rodata_size : 0);
        unsigned char *ptr;
 
+       /* rodata is only supported when the main binary stands alone,
+        * i.e. there is neither a prolog nor an epilog. */
+       assert(!prolog || !prolog->rodata_size);
+       assert((!prolog && !epilog) || !mainb->rodata_size);
+       assert(!epilog || !epilog->rodata_size);
+
        r600_resource_reference(&shader->bo, NULL);
        shader->bo = si_resource_create_custom(&sscreen->b.b,
                                               PIPE_USAGE_IMMUTABLE,
-                                              code_size);
+                                              bo_size);
        if (!shader->bo)
                return -ENOMEM;
 
+       /* Upload. */
        ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
                                        PIPE_TRANSFER_READ_WRITE);
-       util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
-       if (binary->rodata_size > 0) {
-               ptr += binary->code_size;
-               util_memcpy_cpu_to_le32(ptr, binary->rodata,
-                                       binary->rodata_size);
+       /* Mapping can fail; never copy through a NULL pointer. */
+       if (!ptr) {
+               r600_resource_reference(&shader->bo, NULL);
+               return -ENOMEM;
+       }
+
+       if (prolog) {
+               util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
+               ptr += prolog->code_size;
        }
 
+       util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
+       ptr += mainb->code_size;
+
+       /* The slot after the main code holds either the epilog or the
+        * main binary's rodata, never both (see the asserts above). */
+       if (epilog)
+               util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
+       else if (mainb->rodata_size > 0)
+               util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
+
        sscreen->b.ws->buffer_unmap(shader->bo->buf);
        return 0;
 }
 
 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
-                                      struct pipe_debug_callback *debug)
+                                      struct pipe_debug_callback *debug,
+                                      const char *name, FILE *file)
 {
        char *line, *p;
        unsigned i, count;
 
        if (binary->disasm_string) {
-               fprintf(stderr, "\nShader Disassembly:\n\n");
-               fprintf(stderr, "%s\n", binary->disasm_string);
+               fprintf(file, "Shader %s disassembly:\n", name);
+               fprintf(file, "%s", binary->disasm_string);
 
                if (debug && debug->debug_message) {
                        /* Very long debug messages are cut off, so send the
@@ -4036,9 +4443,9 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
                                           "Shader Disassembly End");
                }
        } else {
-               fprintf(stderr, "SI CODE:\n");
+               fprintf(file, "Shader %s binary:\n", name);
                for (i = 0; i < binary->code_size; i += 4) {
-                       fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i,
+                       fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
                                binary->code[i + 3], binary->code[i + 2],
                                binary->code[i + 1], binary->code[i]);
                }
@@ -4050,7 +4457,8 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
                                 unsigned num_inputs,
                                 unsigned code_size,
                                 struct pipe_debug_callback *debug,
-                                unsigned processor)
+                                unsigned processor,
+                                FILE *file)
 {
        unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
        unsigned lds_per_wave = 0;
@@ -4086,15 +4494,16 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
        if (lds_per_wave)
                max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
 
-       if (r600_can_dump_shader(&sscreen->b, processor)) {
+       if (file != stderr ||
+           r600_can_dump_shader(&sscreen->b, processor)) {
                if (processor == TGSI_PROCESSOR_FRAGMENT) {
-                       fprintf(stderr, "*** SHADER CONFIG ***\n"
+                       fprintf(file, "*** SHADER CONFIG ***\n"
                                "SPI_PS_INPUT_ADDR = 0x%04x\n"
                                "SPI_PS_INPUT_ENA  = 0x%04x\n",
                                conf->spi_ps_input_addr, conf->spi_ps_input_ena);
                }
 
-               fprintf(stderr, "*** SHADER STATS ***\n"
+               fprintf(file, "*** SHADER STATS ***\n"
                        "SGPRS: %d\n"
                        "VGPRS: %d\n"
                        "Code Size: %d bytes\n"
@@ -4115,16 +4524,63 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
                           max_simd_waves);
 }
 
+/* Return a human-readable, statically allocated name for a shader, used as
+ * the heading of a shader dump.
+ *
+ * \param shader     shader whose key (and gs_copy_shader pointer) selects
+ *                   the variant name
+ * \param processor  TGSI_PROCESSOR_* value selecting the stage
+ *
+ * Unrecognized processor values yield "Unknown Shader".
+ */
+static const char *si_get_shader_name(struct si_shader *shader,
+                                     unsigned processor)
+{
+       switch (processor) {
+       case TGSI_PROCESSOR_VERTEX:
+               if (shader->key.vs.as_es)
+                       return "Vertex Shader as ES";
+               else if (shader->key.vs.as_ls)
+                       return "Vertex Shader as LS";
+               else
+                       return "Vertex Shader as VS";
+       case TGSI_PROCESSOR_TESS_CTRL:
+               return "Tessellation Control Shader";
+       case TGSI_PROCESSOR_TESS_EVAL:
+               if (shader->key.tes.as_es)
+                       return "Tessellation Evaluation Shader as ES";
+               else
+                       return "Tessellation Evaluation Shader as VS";
+       case TGSI_PROCESSOR_GEOMETRY:
+               /* NOTE(review): a shader without a gs_copy_shader is named
+                * as the copy shader itself. Presumably this is because the
+                * copy shader is dumped with TGSI_PROCESSOR_GEOMETRY and
+                * has no copy shader of its own - confirm. */
+               if (shader->gs_copy_shader == NULL)
+                       return "GS Copy Shader as VS";
+               else
+                       return "Geometry Shader";
+       case TGSI_PROCESSOR_FRAGMENT:
+               return "Pixel Shader";
+       case TGSI_PROCESSOR_COMPUTE:
+               return "Compute Shader";
+       default:
+               return "Unknown Shader";
+       }
+}
+
+/* Print a shader's disassembly and statistics.
+ *
+ * The shader name followed by the prolog, main and epilog disassembly is
+ * written to \p file when \p file is not stderr, or when dumping is
+ * enabled for \p processor and DBG_NO_ASM is not set.  si_shader_dump_stats()
+ * is then called unconditionally with the combined binary size of all
+ * parts.
+ *
+ * \param sscreen    screen providing the debug flags
+ * \param shader     shader to dump; prolog and epilog are optional
+ * \param debug      debug callback forwarded to the helpers
+ * \param processor  TGSI_PROCESSOR_* value of the shader
+ * \param file       destination stream
+ */
 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
-                   struct pipe_debug_callback *debug, unsigned processor)
+                   struct pipe_debug_callback *debug, unsigned processor,
+                   FILE *file)
 {
-       if (r600_can_dump_shader(&sscreen->b, processor))
-               if (!(sscreen->b.debug_flags & DBG_NO_ASM))
-                       si_shader_dump_disassembly(&shader->binary, debug);
+       /* A destination other than stderr is always written to; output to
+        * stderr is gated by the debug flags. */
+       if (file != stderr ||
+           (r600_can_dump_shader(&sscreen->b, processor) &&
+            !(sscreen->b.debug_flags & DBG_NO_ASM))) {
+               fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
+
+               if (shader->prolog)
+                       si_shader_dump_disassembly(&shader->prolog->binary,
+                                                  debug, "prolog", file);
+
+               si_shader_dump_disassembly(&shader->binary, debug, "main", file);
+
+               if (shader->epilog)
+                       si_shader_dump_disassembly(&shader->epilog->binary,
+                                                  debug, "epilog", file);
+               fprintf(file, "\n");
+       }
 
        si_shader_dump_stats(sscreen, &shader->config,
                             shader->selector ? shader->selector->info.num_inputs : 0,
-                            shader->binary.code_size, debug, processor);
+                            si_get_shader_binary_size(shader), debug, processor,
+                            file);
 }
 
 int si_compile_llvm(struct si_screen *sscreen,
@@ -4177,6 +4633,19 @@ int si_compile_llvm(struct si_screen *sscreen,
        FREE(binary->global_symbol_offsets);
        binary->config = NULL;
        binary->global_symbol_offsets = NULL;
+
+       /* Some shaders can't have rodata because their binaries can be
+        * concatenated.
+        */
+       if (binary->rodata_size &&
+           (processor == TGSI_PROCESSOR_VERTEX ||
+            processor == TGSI_PROCESSOR_TESS_CTRL ||
+            processor == TGSI_PROCESSOR_TESS_EVAL ||
+            processor == TGSI_PROCESSOR_FRAGMENT)) {
+               fprintf(stderr, "radeonsi: The shader can't have rodata.");
+               return -EINVAL;
+       }
+
        return r;
 }
 
@@ -4196,7 +4665,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 
        outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
 
-       si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm, gsinfo);
+       si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
        ctx->type = TGSI_PROCESSOR_VERTEX;
        ctx->is_gs_copy_shader = true;
 
@@ -4241,7 +4710,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 
        si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
 
-       LLVMBuildRetVoid(bld_base->base.gallivm->builder);
+       LLVMBuildRet(gallivm->builder, ctx->return_value);
 
        /* Dump LLVM IR before any optimization passes */
        if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
@@ -4259,7 +4728,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
                if (r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY))
                        fprintf(stderr, "GS Copy Shader:\n");
                si_shader_dump(sscreen, ctx->shader, debug,
-                              TGSI_PROCESSOR_GEOMETRY);
+                              TGSI_PROCESSOR_GEOMETRY, stderr);
                r = si_shader_binary_upload(sscreen, ctx->shader);
        }
 
@@ -4278,35 +4747,38 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
        switch (shader) {
        case PIPE_SHADER_VERTEX:
                fprintf(f, "  instance_divisors = {");
-               for (i = 0; i < Elements(key->vs.instance_divisors); i++)
+               for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++)
                        fprintf(f, !i ? "%u" : ", %u",
-                               key->vs.instance_divisors[i]);
+                               key->vs.prolog.instance_divisors[i]);
                fprintf(f, "}\n");
                fprintf(f, "  as_es = %u\n", key->vs.as_es);
                fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
-               fprintf(f, "  export_prim_id = %u\n", key->vs.export_prim_id);
+               fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
                break;
 
        case PIPE_SHADER_TESS_CTRL:
-               fprintf(f, "  prim_mode = %u\n", key->tcs.prim_mode);
+               fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
                break;
 
        case PIPE_SHADER_TESS_EVAL:
                fprintf(f, "  as_es = %u\n", key->tes.as_es);
-               fprintf(f, "  export_prim_id = %u\n", key->tes.export_prim_id);
+               fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
                break;
 
        case PIPE_SHADER_GEOMETRY:
                break;
 
        case PIPE_SHADER_FRAGMENT:
-               fprintf(f, "  spi_shader_col_format = 0x%x\n", key->ps.spi_shader_col_format);
-               fprintf(f, "  last_cbuf = %u\n", key->ps.last_cbuf);
-               fprintf(f, "  color_two_side = %u\n", key->ps.color_two_side);
-               fprintf(f, "  alpha_func = %u\n", key->ps.alpha_func);
-               fprintf(f, "  alpha_to_one = %u\n", key->ps.alpha_to_one);
-               fprintf(f, "  poly_stipple = %u\n", key->ps.poly_stipple);
-               fprintf(f, "  clamp_color = %u\n", key->ps.clamp_color);
+               fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
+               fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
+               fprintf(f, "  prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp);
+               fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
+               fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
+               fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
+               fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
+               fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
+               fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
+               fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
                break;
 
        default:
@@ -4317,13 +4789,12 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 static void si_init_shader_ctx(struct si_shader_context *ctx,
                               struct si_screen *sscreen,
                               struct si_shader *shader,
-                              LLVMTargetMachineRef tm,
-                              struct tgsi_shader_info *info)
+                              LLVMTargetMachineRef tm)
 {
        struct lp_build_tgsi_context *bld_base;
 
        memset(ctx, 0, sizeof(*ctx));
-       radeon_llvm_context_init(&ctx->radeon_bld);
+       radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
        ctx->tm = tm;
        ctx->screen = sscreen;
        if (shader && shader->selector)
@@ -4336,15 +4807,18 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
        ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
        ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
        ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
+       ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
        ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
        ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
        ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
+       ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
        ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
        ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
        ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
 
        bld_base = &ctx->radeon_bld.soa.bld_base;
-       bld_base->info = info;
+       if (shader && shader->selector)
+               bld_base->info = &shader->selector->info;
        bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
 
        bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
@@ -4380,40 +4854,31 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
        bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
 }
 
-int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-                    struct si_shader *shader,
-                    struct pipe_debug_callback *debug)
+int si_compile_tgsi_shader(struct si_screen *sscreen,
+                          LLVMTargetMachineRef tm,
+                          struct si_shader *shader,
+                          bool is_monolithic,
+                          struct pipe_debug_callback *debug)
 {
        struct si_shader_selector *sel = shader->selector;
-       struct tgsi_token *tokens = sel->tokens;
        struct si_shader_context ctx;
        struct lp_build_tgsi_context *bld_base;
-       struct tgsi_shader_info stipple_shader_info;
        LLVMModuleRef mod;
        int r = 0;
-       bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
-                           shader->key.ps.poly_stipple;
-
-       if (poly_stipple) {
-               tokens = util_pstipple_create_fragment_shader(tokens, NULL,
-                                               SI_POLY_STIPPLE_SAMPLER,
-                                               TGSI_FILE_SYSTEM_VALUE);
-               tgsi_scan_shader(tokens, &stipple_shader_info);
-       }
 
        /* Dump TGSI code before doing TGSI->LLVM conversion in case the
         * conversion fails. */
        if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
            !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
                si_dump_shader_key(sel->type, &shader->key, stderr);
-               tgsi_dump(tokens, 0);
+               tgsi_dump(sel->tokens, 0);
                si_dump_streamout(&sel->so);
        }
 
-       si_init_shader_ctx(&ctx, sscreen, shader, tm,
-                          poly_stipple ? &stipple_shader_info : &sel->info);
+       si_init_shader_ctx(&ctx, sscreen, shader, tm);
+       ctx.is_monolithic = is_monolithic;
 
-       shader->uses_instanceid = sel->info.uses_instanceid;
+       shader->info.uses_instanceid = sel->info.uses_instanceid;
 
        bld_base = &ctx.radeon_bld.soa.bld_base;
        ctx.radeon_bld.load_system_value = declare_system_value;
@@ -4447,7 +4912,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                break;
        case TGSI_PROCESSOR_FRAGMENT:
                ctx.radeon_bld.load_input = declare_input_fs;
-               bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
+               if (is_monolithic)
+                       bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
+               else
+                       bld_base->emit_epilogue = si_llvm_return_fs_outputs;
                break;
        default:
                assert(!"Unsupported shader type");
@@ -4461,6 +4929,14 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
        preload_streamout_buffers(&ctx);
        preload_ring_buffers(&ctx);
 
+       if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
+           shader->key.ps.prolog.poly_stipple) {
+               LLVMValueRef views = LLVMGetParam(ctx.radeon_bld.main_fn,
+                                                 SI_PARAM_SAMPLERS);
+               si_llvm_emit_polygon_stipple(&ctx, views,
+                                            SI_PARAM_POS_FIXED_PT);
+       }
+
        if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
                int i;
                for (i = 0; i < 4; i++) {
@@ -4470,12 +4946,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                }
        }
 
-       if (!lp_build_tgsi_llvm(bld_base, tokens)) {
+       if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
                fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
                goto out;
        }
 
-       LLVMBuildRetVoid(bld_base->base.gallivm->builder);
+       LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value);
        mod = bld_base->base.gallivm->module;
 
        /* Dump LLVM IR before any optimization passes */
@@ -4492,16 +4968,49 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                goto out;
        }
 
-       si_shader_dump(sscreen, shader, debug, ctx.type);
+       radeon_llvm_dispose(&ctx.radeon_bld);
 
-       r = si_shader_binary_upload(sscreen, shader);
-       if (r) {
-               fprintf(stderr, "LLVM failed to upload shader\n");
-               goto out;
+       /* Calculate the number of fragment input VGPRs. */
+       if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
+               shader->info.num_input_vgprs = 0;
+               shader->info.face_vgpr_index = -1;
+
+               if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 2;
+               if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 2;
+               if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 2;
+               if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 3;
+               if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 2;
+               if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 2;
+               if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 2;
+               if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 1;
+               if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 1;
+               if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 1;
+               if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 1;
+               if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 1;
+               if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
+                       shader->info.face_vgpr_index = shader->info.num_input_vgprs;
+                       shader->info.num_input_vgprs += 1;
+               }
+               if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 1;
+               if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 1;
+               if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
+                       shader->info.num_input_vgprs += 1;
        }
 
-       radeon_llvm_dispose(&ctx.radeon_bld);
-
        if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
                shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
                shader->gs_copy_shader->selector = shader->selector;
@@ -4517,11 +5026,969 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 out:
        for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
                FREE(ctx.constants[i]);
-       if (poly_stipple)
-               tgsi_free_tokens(tokens);
        return r;
 }
 
+/**
+ * Create, compile and return a shader part (prolog or epilog).
+ *
+ * The list is searched for a part whose key is bytewise equal to \p key.
+ * If none is found, a new part is allocated, compiled with \p compile and
+ * pushed onto the head of the list. The whole lookup-or-create sequence
+ * runs with sscreen->shader_parts_mutex held.
+ *
+ * \param sscreen      screen
+ * \param list         list of shader parts of the same category
+ * \param key          shader part key (compared with memcmp, so callers
+ *                     must zero it before filling it in)
+ * \param tm           LLVM target machine
+ * \param debug        debug callback
+ * \param compile      the callback responsible for compilation
+ * \return             non-NULL on success, NULL if the allocation or the
+ *                     compilation failed
+ */
+static struct si_shader_part *
+si_get_shader_part(struct si_screen *sscreen,
+                  struct si_shader_part **list,
+                  union si_shader_part_key *key,
+                  LLVMTargetMachineRef tm,
+                  struct pipe_debug_callback *debug,
+                  bool (*compile)(struct si_screen *,
+                                  LLVMTargetMachineRef,
+                                  struct pipe_debug_callback *,
+                                  struct si_shader_part *))
+{
+       struct si_shader_part *result;
+
+       pipe_mutex_lock(sscreen->shader_parts_mutex);
+
+       /* Find existing. */
+       for (result = *list; result; result = result->next) {
+               if (memcmp(&result->key, key, sizeof(*key)) == 0)
+                       goto out;
+       }
+
+       /* Compile a new one. */
+       result = CALLOC_STRUCT(si_shader_part);
+       if (!result)
+               goto out; /* out of memory: report failure, don't crash */
+
+       result->key = *key;
+       if (!compile(sscreen, tm, debug, result)) {
+               FREE(result);
+               result = NULL;
+               goto out;
+       }
+
+       /* Publish the new part at the head of the list. */
+       result->next = *list;
+       *list = result;
+
+out:
+       pipe_mutex_unlock(sscreen->shader_parts_mutex);
+       return result;
+}
+
+/**
+ * Create a vertex shader prolog.
+ *
+ * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
+ * All inputs are returned unmodified. The vertex load indices are
+ * stored after them, which will be used by the API VS for fetching inputs.
+ *
+ * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
+ *   input_v0,
+ *   input_v1,
+ *   input_v2,
+ *   input_v3,
+ *   (VertexID + BaseVertex),
+ *   (InstanceID + StartInstance),
+ *   (InstanceID / 2 + StartInstance)
+ *
+ * \param sscreen      screen
+ * \param tm           LLVM target machine
+ * \param debug        debug callback
+ * \param out          shader part; out->key selects the variant and
+ *                     out->binary / out->config receive the result
+ * \return             true on success, false if si_compile_llvm failed
+ */
+static bool si_compile_vs_prolog(struct si_screen *sscreen,
+                                LLVMTargetMachineRef tm,
+                                struct pipe_debug_callback *debug,
+                                struct si_shader_part *out)
+{
+       union si_shader_part_key *key = &out->key;
+       struct si_shader shader = {};
+       struct si_shader_context ctx;
+       struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+       LLVMTypeRef *params, *returns;
+       LLVMValueRef ret, func;
+       int last_sgpr, num_params, num_returns, i;
+       bool status = true;
+
+       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+       ctx.type = TGSI_PROCESSOR_VERTEX;
+       /* The VGPRs follow the SGPRs: VertexID is the first one and
+        * InstanceID the fourth one.
+        */
+       ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
+       ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
+
+       /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
+       params = alloca((key->vs_prolog.num_input_sgprs + 4) *
+                       sizeof(LLVMTypeRef));
+       returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
+                         key->vs_prolog.last_input + 1) *
+                        sizeof(LLVMTypeRef));
+       num_params = 0;
+       num_returns = 0;
+
+       /* Declare input and output SGPRs. */
+       for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+               params[num_params++] = ctx.i32;
+               returns[num_returns++] = ctx.i32;
+       }
+       last_sgpr = num_params - 1;
+
+       /* 4 preloaded VGPRs (outputs must be floats) */
+       for (i = 0; i < 4; i++) {
+               params[num_params++] = ctx.i32;
+               returns[num_returns++] = ctx.f32;
+       }
+
+       /* Vertex load indices. */
+       for (i = 0; i <= key->vs_prolog.last_input; i++)
+               returns[num_returns++] = ctx.f32;
+
+       /* Create the function. */
+       si_create_function(&ctx, returns, num_returns, params,
+                          num_params, -1, last_sgpr);
+       func = ctx.radeon_bld.main_fn;
+
+       /* Copy inputs to outputs. This should be no-op, as the registers match,
+        * but it will prevent the compiler from overwriting them unintentionally.
+        */
+       ret = ctx.return_value;
+       for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+               LLVMValueRef p = LLVMGetParam(func, i);
+               ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+       }
+       /* The VGPR inputs are i32 but are returned as f32, hence the bitcast. */
+       for (i = num_params - 4; i < num_params; i++) {
+               LLVMValueRef p = LLVMGetParam(func, i);
+               p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
+               ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+       }
+
+       /* Compute vertex load indices from instance divisors. */
+       for (i = 0; i <= key->vs_prolog.last_input; i++) {
+               unsigned divisor = key->vs_prolog.states.instance_divisors[i];
+               LLVMValueRef index;
+
+               if (divisor) {
+                       /* InstanceID / Divisor + StartInstance */
+                       index = get_instance_index_for_fetch(&ctx.radeon_bld,
+                                                            SI_SGPR_START_INSTANCE,
+                                                            divisor);
+               } else {
+                       /* VertexID + BaseVertex */
+                       index = LLVMBuildAdd(gallivm->builder,
+                                            LLVMGetParam(func, ctx.param_vertex_id),
+                                            LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
+               }
+
+               /* From here on num_params is reused as the return-value
+                * slot: the load indices go right after the copied inputs.
+                */
+               index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
+               ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
+                                          num_params++, "");
+       }
+
+       /* Compile. A non-zero result from si_compile_llvm is a failure. */
+       LLVMBuildRet(gallivm->builder, ret);
+       radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+                           gallivm->module, debug, ctx.type,
+                           "Vertex Shader Prolog"))
+               status = false;
+
+       radeon_llvm_dispose(&ctx.radeon_bld);
+       return status;
+}
+
+/**
+ * Compile the vertex shader epilog. This is also used by the tessellation
+ * evaluation shader compiled as VS.
+ *
+ * The input is PrimitiveID.
+ *
+ * If PrimitiveID is required by the pixel shader, export it.
+ * Otherwise, do nothing.
+ *
+ * \param sscreen      screen
+ * \param tm           LLVM target machine
+ * \param debug        debug callback
+ * \param out          shader part; out->key selects the variant and
+ *                     out->binary / out->config receive the result
+ * \return             true on success, false if si_compile_llvm failed
+ */
+static bool si_compile_vs_epilog(struct si_screen *sscreen,
+                                LLVMTargetMachineRef tm,
+                                struct pipe_debug_callback *debug,
+                                struct si_shader_part *out)
+{
+       union si_shader_part_key *key = &out->key;
+       struct si_shader_context ctx;
+       struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+       struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+       LLVMTypeRef params[5];
+       int num_params, i;
+       bool status = true;
+
+       si_init_shader_ctx(&ctx, sscreen, NULL, tm);
+       ctx.type = TGSI_PROCESSOR_VERTEX;
+
+       /* Declare input VGPRs. Parameters are only declared when the
+        * PrimitiveID is exported: everything up to and including
+        * VS_EPILOG_PRIMID_LOC, all as f32.
+        */
+       num_params = key->vs_epilog.states.export_prim_id ?
+                          (VS_EPILOG_PRIMID_LOC + 1) : 0;
+       assert(num_params <= ARRAY_SIZE(params));
+
+       for (i = 0; i < num_params; i++)
+               params[i] = ctx.f32;
+
+       /* Create the function. */
+       si_create_function(&ctx, NULL, 0, params, num_params,
+                          -1, -1);
+
+       /* Emit exports. */
+       if (key->vs_epilog.states.export_prim_id) {
+               struct lp_build_context *base = &bld_base->base;
+               struct lp_build_context *uint = &bld_base->uint_bld;
+               LLVMValueRef args[9];
+
+               args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels; NOTE(review): 0x0 enables no channel although X carries the PrimitiveID - confirm this is intended */
+               args[1] = uint->zero; /* whether the EXEC mask is valid */
+               args[2] = uint->zero; /* DONE bit */
+               args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
+                                              key->vs_epilog.prim_id_param_offset);
+               args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
+               args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
+                                      VS_EPILOG_PRIMID_LOC); /* X */
+               args[6] = uint->undef; /* Y */
+               args[7] = uint->undef; /* Z */
+               args[8] = uint->undef; /* W */
+
+               lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+                                  LLVMVoidTypeInContext(base->gallivm->context),
+                                  args, 9, 0);
+       }
+
+       /* Compile. A non-zero result from si_compile_llvm is a failure. */
+       LLVMBuildRet(gallivm->builder, ctx.return_value);
+       radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+                           gallivm->module, debug, ctx.type,
+                           "Vertex Shader Epilog"))
+               status = false;
+
+       radeon_llvm_dispose(&ctx.radeon_bld);
+       return status;
+}
+
+/**
+ * Create & compile a vertex shader epilog. This is a helper used by VS and TES.
+ *
+ * \param sscreen      screen
+ * \param tm           LLVM target machine
+ * \param shader       shader receiving the epilog; when PrimitiveID is
+ *                     exported, its parameter export count and output
+ *                     offset table are updated as well
+ * \param debug        debug callback
+ * \param states       epilog state bits of the calling stage
+ *                     (key.vs.epilog for VS, key.tes.epilog for TES)
+ * \return             true if shader->epilog was set to a usable part
+ */
+static bool si_get_vs_epilog(struct si_screen *sscreen,
+                            LLVMTargetMachineRef tm,
+                            struct si_shader *shader,
+                            struct pipe_debug_callback *debug,
+                            struct si_vs_epilog_bits *states)
+{
+       union si_shader_part_key epilog_key;
+
+       /* Keys are compared with memcmp, so clear all bytes first. */
+       memset(&epilog_key, 0, sizeof(epilog_key));
+       epilog_key.vs_epilog.states = *states;
+
+       /* Set up the PrimitiveID output.
+        *
+        * Test the states passed by the caller, not shader->key.vs.epilog:
+        * TES callers pass key.tes.epilog, and the epilog itself is compiled
+        * from epilog_key.vs_epilog.states, so both must agree.
+        */
+       if (states->export_prim_id) {
+               unsigned index = shader->selector->info.num_outputs;
+               unsigned offset = shader->info.nr_param_exports++;
+
+               epilog_key.vs_epilog.prim_id_param_offset = offset;
+               assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
+               shader->info.vs_output_param_offset[index] = offset;
+       }
+
+       shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
+                                           &epilog_key, tm, debug,
+                                           si_compile_vs_epilog);
+       return shader->epilog != NULL;
+}
+
+/**
+ * Pick the prolog and epilog for a vertex shader, compiling them if they
+ * are not available yet.
+ *
+ * \param sscreen      screen
+ * \param tm           LLVM target machine
+ * \param shader       shader whose prolog/epilog pointers are filled in
+ * \param debug        debug callback
+ * \return             false if a required part could not be obtained
+ */
+static bool si_shader_select_vs_parts(struct si_screen *sscreen,
+                                     LLVMTargetMachineRef tm,
+                                     struct si_shader *shader,
+                                     struct pipe_debug_callback *debug)
+{
+       struct tgsi_shader_info *info = &shader->selector->info;
+       union si_shader_part_key key;
+       bool wants_epilog = !(shader->key.vs.as_es || shader->key.vs.as_ls);
+       unsigned input;
+
+       /* Build the prolog key. */
+       memset(&key, 0, sizeof(key));
+       key.vs_prolog.states = shader->key.vs.prolog;
+       key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+       key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
+
+       /* Without inputs the prolog would be a no-op, so none is attached. */
+       if (info->num_inputs) {
+               shader->prolog = si_get_shader_part(sscreen,
+                                                   &sscreen->vs_prologs,
+                                                   &key, tm, debug,
+                                                   si_compile_vs_prolog);
+               if (shader->prolog == NULL)
+                       return false;
+       }
+
+       /* Variants compiled as ES or LS get no epilog. */
+       if (wants_epilog) {
+               if (!si_get_vs_epilog(sscreen, tm, shader, debug,
+                                     &shader->key.vs.epilog))
+                       return false;
+       }
+
+       /* Any input with a non-zero instance divisor needs InstanceID. */
+       for (input = 0; input < info->num_inputs; input++) {
+               if (key.vs_prolog.states.instance_divisors[input]) {
+                       shader->info.uses_instanceid = true;
+                       break;
+               }
+       }
+
+       return true;
+}
+
+/**
+ * Pick (compiling if necessary) the parts of a tessellation evaluation
+ * shader. Only the epilog exists, and only when the TES is compiled as VS.
+ *
+ * \return true if nothing is needed or the epilog was obtained
+ */
+static bool si_shader_select_tes_parts(struct si_screen *sscreen,
+                                      LLVMTargetMachineRef tm,
+                                      struct si_shader *shader,
+                                      struct pipe_debug_callback *debug)
+{
+       /* A TES compiled as ES needs no parts; otherwise it is compiled
+        * as VS and shares the VS epilog.
+        */
+       return shader->key.tes.as_es ||
+              si_get_vs_epilog(sscreen, tm, shader, debug,
+                               &shader->key.tes.epilog);
+}
+
+/**
+ * Compile the TCS epilog. This writes tessellation factors to memory based on
+ * the output primitive type of the tessellator (determined by TES).
+ *
+ * \param sscreen      screen
+ * \param tm           LLVM target machine
+ * \param debug        debug callback
+ * \param out          shader part; out->key selects the variant and
+ *                     out->binary / out->config receive the result
+ * \return             true on success, false if si_compile_llvm failed
+ */
+static bool si_compile_tcs_epilog(struct si_screen *sscreen,
+                                 LLVMTargetMachineRef tm,
+                                 struct pipe_debug_callback *debug,
+                                 struct si_shader_part *out)
+{
+       union si_shader_part_key *key = &out->key;
+       struct si_shader shader = {};
+       struct si_shader_context ctx;
+       struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+       struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+       LLVMTypeRef params[16];
+       LLVMValueRef func;
+       int last_array_pointer, last_sgpr, num_params;
+       bool status = true;
+
+       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+       ctx.type = TGSI_PROCESSOR_TESS_CTRL;
+       shader.key.tcs.epilog = key->tcs_epilog.states;
+
+       /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used.
+        * The other SGPR slots are still declared, as plain i64/i32.
+        */
+       params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
+       last_array_pointer = SI_PARAM_RW_BUFFERS;
+       params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
+       params[SI_PARAM_SAMPLERS] = ctx.i64;
+       params[SI_PARAM_UNUSED] = ctx.i64;
+       params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
+       params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
+       params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
+       params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
+       last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
+       num_params = last_sgpr + 1;
+
+       params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
+       params[num_params++] = ctx.i32; /* invocation ID within the patch */
+       params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */
+
+       /* Create the function. The three parameters after the last SGPR
+        * are handed to si_write_tess_factors in declaration order.
+        */
+       si_create_function(&ctx, NULL, 0, params, num_params,
+                          last_array_pointer, last_sgpr);
+       declare_tess_lds(&ctx);
+       func = ctx.radeon_bld.main_fn;
+
+       si_write_tess_factors(bld_base,
+                             LLVMGetParam(func, last_sgpr + 1),
+                             LLVMGetParam(func, last_sgpr + 2),
+                             LLVMGetParam(func, last_sgpr + 3));
+
+       /* Compile. A non-zero result from si_compile_llvm is a failure. */
+       LLVMBuildRet(gallivm->builder, ctx.return_value);
+       radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+                           gallivm->module, debug, ctx.type,
+                           "Tessellation Control Shader Epilog"))
+               status = false;
+
+       radeon_llvm_dispose(&ctx.radeon_bld);
+       return status;
+}
+
+/**
+ * Pick (compiling if necessary) the parts of a tessellation control
+ * shader. Only an epilog exists for this stage.
+ *
+ * \return true if shader->epilog was set to a usable part
+ */
+static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
+                                      LLVMTargetMachineRef tm,
+                                      struct si_shader *shader,
+                                      struct pipe_debug_callback *debug)
+{
+       union si_shader_part_key key;
+       struct si_shader_part *part;
+
+       /* The key is compared bytewise, so clear it before filling it in. */
+       memset(&key, 0, sizeof(key));
+       key.tcs_epilog.states = shader->key.tcs.epilog;
+
+       part = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, &key,
+                                 tm, debug, si_compile_tcs_epilog);
+       shader->epilog = part;
+
+       return part != NULL;
+}
+
+/**
+ * Compile the pixel shader prolog. This handles:
+ * - two-side color selection and interpolation
+ * - overriding interpolation parameters for the API PS
+ * - polygon stippling
+ *
+ * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
+ * overridden by other states. (e.g. per-sample interpolation)
+ * Interpolated colors are stored after the preloaded VGPRs.
+ *
+ * \param sscreen      screen
+ * \param tm           LLVM target machine
+ * \param debug        debug callback
+ * \param out          shader part; out->key selects the variant and
+ *                     out->binary / out->config receive the result
+ * \return             true on success, false if si_compile_llvm failed
+ */
+static bool si_compile_ps_prolog(struct si_screen *sscreen,
+                                LLVMTargetMachineRef tm,
+                                struct pipe_debug_callback *debug,
+                                struct si_shader_part *out)
+{
+       union si_shader_part_key *key = &out->key;
+       struct si_shader shader = {};
+       struct si_shader_context ctx;
+       struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+       LLVMTypeRef *params;
+       LLVMValueRef ret, func;
+       int last_sgpr, num_params, num_returns, i, num_color_channels;
+       bool status = true;
+
+       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+       ctx.type = TGSI_PROCESSOR_FRAGMENT;
+       shader.key.ps.prolog = key->ps_prolog.states;
+
+       /* Number of inputs + 8 color elements. The same array is used for
+        * the parameter types and the return types; the color elements
+        * only appear in the return list.
+        */
+       params = alloca((key->ps_prolog.num_input_sgprs +
+                        key->ps_prolog.num_input_vgprs + 8) *
+                       sizeof(LLVMTypeRef));
+
+       /* Declare inputs. */
+       num_params = 0;
+       for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
+               params[num_params++] = ctx.i32;
+       last_sgpr = num_params - 1;
+
+       for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
+               params[num_params++] = ctx.f32;
+
+       /* Declare outputs (same as inputs + add colors if needed) */
+       num_returns = num_params;
+       num_color_channels = util_bitcount(key->ps_prolog.colors_read);
+       for (i = 0; i < num_color_channels; i++)
+               params[num_returns++] = ctx.f32;
+
+       /* Create the function. */
+       si_create_function(&ctx, params, num_returns, params,
+                          num_params, -1, last_sgpr);
+       func = ctx.radeon_bld.main_fn;
+
+       /* Copy inputs to outputs. This should be no-op, as the registers match,
+        * but it will prevent the compiler from overwriting them unintentionally.
+        */
+       ret = ctx.return_value;
+       for (i = 0; i < num_params; i++) {
+               LLVMValueRef p = LLVMGetParam(func, i);
+               ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+       }
+
+       /* Polygon stippling. */
+       if (key->ps_prolog.states.poly_stipple) {
+               /* POS_FIXED_PT is always last. */
+               unsigned pos = key->ps_prolog.num_input_sgprs +
+                              key->ps_prolog.num_input_vgprs - 1;
+               LLVMValueRef ptr[2], views;
+
+               /* Get the pointer to sampler views. The two 32-bit SGPRs
+                * are gathered, reinterpreted as one i64 and converted to
+                * a pointer.
+                */
+               ptr[0] = LLVMGetParam(func, SI_SGPR_SAMPLERS);
+               ptr[1] = LLVMGetParam(func, SI_SGPR_SAMPLERS+1);
+               views = lp_build_gather_values(gallivm, ptr, 2);
+               views = LLVMBuildBitCast(gallivm->builder, views, ctx.i64, "");
+               views = LLVMBuildIntToPtr(gallivm->builder, views,
+                                         const_array(ctx.v8i32, SI_NUM_SAMPLERS), "");
+
+               si_llvm_emit_polygon_stipple(&ctx, views, pos);
+       }
+
+       /* Interpolate colors. colors_read holds a 4-bit channel mask per
+        * color (i = 0, 1); every channel that is read is appended to the
+        * return value after the passed-through inputs.
+        */
+       for (i = 0; i < 2; i++) {
+               unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
+               unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
+                                    key->ps_prolog.face_vgpr_index;
+               LLVMValueRef interp[2], color[4];
+               LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
+
+               if (!writemask)
+                       continue;
+
+               /* If the interpolation qualifier is not CONSTANT (-1). */
+               if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
+                       unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
+                                              key->ps_prolog.color_interp_vgpr_index[i];
+
+                       interp[0] = LLVMGetParam(func, interp_vgpr);
+                       interp[1] = LLVMGetParam(func, interp_vgpr + 1);
+                       interp_ij = lp_build_gather_values(gallivm, interp, 2);
+                       interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
+                                                    ctx.v2i32, "");
+               }
+
+               /* Use the absolute location of the input. */
+               prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
+
+               if (key->ps_prolog.states.color_two_side) {
+                       face = LLVMGetParam(func, face_vgpr);
+                       face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
+               }
+
+               interp_fs_input(&ctx,
+                               key->ps_prolog.color_attr_index[i],
+                               TGSI_SEMANTIC_COLOR, i,
+                               key->ps_prolog.num_interp_inputs,
+                               key->ps_prolog.colors_read, interp_ij,
+                               prim_mask, face, color);
+
+               while (writemask) {
+                       unsigned chan = u_bit_scan(&writemask);
+                       ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
+                                                  num_params++, "");
+               }
+       }
+
+       /* Force per-sample interpolation. The sample values are read from
+        * fixed VGPR offsets relative to the first VGPR and copied over
+        * the center and centroid slots in the return value.
+        */
+       if (key->ps_prolog.states.force_persample_interp) {
+               unsigned i, base = key->ps_prolog.num_input_sgprs;
+               LLVMValueRef persp_sample[2], linear_sample[2];
+
+               /* Read PERSP_SAMPLE. */
+               for (i = 0; i < 2; i++)
+                       persp_sample[i] = LLVMGetParam(func, base + i);
+               /* Overwrite PERSP_CENTER. */
+               for (i = 0; i < 2; i++)
+                       ret = LLVMBuildInsertValue(gallivm->builder, ret,
+                                                  persp_sample[i], base + 2 + i, "");
+               /* Overwrite PERSP_CENTROID. */
+               for (i = 0; i < 2; i++)
+                       ret = LLVMBuildInsertValue(gallivm->builder, ret,
+                                                  persp_sample[i], base + 4 + i, "");
+               /* Read LINEAR_SAMPLE. */
+               for (i = 0; i < 2; i++)
+                       linear_sample[i] = LLVMGetParam(func, base + 6 + i);
+               /* Overwrite LINEAR_CENTER. */
+               for (i = 0; i < 2; i++)
+                       ret = LLVMBuildInsertValue(gallivm->builder, ret,
+                                                  linear_sample[i], base + 8 + i, "");
+               /* Overwrite LINEAR_CENTROID. */
+               for (i = 0; i < 2; i++)
+                       ret = LLVMBuildInsertValue(gallivm->builder, ret,
+                                                  linear_sample[i], base + 10 + i, "");
+       }
+
+       /* Compile. A non-zero result from si_compile_llvm is a failure. */
+       LLVMBuildRet(gallivm->builder, ret);
+       radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+                           gallivm->module, debug, ctx.type,
+                           "Fragment Shader Prolog"))
+               status = false;
+
+       radeon_llvm_dispose(&ctx.radeon_bld);
+       return status;
+}
+
+/**
+ * Compile the pixel shader epilog. This handles everything that must be
+ * emulated for pixel shader exports. (alpha-test, format conversions, etc)
+ *
+ * \param sscreen      screen
+ * \param tm           LLVM target machine
+ * \param debug        debug callback
+ * \param out          shader part; out->key selects the variant and
+ *                     out->binary / out->config receive the result
+ * \return             true on success, false if si_compile_llvm failed
+ */
+static bool si_compile_ps_epilog(struct si_screen *sscreen,
+                                LLVMTargetMachineRef tm,
+                                struct pipe_debug_callback *debug,
+                                struct si_shader_part *out)
+{
+       union si_shader_part_key *key = &out->key;
+       struct si_shader shader = {};
+       struct si_shader_context ctx;
+       struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+       struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+       LLVMTypeRef params[16+8*4+3];
+       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+       int last_array_pointer, last_sgpr, num_params, i;
+       bool status = true;
+
+       si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+       ctx.type = TGSI_PROCESSOR_FRAGMENT;
+       shader.key.ps.epilog = key->ps_epilog.states;
+
+       /* Declare input SGPRs. */
+       params[SI_PARAM_RW_BUFFERS] = ctx.i64;
+       params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
+       params[SI_PARAM_SAMPLERS] = ctx.i64;
+       params[SI_PARAM_UNUSED] = ctx.i64;
+       params[SI_PARAM_ALPHA_REF] = ctx.f32;
+       last_array_pointer = -1;
+       last_sgpr = SI_PARAM_ALPHA_REF;
+
+       /* Declare input VGPRs: 4 per written color, plus one each for Z,
+        * stencil and sample mask when written. At least
+        * PS_EPILOG_SAMPLEMASK_MIN_LOC + 1 VGPRs are always declared.
+        */
+       num_params = (last_sgpr + 1) +
+                    util_bitcount(key->ps_epilog.colors_written) * 4 +
+                    key->ps_epilog.writes_z +
+                    key->ps_epilog.writes_stencil +
+                    key->ps_epilog.writes_samplemask;
+
+       num_params = MAX2(num_params,
+                         last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+       assert(num_params <= ARRAY_SIZE(params));
+
+       for (i = last_sgpr + 1; i < num_params; i++)
+               params[i] = ctx.f32;
+
+       /* Create the function. */
+       si_create_function(&ctx, NULL, 0, params, num_params,
+                          last_array_pointer, last_sgpr);
+       /* Disable elimination of unused inputs. */
+       radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
+                                 "InitialPSInputAddr", 0xffffff);
+
+       /* Process colors. vgpr walks the VGPR parameters in order. */
+       unsigned vgpr = last_sgpr + 1;
+       unsigned colors_written = key->ps_epilog.colors_written;
+       int last_color_export = -1;
+
+       /* Find the last color export. This is only looked up when no Z,
+        * stencil or sample mask is written; otherwise last_color_export
+        * stays -1.
+        */
+       if (!key->ps_epilog.writes_z &&
+           !key->ps_epilog.writes_stencil &&
+           !key->ps_epilog.writes_samplemask) {
+               unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
+
+               /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+               if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
+                       /* Just set this if any of the colorbuffers are enabled. */
+                       if (spi_format &
+                           ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
+                               last_color_export = 0;
+               } else {
+                       for (i = 0; i < 8; i++)
+                               if (colors_written & (1 << i) &&
+                                   (spi_format >> (i * 4)) & 0xf)
+                                       last_color_export = i;
+               }
+       }
+
+       while (colors_written) {
+               LLVMValueRef color[4];
+               int mrt = u_bit_scan(&colors_written);
+
+               for (i = 0; i < 4; i++)
+                       color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+
+               si_export_mrt_color(bld_base, color, mrt,
+                                   num_params - 1,
+                                   mrt == last_color_export);
+       }
+
+       /* Process depth, stencil, samplemask. They follow the colors in
+        * the VGPR list, in this order. If none is written and no color
+        * was picked as the last export, a null export is emitted instead.
+        */
+       if (key->ps_epilog.writes_z)
+               depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+       if (key->ps_epilog.writes_stencil)
+               stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+       if (key->ps_epilog.writes_samplemask)
+               samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+
+       if (depth || stencil || samplemask)
+               si_export_mrt_z(bld_base, depth, stencil, samplemask);
+       else if (last_color_export == -1)
+               si_export_null(bld_base);
+
+       /* Compile. A non-zero result from si_compile_llvm is a failure. */
+       LLVMBuildRetVoid(gallivm->builder);
+       radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+                           gallivm->module, debug, ctx.type,
+                           "Fragment Shader Epilog"))
+               status = false;
+
+       radeon_llvm_dispose(&ctx.radeon_bld);
+       return status;
+}
+
+/**
+ * Select and compile (or reuse) pixel shader parts (prolog & epilog).
+ *
+ * A prolog key is built from the prolog states of the shader key, the set of
+ * colors the shader reads and the input SGPR/VGPR counts. An epilog key is
+ * built from the outputs the shader writes (colors, Z, stencil, sample mask)
+ * and the epilog states of the shader key. Both parts are obtained through
+ * si_get_shader_part(). The prolog is only requested when one of its
+ * triggering conditions is set; the epilog is always requested.
+ *
+ * As a side effect, shader->config.spi_ps_input_ena is updated (interpolation
+ * weight enables, FRONT_FACE, POS_FIXED_PT and SAMPLE_COVERAGE bits).
+ *
+ * \param sscreen  screen holding the ps_prologs and ps_epilogs part lists
+ * \param tm       LLVM target machine, passed through to si_get_shader_part()
+ * \param shader   shader variant whose prolog, epilog and config are updated
+ * \param debug    debug callback, passed through to si_get_shader_part()
+ * \return false if a requested part could not be obtained, true otherwise
+ */
+static bool si_shader_select_ps_parts(struct si_screen *sscreen,
+                                     LLVMTargetMachineRef tm,
+                                     struct si_shader *shader,
+                                     struct pipe_debug_callback *debug)
+{
+       struct tgsi_shader_info *info = &shader->selector->info;
+       union si_shader_part_key prolog_key;
+       union si_shader_part_key epilog_key;
+       unsigned i;
+
+       /* Get the prolog. The key starts zeroed and is then filled from the
+        * shader key's prolog states, the colors read by the shader and the
+        * input register counts of the shader.
+        */
+       memset(&prolog_key, 0, sizeof(prolog_key));
+       prolog_key.ps_prolog.states = shader->key.ps.prolog;
+       prolog_key.ps_prolog.colors_read = info->colors_read;
+       prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+       prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
+
+       if (info->colors_read) {
+               unsigned *color = shader->selector->color_attr_index;
+
+               if (shader->key.ps.prolog.color_two_side) {
+                       /* BCOLORs are stored after the last input. */
+                       prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
+                       prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
+                       shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
+               }
+
+               for (i = 0; i < 2; i++) {
+                       unsigned location = info->input_interpolate_loc[color[i]];
+
+                       if (!(info->colors_read & (0xf << i*4)))
+                               continue;
+
+                       prolog_key.ps_prolog.color_attr_index[i] = color[i];
+
+                       /* Force per-sample interpolation for the colors here.
+                        * The interpolation mode and location chosen below
+                        * select both the VGPR index stored in the key and the
+                        * matching enable bit in spi_ps_input_ena.
+                        */
+                       if (shader->key.ps.prolog.force_persample_interp)
+                               location = TGSI_INTERPOLATE_LOC_SAMPLE;
+
+                       switch (info->input_interpolate[color[i]]) {
+                       case TGSI_INTERPOLATE_CONSTANT:
+                               prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
+                               break;
+                       case TGSI_INTERPOLATE_PERSPECTIVE:
+                       case TGSI_INTERPOLATE_COLOR:
+                               switch (location) {
+                               case TGSI_INTERPOLATE_LOC_SAMPLE:
+                                       prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
+                                       shader->config.spi_ps_input_ena |=
+                                               S_0286CC_PERSP_SAMPLE_ENA(1);
+                                       break;
+                               case TGSI_INTERPOLATE_LOC_CENTER:
+                                       prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
+                                       shader->config.spi_ps_input_ena |=
+                                               S_0286CC_PERSP_CENTER_ENA(1);
+                                       break;
+                               case TGSI_INTERPOLATE_LOC_CENTROID:
+                                       prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
+                                       shader->config.spi_ps_input_ena |=
+                                               S_0286CC_PERSP_CENTROID_ENA(1);
+                                       break;
+                               default:
+                                       assert(0);
+                               }
+                               break;
+                       case TGSI_INTERPOLATE_LINEAR:
+                               switch (location) {
+                               case TGSI_INTERPOLATE_LOC_SAMPLE:
+                                       prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
+                                       shader->config.spi_ps_input_ena |=
+                                               S_0286CC_LINEAR_SAMPLE_ENA(1);
+                                       break;
+                               case TGSI_INTERPOLATE_LOC_CENTER:
+                                       prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
+                                       shader->config.spi_ps_input_ena |=
+                                               S_0286CC_LINEAR_CENTER_ENA(1);
+                                       break;
+                               case TGSI_INTERPOLATE_LOC_CENTROID:
+                                       prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
+                                       shader->config.spi_ps_input_ena |=
+                                               S_0286CC_LINEAR_CENTROID_ENA(1);
+                                       break;
+                               default:
+                                       assert(0);
+                               }
+                               break;
+                       default:
+                               assert(0);
+                       }
+               }
+       }
+
+       /* The prolog is a no-op if these aren't set, so in that case no
+        * prolog is requested and shader->prolog is not assigned here.
+        */
+       if (prolog_key.ps_prolog.colors_read ||
+           prolog_key.ps_prolog.states.force_persample_interp ||
+           prolog_key.ps_prolog.states.poly_stipple) {
+               shader->prolog =
+                       si_get_shader_part(sscreen, &sscreen->ps_prologs,
+                                          &prolog_key, tm, debug,
+                                          si_compile_ps_prolog);
+               if (!shader->prolog)
+                       return false;
+       }
+
+       /* Get the epilog. Unlike the prolog, it is requested unconditionally. */
+       memset(&epilog_key, 0, sizeof(epilog_key));
+       epilog_key.ps_epilog.colors_written = info->colors_written;
+       epilog_key.ps_epilog.writes_z = info->writes_z;
+       epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
+       epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
+       epilog_key.ps_epilog.states = shader->key.ps.epilog;
+
+       shader->epilog =
+               si_get_shader_part(sscreen, &sscreen->ps_epilogs,
+                                  &epilog_key, tm, debug,
+                                  si_compile_ps_epilog);
+       if (!shader->epilog)
+               return false;
+
+       /* Enable POS_FIXED_PT if polygon stippling is enabled. */
+       if (shader->key.ps.prolog.poly_stipple) {
+               shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
+               assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
+       }
+
+       /* Set up the enable bits for per-sample shading if needed: any
+        * enabled CENTER/CENTROID weights are replaced by the SAMPLE weights
+        * of the same kind (perspective or linear).
+        */
+       if (shader->key.ps.prolog.force_persample_interp) {
+               if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+                   G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
+                       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
+                       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
+                       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
+               }
+               if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+                   G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
+                       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
+                       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
+                       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
+               }
+       }
+
+       /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
+       if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
+           !(shader->config.spi_ps_input_ena & 0xf)) {
+               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
+               assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
+       }
+
+       /* At least one pair of interpolation weights must be enabled. */
+       if (!(shader->config.spi_ps_input_ena & 0x7f)) {
+               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
+               assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
+       }
+
+       /* The sample mask input is always enabled, because the API shader always
+        * passes it through to the epilog. Disable it here if it's unused.
+        */
+       if (!shader->key.ps.epilog.poly_line_smoothing &&
+           !shader->selector->info.reads_samplemask)
+               shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
+
+       return true;
+}
+
+/**
+ * Create a shader variant: either compile it as a whole, or assemble it from
+ * the selector's precompiled main part plus a prolog and/or epilog, then dump
+ * it and upload the binary.
+ *
+ * \param sscreen  screen the shader belongs to
+ * \param tm       LLVM target machine used for any compilation
+ * \param shader   shader variant to create
+ * \param debug    debug callback
+ * \return 0 on success, a non-zero error code otherwise
+ */
+int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
+                    struct si_shader *shader,
+                    struct pipe_debug_callback *debug)
+{
+       struct si_shader *main_part = shader->selector->main_shader_part;
+       bool monolithic = false;
+       bool parts_ok = true;
+       int ret;
+
+       /* Without a main part there is nothing to reuse. LS and ES are always
+        * compiled on demand as well.
+        */
+       if (!main_part) {
+               monolithic = true;
+       } else if (shader->selector->type == PIPE_SHADER_VERTEX) {
+               monolithic = shader->key.vs.as_es || shader->key.vs.as_ls;
+       } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
+               monolithic = shader->key.tes.as_es;
+       }
+
+       if (monolithic) {
+               /* The shader is compiled as a whole. This has many variants
+                * and may take a long time to compile.
+                */
+               ret = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
+               if (ret)
+                       return ret;
+       } else {
+               /* The shader is built out of 2-3 parts:
+                *
+                * - the middle part is the user shader; it has a single
+                *   variant, compiled when the shader selector was created
+                * - the prolog part goes in front of it
+                * - the epilog part goes behind it
+                *
+                * Prologs and epilogs have many, but simple, variants.
+                */
+
+               /* Reuse the compiled TGSI shader data of the main part. */
+               shader->is_binary_shared = true;
+               shader->binary = main_part->binary;
+               shader->config = main_part->config;
+
+               shader->info.num_input_sgprs = main_part->info.num_input_sgprs;
+               shader->info.num_input_vgprs = main_part->info.num_input_vgprs;
+               shader->info.face_vgpr_index = main_part->info.face_vgpr_index;
+               shader->info.uses_instanceid = main_part->info.uses_instanceid;
+               shader->info.nr_pos_exports = main_part->info.nr_pos_exports;
+               shader->info.nr_param_exports = main_part->info.nr_param_exports;
+               memcpy(shader->info.vs_output_param_offset,
+                      main_part->info.vs_output_param_offset,
+                      sizeof(main_part->info.vs_output_param_offset));
+
+               /* Pick the prolog and/or epilog for this shader stage. */
+               switch (shader->selector->type) {
+               case PIPE_SHADER_VERTEX:
+                       parts_ok = si_shader_select_vs_parts(sscreen, tm,
+                                                            shader, debug);
+                       break;
+               case PIPE_SHADER_TESS_CTRL:
+                       parts_ok = si_shader_select_tcs_parts(sscreen, tm,
+                                                             shader, debug);
+                       break;
+               case PIPE_SHADER_TESS_EVAL:
+                       parts_ok = si_shader_select_tes_parts(sscreen, tm,
+                                                             shader, debug);
+                       break;
+               case PIPE_SHADER_FRAGMENT:
+                       parts_ok = si_shader_select_ps_parts(sscreen, tm,
+                                                            shader, debug);
+                       break;
+               }
+
+               if (!parts_ok)
+                       return -1;
+
+               /* A pixel shader needs at least as many VGPRs as it has
+                * allocated inputs.
+                */
+               if (shader->selector->type == PIPE_SHADER_FRAGMENT) {
+                       shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+                                                       shader->info.num_input_vgprs);
+               }
+
+               /* Grow the SGPR and VGPR counts to cover the attached parts. */
+               if (shader->prolog) {
+                       shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+                                                       shader->prolog->config.num_vgprs);
+                       shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
+                                                       shader->prolog->config.num_sgprs);
+               }
+               if (shader->epilog) {
+                       shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+                                                       shader->epilog->config.num_vgprs);
+                       shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
+                                                       shader->epilog->config.num_sgprs);
+               }
+       }
+
+       si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
+                      stderr);
+
+       /* Upload the binary; report and propagate any failure. */
+       ret = si_shader_binary_upload(sscreen, shader);
+       if (ret)
+               fprintf(stderr, "LLVM failed to upload shader\n");
+
+       return ret;
+}
+
 void si_shader_destroy(struct si_shader *shader)
 {
        if (shader->gs_copy_shader) {
@@ -4534,5 +6001,6 @@ void si_shader_destroy(struct si_shader *shader)
 
        r600_resource_reference(&shader->bo, NULL);
 
-       radeon_shader_binary_clean(&shader->binary);
+       if (!shader->is_binary_shared)
+               radeon_shader_binary_clean(&shader->binary);
 }