X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_shader.c;h=9183852a8568e65004c6104ed50903fa3a5285cb;hb=754cf171e90cc61d135e7c45f8be319ee2db02a5;hp=1db3e4849157a95c9e58cbf6304ba0495bd74c06;hpb=69f43c2cc903d5973bab2515be51465c9e8f9f9e;p=mesa.git

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 1db3e484915..9183852a856 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -67,7 +67,15 @@ struct si_shader_context
 	struct radeon_llvm_context radeon_bld;
 	struct si_shader *shader;
 	struct si_screen *screen;
+
 	unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
+	bool is_gs_copy_shader;
+
+	/* Whether to generate the optimized shader variant compiled as a whole
+	 * (without a prolog and epilog)
+	 */
+	bool is_monolithic;
+
 	int param_streamout_config;
 	int param_streamout_write_index;
 	int param_streamout_offset[4];
@@ -75,30 +83,62 @@ struct si_shader_context
 	int param_rel_auto_id;
 	int param_vs_prim_id;
 	int param_instance_id;
+	int param_vertex_index0;
 	int param_tes_u;
 	int param_tes_v;
 	int param_tes_rel_patch_id;
 	int param_tes_patch_id;
 	int param_es2gs_offset;
+
 	LLVMTargetMachineRef tm;
+
 	LLVMValueRef const_md;
 	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
 	LLVMValueRef lds;
 	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
-	LLVMValueRef sampler_views[SI_NUM_SAMPLER_VIEWS];
-	LLVMValueRef sampler_states[SI_NUM_SAMPLER_STATES];
+	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
+	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
+	LLVMValueRef fmasks[SI_NUM_USER_SAMPLERS];
 	LLVMValueRef so_buffers[4];
 	LLVMValueRef esgs_ring;
 	LLVMValueRef gsvs_ring[4];
 	LLVMValueRef gs_next_vertex[4];
+	LLVMValueRef return_value;
+
+	LLVMTypeRef voidt;
+	LLVMTypeRef i1;
+	LLVMTypeRef i8;
+	LLVMTypeRef i32;
+	LLVMTypeRef i64;
+	LLVMTypeRef i128;
+	LLVMTypeRef f32;
+	LLVMTypeRef v16i8;
+	LLVMTypeRef v2i32;
+	LLVMTypeRef v4i32;
+	LLVMTypeRef v4f32;
+	LLVMTypeRef v8i32;
 };
 
-static struct si_shader_context * si_shader_context(
-	struct lp_build_tgsi_context * bld_base)
+static struct si_shader_context *si_shader_context(
+	struct lp_build_tgsi_context *bld_base)
 {
 	return (struct si_shader_context *)bld_base;
 }
 
+static void si_init_shader_ctx(struct si_shader_context *ctx,
+			       struct si_screen *sscreen,
+			       struct si_shader *shader,
+			       LLVMTargetMachineRef tm);
+
+/* Ideally pass the sample mask input to the PS epilog as v13, which
+ * is its usual location, so that the shader doesn't have to add v_mov.
+ */
+#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
+
+/* The VS location of the PrimitiveID input is the same in the epilog,
+ * so that the main shader part doesn't have to move it.
+ */
+#define VS_EPILOG_PRIMID_LOC 2
 
 #define PERSPECTIVE_BASE 0
 #define LINEAR_BASE 9
@@ -166,14 +206,18 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 /**
  * Get the value of a shader input parameter and extract a bitfield.
  */
-static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx,
+static LLVMValueRef unpack_param(struct si_shader_context *ctx,
 				 unsigned param, unsigned rshift,
 				 unsigned bitwidth)
 {
-	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
-	LLVMValueRef value = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
 					  param);
 
+	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
+		value = bitcast(&ctx->radeon_bld.soa.bld_base,
+				TGSI_TYPE_UNSIGNED, value);
+
 	if (rshift)
 		value = LLVMBuildLShr(gallivm->builder, value,
 				      lp_build_const_int32(gallivm, rshift), "");
@@ -187,15 +231,15 @@ static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx,
 	return value;
 }
 
-static LLVMValueRef get_rel_patch_id(struct si_shader_context *si_shader_ctx)
+static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 {
-	switch (si_shader_ctx->type) {
+	switch (ctx->type) {
 	case TGSI_PROCESSOR_TESS_CTRL:
-		return unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 0, 8);
+		return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
 
 	case TGSI_PROCESSOR_TESS_EVAL:
-		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-				    si_shader_ctx->param_tes_rel_patch_id);
+		return LLVMGetParam(ctx->radeon_bld.main_fn,
+				    ctx->param_tes_rel_patch_id);
 
 	default:
 		assert(0);
@@ -225,12 +269,12 @@ static LLVMValueRef get_rel_patch_id(struct si_shader_context *si_shader_ctx)
  */
 
 static LLVMValueRef
-get_tcs_in_patch_stride(struct si_shader_context *si_shader_ctx)
+get_tcs_in_patch_stride(struct si_shader_context *ctx)
 {
-	if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX)
-		return unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
-	else if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL)
-		return unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
+	if (ctx->type == TGSI_PROCESSOR_VERTEX)
+		return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
+	else if (ctx->type == TGSI_PROCESSOR_TESS_CTRL)
+		return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
 	else {
 		assert(0);
 		return NULL;
@@ -238,48 +282,48 @@ get_tcs_in_patch_stride(struct si_shader_context *si_shader_ctx)
 }
 
 static LLVMValueRef
-get_tcs_out_patch_stride(struct si_shader_context *si_shader_ctx)
+get_tcs_out_patch_stride(struct si_shader_context *ctx)
 {
-	return unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
+	return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
 }
 
 static LLVMValueRef
-get_tcs_out_patch0_offset(struct si_shader_context *si_shader_ctx)
+get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 {
-	return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld,
-				unpack_param(si_shader_ctx,
+	return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
+				unpack_param(ctx,
 					     SI_PARAM_TCS_OUT_OFFSETS,
 					     0, 16),
 				4);
 }
 
 static LLVMValueRef
-get_tcs_out_patch0_patch_data_offset(struct si_shader_context *si_shader_ctx)
+get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 {
-	return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld,
-				unpack_param(si_shader_ctx,
+	return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
+				unpack_param(ctx,
 					     SI_PARAM_TCS_OUT_OFFSETS,
 					     16, 16),
 				4);
 }
 
 static LLVMValueRef
-get_tcs_in_current_patch_offset(struct si_shader_context *si_shader_ctx)
+get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 {
-	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
-	LLVMValueRef patch_stride = get_tcs_in_patch_stride(si_shader_ctx);
-	LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx);
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
+	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 
 	return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
 }
 
 static LLVMValueRef
-get_tcs_out_current_patch_offset(struct si_shader_context *si_shader_ctx)
+get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 {
-	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
-	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(si_shader_ctx);
-	LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx);
-	LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx);
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
+	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
+	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 
 	return LLVMBuildAdd(gallivm->builder, patch0_offset,
 			    LLVMBuildMul(gallivm->builder, patch_stride,
@@ -288,13 +332,13 @@ get_tcs_out_current_patch_offset(struct si_shader_context *si_shader_ctx)
 }
 
 static LLVMValueRef
-get_tcs_out_current_patch_data_offset(struct si_shader_context *si_shader_ctx)
+get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 {
-	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 	LLVMValueRef patch0_patch_data_offset =
-		get_tcs_out_patch0_patch_data_offset(si_shader_ctx);
-	LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx);
-	LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx);
+		get_tcs_out_patch0_patch_data_offset(ctx);
+	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
+	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 
 	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
 			    LLVMBuildMul(gallivm->builder, patch_stride,
@@ -302,11 +346,11 @@ get_tcs_out_current_patch_data_offset(struct si_shader_context *si_shader_ctx)
 			    "");
 }
 
-static void build_indexed_store(struct si_shader_context *si_shader_ctx,
+static void build_indexed_store(struct si_shader_context *ctx,
 				LLVMValueRef base_ptr, LLVMValueRef index,
 				LLVMValueRef value)
 {
-	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMValueRef indices[2], pointer;
 
@@ -324,10 +368,10 @@ static void build_indexed_store(struct si_shader_context *si_shader_ctx,
  * \param base_ptr  Where the array starts.
  * \param index     The element index into the array.
  */
-static LLVMValueRef build_indexed_load(struct si_shader_context *si_shader_ctx,
+static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
 				       LLVMValueRef base_ptr, LLVMValueRef index)
 {
-	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMValueRef indices[2], pointer;
 
@@ -343,32 +387,32 @@ static LLVMValueRef build_indexed_load(struct si_shader_context *si_shader_ctx,
  * a constant.
  */
 static LLVMValueRef build_indexed_load_const(
-	struct si_shader_context * si_shader_ctx,
+	struct si_shader_context *ctx,
 	LLVMValueRef base_ptr, LLVMValueRef index)
 {
-	LLVMValueRef result = build_indexed_load(si_shader_ctx, base_ptr, index);
-	LLVMSetMetadata(result, 1, si_shader_ctx->const_md);
+	LLVMValueRef result = build_indexed_load(ctx, base_ptr, index);
+	LLVMSetMetadata(result, 1, ctx->const_md);
 	return result;
 }
 
 static LLVMValueRef get_instance_index_for_fetch(
-	struct radeon_llvm_context * radeon_bld,
-	unsigned divisor)
+	struct radeon_llvm_context *radeon_bld,
+	unsigned param_start_instance, unsigned divisor)
 {
-	struct si_shader_context *si_shader_ctx =
+	struct si_shader_context *ctx =
 		si_shader_context(&radeon_bld->soa.bld_base);
-	struct gallivm_state * gallivm = radeon_bld->soa.bld_base.base.gallivm;
+	struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
 
 	LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
-					   si_shader_ctx->param_instance_id);
+					   ctx->param_instance_id);
 
 	/* The division must be done before START_INSTANCE is added. */
 	if (divisor > 1)
 		result = LLVMBuildUDiv(gallivm->builder, result,
 				lp_build_const_int32(gallivm, divisor), "");
 
-	return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam(
-			radeon_bld->main_fn, SI_PARAM_START_INSTANCE), "");
+	return LLVMBuildAdd(gallivm->builder, result,
+			    LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
 }
 
 static void declare_input_vs(
@@ -378,9 +422,10 @@ static void declare_input_vs(
 {
 	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
 	struct gallivm_state *gallivm = base->gallivm;
-	struct si_shader_context *si_shader_ctx =
+	struct si_shader_context *ctx =
 		si_shader_context(&radeon_bld->soa.bld_base);
-	unsigned divisor = si_shader_ctx->shader->key.vs.instance_divisors[input_index];
+	unsigned divisor =
+		ctx->shader->key.vs.prolog.instance_divisors[input_index];
 
 	unsigned chan;
 
@@ -390,38 +435,42 @@ static void declare_input_vs(
 	LLVMValueRef attribute_offset;
 	LLVMValueRef buffer_index;
 	LLVMValueRef args[3];
-	LLVMTypeRef vec4_type;
 	LLVMValueRef input;
 
 	/* Load the T list */
-	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);
+	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);
 
 	t_offset = lp_build_const_int32(gallivm, input_index);
 
-	t_list = build_indexed_load_const(si_shader_ctx, t_list_ptr, t_offset);
+	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);
 
 	/* Build the attribute offset */
 	attribute_offset = lp_build_const_int32(gallivm, 0);
 
-	if (divisor) {
+	if (!ctx->is_monolithic) {
+		buffer_index = LLVMGetParam(radeon_bld->main_fn,
+					    ctx->param_vertex_index0 +
+					    input_index);
+	} else if (divisor) {
 		/* Build index from instance ID, start instance and divisor */
-		si_shader_ctx->shader->uses_instanceid = true;
-		buffer_index = get_instance_index_for_fetch(&si_shader_ctx->radeon_bld, divisor);
+		ctx->shader->uses_instanceid = true;
+		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
+							    SI_PARAM_START_INSTANCE,
+							    divisor);
 	} else {
 		/* Load the buffer index for vertices. */
-		LLVMValueRef vertex_id = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-						      si_shader_ctx->param_vertex_id);
+		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
+						      ctx->param_vertex_id);
 		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
 							SI_PARAM_BASE_VERTEX);
 		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
 	}
 
-	vec4_type = LLVMVectorType(base->elem_type, 4);
 	args[0] = t_list;
 	args[1] = attribute_offset;
 	args[2] = buffer_index;
 	input = lp_build_intrinsic(gallivm->builder,
-		"llvm.SI.vs.load.input", vec4_type, args, 3,
+		"llvm.SI.vs.load.input", ctx->v4f32, args, 3,
 		LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
 	/* Break up the vec4 into individual components */
@@ -429,7 +478,7 @@ static void declare_input_vs(
 		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
 		/* XXX: Use a helper function for this.  There is one in
  		 * tgsi_llvm.c. */
-		si_shader_ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
+		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
 				LLVMBuildExtractElement(gallivm->builder,
 				input, llvm_chan, "");
 	}
@@ -438,23 +487,23 @@ static void declare_input_vs(
 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
 				     unsigned swizzle)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 
 	if (swizzle > 0)
 		return bld_base->uint_bld.zero;
 
-	switch (si_shader_ctx->type) {
+	switch (ctx->type) {
 	case TGSI_PROCESSOR_VERTEX:
-		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-				    si_shader_ctx->param_vs_prim_id);
+		return LLVMGetParam(ctx->radeon_bld.main_fn,
+				    ctx->param_vs_prim_id);
 	case TGSI_PROCESSOR_TESS_CTRL:
-		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+		return LLVMGetParam(ctx->radeon_bld.main_fn,
 				    SI_PARAM_PATCH_ID);
 	case TGSI_PROCESSOR_TESS_EVAL:
-		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-				    si_shader_ctx->param_tes_patch_id);
+		return LLVMGetParam(ctx->radeon_bld.main_fn,
+				    ctx->param_tes_patch_id);
 	case TGSI_PROCESSOR_GEOMETRY:
-		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+		return LLVMGetParam(ctx->radeon_bld.main_fn,
 				    SI_PARAM_PRIMITIVE_ID);
 	default:
 		assert(0);
@@ -466,14 +515,14 @@ static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
  * Return the value of tgsi_ind_register for indexing.
  * This is the indirect index with the constant offset added to it.
  */
-static LLVMValueRef get_indirect_index(struct si_shader_context *si_shader_ctx,
+static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
 				       const struct tgsi_ind_register *ind,
 				       int rel_index)
 {
-	struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
+	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
 	LLVMValueRef result;
 
-	result = si_shader_ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
+	result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
 	result = LLVMBuildLoad(gallivm->builder, result, "");
 	result = LLVMBuildAdd(gallivm->builder, result,
 			      lp_build_const_int32(gallivm, rel_index), "");
@@ -483,14 +532,14 @@ static LLVMValueRef get_indirect_index(struct si_shader_context *si_shader_ctx,
 /**
  * Calculate a dword address given an input or output register and a stride.
  */
-static LLVMValueRef get_dw_address(struct si_shader_context *si_shader_ctx,
+static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
 				   const struct tgsi_full_dst_register *dst,
 				   const struct tgsi_full_src_register *src,
 				   LLVMValueRef vertex_dw_stride,
 				   LLVMValueRef base_addr)
 {
-	struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
-	struct tgsi_shader_info *info = &si_shader_ctx->shader->selector->info;
+	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
+	struct tgsi_shader_info *info = &ctx->shader->selector->info;
 	ubyte *name, *index, *array_first;
 	int first, param;
 	struct tgsi_full_dst_register reg;
@@ -514,7 +563,7 @@ static LLVMValueRef get_dw_address(struct si_shader_context *si_shader_ctx,
 		LLVMValueRef index;
 
 		if (reg.Dimension.Indirect)
-			index = get_indirect_index(si_shader_ctx, &reg.DimIndirect,
+			index = get_indirect_index(ctx, &reg.DimIndirect,
 						   reg.Dimension.Index);
 		else
 			index = lp_build_const_int32(gallivm, reg.Dimension.Index);
@@ -547,7 +596,7 @@ static LLVMValueRef get_dw_address(struct si_shader_context *si_shader_ctx,
 		else
 			first = reg.Register.Index;
 
-		ind_index = get_indirect_index(si_shader_ctx, &reg.Indirect,
+		ind_index = get_indirect_index(ctx, &reg.Indirect,
 					   reg.Register.Index - first);
 
 		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
@@ -576,7 +625,7 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 			     enum tgsi_opcode_type type, unsigned swizzle,
 			     LLVMValueRef dw_addr)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMValueRef value;
 
@@ -593,12 +642,12 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 			    lp_build_const_int32(gallivm, swizzle));
 
-	value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+	value = build_indexed_load(ctx, ctx->lds, dw_addr);
 	if (type == TGSI_TYPE_DOUBLE) {
 		LLVMValueRef value2;
 		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 				       lp_build_const_int32(gallivm, swizzle + 1));
-		value2 = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+		value2 = build_indexed_load(ctx, ctx->lds, dw_addr);
 		return radeon_llvm_emit_fetch_double(bld_base, value, value2);
 	}
 
@@ -613,19 +662,18 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
  * \param dw_addr	address in dwords
  * \param value		value to store
  */
-static void lds_store(struct lp_build_tgsi_context * bld_base,
+static void lds_store(struct lp_build_tgsi_context *bld_base,
 		      unsigned swizzle, LLVMValueRef dw_addr,
 		      LLVMValueRef value)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 
 	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 			    lp_build_const_int32(gallivm, swizzle));
 
-	value = LLVMBuildBitCast(gallivm->builder, value,
-				 LLVMInt32TypeInContext(gallivm->context), "");
-	build_indexed_store(si_shader_ctx, si_shader_ctx->lds,
+	value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
+	build_indexed_store(ctx, ctx->lds,
 			    dw_addr, value);
 }
 
@@ -634,12 +682,12 @@ static LLVMValueRef fetch_input_tcs(
 	const struct tgsi_full_src_register *reg,
 	enum tgsi_opcode_type type, unsigned swizzle)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	LLVMValueRef dw_addr, stride;
 
-	stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
-	dw_addr = get_tcs_in_current_patch_offset(si_shader_ctx);
-	dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr);
+	stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
+	dw_addr = get_tcs_in_current_patch_offset(ctx);
+	dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 
 	return lds_load(bld_base, type, swizzle, dw_addr);
 }
@@ -649,16 +697,16 @@ static LLVMValueRef fetch_output_tcs(
 		const struct tgsi_full_src_register *reg,
 		enum tgsi_opcode_type type, unsigned swizzle)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	LLVMValueRef dw_addr, stride;
 
 	if (reg->Register.Dimension) {
-		stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
-		dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
-		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr);
+		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
+		dw_addr = get_tcs_out_current_patch_offset(ctx);
+		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 	} else {
-		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
-		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr);
+		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
+		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
 	}
 
 	return lds_load(bld_base, type, swizzle, dw_addr);
@@ -669,27 +717,27 @@ static LLVMValueRef fetch_input_tes(
 	const struct tgsi_full_src_register *reg,
 	enum tgsi_opcode_type type, unsigned swizzle)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	LLVMValueRef dw_addr, stride;
 
 	if (reg->Register.Dimension) {
-		stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
-		dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
-		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr);
+		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
+		dw_addr = get_tcs_out_current_patch_offset(ctx);
+		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 	} else {
-		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
-		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr);
+		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
+		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
 	}
 
 	return lds_load(bld_base, type, swizzle, dw_addr);
 }
 
-static void store_output_tcs(struct lp_build_tgsi_context * bld_base,
-			     const struct tgsi_full_instruction * inst,
-			     const struct tgsi_opcode_info * info,
+static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
+			     const struct tgsi_full_instruction *inst,
+			     const struct tgsi_opcode_info *info,
 			     LLVMValueRef dst[4])
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
 	unsigned chan_index;
 	LLVMValueRef dw_addr, stride;
@@ -704,12 +752,12 @@ static void store_output_tcs(struct lp_build_tgsi_context * bld_base,
 	}
 
 	if (reg->Register.Dimension) {
-		stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
-		dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
-		dw_addr = get_dw_address(si_shader_ctx, reg, NULL, stride, dw_addr);
+		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
+		dw_addr = get_tcs_out_current_patch_offset(ctx);
+		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
 	} else {
-		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
-		dw_addr = get_dw_address(si_shader_ctx, reg, NULL, NULL, dw_addr);
+		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
+		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
 	}
 
 	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
@@ -729,11 +777,10 @@ static LLVMValueRef fetch_input_gs(
 	unsigned swizzle)
 {
 	struct lp_build_context *base = &bld_base->base;
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
-	struct si_shader *shader = si_shader_ctx->shader;
-	struct lp_build_context *uint =	&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct si_shader *shader = ctx->shader;
+	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
 	struct gallivm_state *gallivm = base->gallivm;
-	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 	LLVMValueRef vtx_offset;
 	LLVMValueRef args[9];
 	unsigned vtx_offset_param;
@@ -768,12 +815,12 @@ static LLVMValueRef fetch_input_gs(
 		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
 	}
 	vtx_offset = lp_build_mul_imm(uint,
-				      LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				      LLVMGetParam(ctx->radeon_bld.main_fn,
 						   vtx_offset_param),
 				      4);
 
 	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
-	args[0] = si_shader_ctx->esgs_ring;
+	args[0] = ctx->esgs_ring;
 	args[1] = vtx_offset;
 	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
 	args[3] = uint->zero;
@@ -785,14 +832,14 @@ static LLVMValueRef fetch_input_gs(
 
 	value = lp_build_intrinsic(gallivm->builder,
 				   "llvm.SI.buffer.load.dword.i32.i32",
-				   i32, args, 9,
+				   ctx->i32, args, 9,
 				   LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
 	if (type == TGSI_TYPE_DOUBLE) {
 		LLVMValueRef value2;
 		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
 		value2 = lp_build_intrinsic(gallivm->builder,
 					    "llvm.SI.buffer.load.dword.i32.i32",
-					    i32, args, 9,
+					    ctx->i32, args, 9,
 					    LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
 		return radeon_llvm_emit_fetch_double(bld_base,
 						     value, value2);
@@ -832,14 +879,12 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location)
 }
 
 /* This shouldn't be used by explicit INTERP opcodes. */
-static LLVMValueRef get_interp_param(struct si_shader_context *si_shader_ctx,
-				     unsigned param)
+static unsigned select_interp_param(struct si_shader_context *ctx,
+				    unsigned param)
 {
-	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
-	unsigned sample_param = 0;
-	LLVMValueRef default_ij, sample_ij, force_sample;
-
-	default_ij = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, param);
+	if (!ctx->shader->key.ps.prolog.force_persample_interp ||
+	    !ctx->is_monolithic)
+		return param;
 
 	/* If the shader doesn't use center/centroid, just return the parameter.
 	 *
@@ -849,109 +894,51 @@ static LLVMValueRef get_interp_param(struct si_shader_context *si_shader_ctx,
 	switch (param) {
 	case SI_PARAM_PERSP_CENTROID:
 	case SI_PARAM_PERSP_CENTER:
-		if (!si_shader_ctx->shader->selector->forces_persample_interp_for_persp)
-			return default_ij;
-
-		sample_param = SI_PARAM_PERSP_SAMPLE;
-		break;
+		return SI_PARAM_PERSP_SAMPLE;
 
 	case SI_PARAM_LINEAR_CENTROID:
 	case SI_PARAM_LINEAR_CENTER:
-		if (!si_shader_ctx->shader->selector->forces_persample_interp_for_linear)
-			return default_ij;
-
-		sample_param = SI_PARAM_LINEAR_SAMPLE;
-		break;
+		return SI_PARAM_LINEAR_SAMPLE;
 
 	default:
-		return default_ij;
+		return param;
 	}
-
-	/* Otherwise, we have to select (i,j) based on a user data SGPR. */
-	sample_ij = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, sample_param);
-
-	/* TODO: this can be done more efficiently by switching between
-	 * 2 prologs.
-	 */
-	force_sample = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-				    SI_PARAM_PS_STATE_BITS);
-	force_sample = LLVMBuildTrunc(gallivm->builder, force_sample,
-				      LLVMInt1TypeInContext(gallivm->context), "");
-	return LLVMBuildSelect(gallivm->builder, force_sample,
-			       sample_ij, default_ij, "");
 }
 
-static void declare_input_fs(
-	struct radeon_llvm_context *radeon_bld,
-	unsigned input_index,
-	const struct tgsi_full_declaration *decl)
+/**
+ * Interpolate a fragment shader input.
+ *
+ * @param ctx		context
+ * @param input_index		index of the input in hardware
+ * @param semantic_name		TGSI_SEMANTIC_*
+ * @param semantic_index	semantic index
+ * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
+ * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
+ * @param interp_param		interpolation weights (i,j)
+ * @param prim_mask		SI_PARAM_PRIM_MASK
+ * @param face			SI_PARAM_FRONT_FACE
+ * @param result		the return value (4 components)
+ */
+static void interp_fs_input(struct si_shader_context *ctx,
+			    unsigned input_index,
+			    unsigned semantic_name,
+			    unsigned semantic_index,
+			    unsigned num_interp_inputs,
+			    unsigned colors_read_mask,
+			    LLVMValueRef interp_param,
+			    LLVMValueRef prim_mask,
+			    LLVMValueRef face,
+			    LLVMValueRef result[4])
 {
-	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
-	struct si_shader_context *si_shader_ctx =
-		si_shader_context(&radeon_bld->soa.bld_base);
-	struct si_shader *shader = si_shader_ctx->shader;
-	struct lp_build_context *uint =	&radeon_bld->soa.bld_base.uint_bld;
+	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
+	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
 	struct gallivm_state *gallivm = base->gallivm;
-	LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
-	LLVMValueRef main_fn = radeon_bld->main_fn;
-
-	LLVMValueRef interp_param = NULL;
-	int interp_param_idx;
-	const char * intr_name;
-
-	/* This value is:
-	 * [15:0] NewPrimMask (Bit mask for each quad.  It is set it the
-	 *                     quad begins a new primitive.  Bit 0 always needs
-	 *                     to be unset)
-	 * [32:16] ParamOffset
-	 *
-	 */
-	LLVMValueRef params = LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
+	const char *intr_name;
 	LLVMValueRef attr_number;
 
 	unsigned chan;
 
-	if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
-		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-			unsigned soa_index =
-				radeon_llvm_reg_index_soa(input_index, chan);
-			radeon_bld->inputs[soa_index] =
-				LLVMGetParam(main_fn, SI_PARAM_POS_X_FLOAT + chan);
-
-			if (chan == 3)
-				/* RCP for fragcoord.w */
-				radeon_bld->inputs[soa_index] =
-					LLVMBuildFDiv(gallivm->builder,
-						      lp_build_const_float(gallivm, 1.0f),
-						      radeon_bld->inputs[soa_index],
-						      "");
-		}
-		return;
-	}
-
-	if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
-			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE);
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
-			lp_build_const_float(gallivm, 0.0f);
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
-			lp_build_const_float(gallivm, 1.0f);
-
-		return;
-	}
-
-	shader->ps_input_param_offset[input_index] = shader->nparam++;
-	attr_number = lp_build_const_int32(gallivm,
-					   shader->ps_input_param_offset[input_index]);
-
-	shader->ps_input_interpolate[input_index] = decl->Interp.Interpolate;
-	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
-						     decl->Interp.Location);
-	if (interp_param_idx == -1)
-		return;
-	else if (interp_param_idx)
-		interp_param = get_interp_param(si_shader_ctx, interp_param_idx);
+	attr_number = lp_build_const_int32(gallivm, input_index);
 
 	/* fs.constant returns the param from the middle vertex, so it's not
 	 * really useful for flat shading. It's meant to be used for custom
@@ -965,81 +952,127 @@ static void declare_input_fs(
 	 */
 	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
 
-	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
-	    si_shader_ctx->shader->key.ps.color_two_side) {
+	if (semantic_name == TGSI_SEMANTIC_COLOR &&
+	    ctx->shader->key.ps.prolog.color_two_side) {
 		LLVMValueRef args[4];
-		LLVMValueRef face, is_face_positive;
-		LLVMValueRef back_attr_number =
-			lp_build_const_int32(gallivm,
-					     shader->ps_input_param_offset[input_index] + 1);
+		LLVMValueRef is_face_positive;
+		LLVMValueRef back_attr_number;
 
-		face = LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE);
+		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
+		 * otherwise it's at offset "num_inputs".
+		 */
+		unsigned back_attr_offset = num_interp_inputs;
+		if (semantic_index == 1 && colors_read_mask & 0xf)
+			back_attr_offset += 1;
 
-		is_face_positive = LLVMBuildFCmp(gallivm->builder,
-						 LLVMRealOGT, face,
-						 lp_build_const_float(gallivm, 0.0f),
-						 "");
+		back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
 
-		args[2] = params;
+		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
+						 face, uint->zero, "");
+
+		args[2] = prim_mask;
 		args[3] = interp_param;
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
-			unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan);
 			LLVMValueRef front, back;
 
 			args[0] = llvm_chan;
 			args[1] = attr_number;
 			front = lp_build_intrinsic(gallivm->builder, intr_name,
-						input_type, args, args[3] ? 4 : 3,
+						ctx->f32, args, args[3] ? 4 : 3,
 						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
 			args[1] = back_attr_number;
 			back = lp_build_intrinsic(gallivm->builder, intr_name,
-					       input_type, args, args[3] ? 4 : 3,
+					       ctx->f32, args, args[3] ? 4 : 3,
 					       LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
-			radeon_bld->inputs[soa_index] =
-				LLVMBuildSelect(gallivm->builder,
+			result[chan] = LLVMBuildSelect(gallivm->builder,
 						is_face_positive,
 						front,
 						back,
 						"");
 		}
-
-		shader->nparam++;
-	} else if (decl->Semantic.Name == TGSI_SEMANTIC_FOG) {
+	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
 		LLVMValueRef args[4];
 
 		args[0] = uint->zero;
 		args[1] = attr_number;
-		args[2] = params;
+		args[2] = prim_mask;
 		args[3] = interp_param;
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
-			lp_build_intrinsic(gallivm->builder, intr_name,
-					input_type, args, args[3] ? 4 : 3,
+		result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
+					ctx->f32, args, args[3] ? 4 : 3,
 					LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
-			lp_build_const_float(gallivm, 0.0f);
-		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
-			lp_build_const_float(gallivm, 1.0f);
+		result[1] =
+		result[2] = lp_build_const_float(gallivm, 0.0f);
+		result[3] = lp_build_const_float(gallivm, 1.0f);
 	} else {
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 			LLVMValueRef args[4];
 			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
-			unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan);
+
 			args[0] = llvm_chan;
 			args[1] = attr_number;
-			args[2] = params;
+			args[2] = prim_mask;
 			args[3] = interp_param;
-			radeon_bld->inputs[soa_index] =
-				lp_build_intrinsic(gallivm->builder, intr_name,
-						input_type, args, args[3] ? 4 : 3,
+			result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
+						ctx->f32, args, args[3] ? 4 : 3,
 						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 		}
 	}
 }
 
+static void declare_input_fs(
+	struct radeon_llvm_context *radeon_bld,
+	unsigned input_index,
+	const struct tgsi_full_declaration *decl)
+{ /* Fills radeon_bld->inputs for the fragment shader input 'input_index' described by 'decl'. */
+	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
+	struct si_shader_context *ctx =
+		si_shader_context(&radeon_bld->soa.bld_base);
+	struct si_shader *shader = ctx->shader;
+	LLVMValueRef main_fn = radeon_bld->main_fn;
+	LLVMValueRef interp_param = NULL;
+	int interp_param_idx;
+
+	/* Non-monolithic shaders: get colors from input VGPRs (set by the prolog) and skip interpolation. */
+	if (!ctx->is_monolithic &&
+	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
+		unsigned i = decl->Semantic.Index;
+		unsigned colors_read = shader->selector->info.colors_read;
+		unsigned mask = colors_read >> (i * 4); /* 4 read-mask bits per color index */
+		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
+				  (i ? util_bitcount(colors_read & 0xf) : 0); /* COLOR1 starts after the COLOR0 channels that are read */
+
+		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
+			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef; /* unread channels are undef and consume no parameter */
+		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
+			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
+			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
+			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		return;
+	}
+
+	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
+						     decl->Interp.Location);
+	if (interp_param_idx == -1)
+		return; /* nothing is written to radeon_bld->inputs in this case */
+	else if (interp_param_idx) { /* index 0 leaves interp_param NULL */
+		interp_param_idx = select_interp_param(ctx,
+						       interp_param_idx);
+		interp_param = LLVMGetParam(main_fn, interp_param_idx);
+	}
+
+	interp_fs_input(ctx, input_index, decl->Semantic.Name,
+			decl->Semantic.Index, shader->selector->info.num_inputs,
+			shader->selector->info.colors_read, interp_param,
+			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
+			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
+			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]); /* destination for the per-channel results */
+}
+
 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
 {
 	return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
@@ -1060,22 +1093,22 @@ static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resou
 
 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
 {
-	struct si_shader_context *si_shader_ctx =
+	struct si_shader_context *ctx =
 		si_shader_context(&radeon_bld->soa.bld_base);
 	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
 	struct gallivm_state *gallivm = &radeon_bld->gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
-	LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
+	LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 	LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF);
-	LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index);
+	LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
 
 	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
 	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
 	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
 
 	LLVMValueRef pos[4] = {
-		buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type),
-		buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type),
+		buffer_load_const(builder, resource, offset0, ctx->f32),
+		buffer_load_const(builder, resource, offset1, ctx->f32),
 		lp_build_const_float(gallivm, 0),
 		lp_build_const_float(gallivm, 0)
 	};
@@ -1084,34 +1117,33 @@ static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld,
 }
 
 static void declare_system_value(
-	struct radeon_llvm_context * radeon_bld,
+	struct radeon_llvm_context *radeon_bld,
 	unsigned index,
 	const struct tgsi_full_declaration *decl)
 {
-	struct si_shader_context *si_shader_ctx =
+	struct si_shader_context *ctx =
 		si_shader_context(&radeon_bld->soa.bld_base);
 	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
-	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
 	struct gallivm_state *gallivm = &radeon_bld->gallivm;
 	LLVMValueRef value = 0;
 
 	switch (decl->Semantic.Name) {
 	case TGSI_SEMANTIC_INSTANCEID:
 		value = LLVMGetParam(radeon_bld->main_fn,
-				     si_shader_ctx->param_instance_id);
+				     ctx->param_instance_id);
 		break;
 
 	case TGSI_SEMANTIC_VERTEXID:
 		value = LLVMBuildAdd(gallivm->builder,
 				     LLVMGetParam(radeon_bld->main_fn,
-						  si_shader_ctx->param_vertex_id),
+						  ctx->param_vertex_id),
 				     LLVMGetParam(radeon_bld->main_fn,
 						  SI_PARAM_BASE_VERTEX), "");
 		break;
 
 	case TGSI_SEMANTIC_VERTEXID_NOBASE:
 		value = LLVMGetParam(radeon_bld->main_fn,
-				     si_shader_ctx->param_vertex_id);
+				     ctx->param_vertex_id);
 		break;
 
 	case TGSI_SEMANTIC_BASEVERTEX:
@@ -1120,43 +1152,70 @@ static void declare_system_value(
 		break;
 
 	case TGSI_SEMANTIC_INVOCATIONID:
-		if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL)
-			value = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5);
-		else if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY)
+		if (ctx->type == TGSI_PROCESSOR_TESS_CTRL)
+			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
+		else if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
 			value = LLVMGetParam(radeon_bld->main_fn,
 					     SI_PARAM_GS_INSTANCE_ID);
 		else
 			assert(!"INVOCATIONID not implemented");
 		break;
 
+	case TGSI_SEMANTIC_POSITION:
+	{
+		LLVMValueRef pos[4] = {
+			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
+			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
+			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
+			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
+						 LLVMGetParam(radeon_bld->main_fn,
+							      SI_PARAM_POS_W_FLOAT)),
+		};
+		value = lp_build_gather_values(gallivm, pos, 4);
+		break;
+	}
+
+	case TGSI_SEMANTIC_FACE:
+		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
+		break;
+
 	case TGSI_SEMANTIC_SAMPLEID:
 		value = get_sample_id(radeon_bld);
 		break;
 
-	case TGSI_SEMANTIC_SAMPLEPOS:
-		value = load_sample_position(radeon_bld, get_sample_id(radeon_bld));
+	case TGSI_SEMANTIC_SAMPLEPOS: {
+		LLVMValueRef pos[4] = {
+			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
+			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
+			lp_build_const_float(gallivm, 0),
+			lp_build_const_float(gallivm, 0)
+		};
+		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
+						  TGSI_OPCODE_FRC, pos[0]);
+		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
+						  TGSI_OPCODE_FRC, pos[1]);
+		value = lp_build_gather_values(gallivm, pos, 4);
 		break;
+	}
 
 	case TGSI_SEMANTIC_SAMPLEMASK:
-		/* Smoothing isn't MSAA in GL, but it's MSAA in hardware.
-		 * Therefore, force gl_SampleMaskIn to 1 for GL. */
-		if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
-			value = uint_bld->one;
-		else
-			value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
+		/* This can only occur with the OpenGL Core profile, which
+		 * doesn't support smoothing.
+		 */
+		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
 		break;
 
 	case TGSI_SEMANTIC_TESSCOORD:
 	{
 		LLVMValueRef coord[4] = {
-			LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_u),
-			LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_v),
+			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
+			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
 			bld->zero,
 			bld->zero
 		};
 
 		/* For triangles, the vector should be (u, v, 1-u-v). */
-		if (si_shader_ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
+		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
 		    PIPE_PRIM_TRIANGLES)
 			coord[2] = lp_build_sub(bld, bld->one,
 						lp_build_add(bld, coord[0], coord[1]));
@@ -1166,7 +1225,7 @@ static void declare_system_value(
 	}
 
 	case TGSI_SEMANTIC_VERTICESIN:
-		value = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
+		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
 		break;
 
 	case TGSI_SEMANTIC_TESSINNER:
@@ -1175,7 +1234,7 @@ static void declare_system_value(
 		LLVMValueRef dw_addr;
 		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
 
-		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
 		dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr,
 				       lp_build_const_int32(gallivm, param * 4), "");
 
@@ -1197,13 +1256,13 @@ static void declare_system_value(
 }
 
 static LLVMValueRef fetch_constant(
-	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_tgsi_context *bld_base,
 	const struct tgsi_full_src_register *reg,
 	enum tgsi_opcode_type type,
 	unsigned swizzle)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
-	struct lp_build_context * base = &bld_base->base;
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct lp_build_context *base = &bld_base->base;
 	const struct tgsi_ind_register *ireg = &reg->Indirect;
 	unsigned buf, idx;
 
@@ -1224,44 +1283,44 @@ static LLVMValueRef fetch_constant(
 
 	if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
 		if (type != TGSI_TYPE_DOUBLE)
-			return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]);
+			return bitcast(bld_base, type, ctx->constants[buf][idx]);
 		else {
 			return radeon_llvm_emit_fetch_double(bld_base,
-							     si_shader_ctx->constants[buf][idx],
-							     si_shader_ctx->constants[buf][idx + 1]);
+							     ctx->constants[buf][idx],
+							     ctx->constants[buf][idx + 1]);
 		}
 	}
 
 	if (reg->Register.Dimension && reg->Dimension.Indirect) {
-		LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
+		LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 		LLVMValueRef index;
-		index = get_indirect_index(si_shader_ctx, &reg->DimIndirect,
+		index = get_indirect_index(ctx, &reg->DimIndirect,
 						   reg->Dimension.Index);
-		bufp = build_indexed_load_const(si_shader_ctx, ptr, index);
+		bufp = build_indexed_load_const(ctx, ptr, index);
 	} else
-		bufp = si_shader_ctx->const_buffers[buf];
+		bufp = ctx->const_buffers[buf];
 
-	addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
+	addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
 	addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
 	addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
 	addr = lp_build_add(&bld_base->uint_bld, addr,
 			    lp_build_const_int32(base->gallivm, idx * 4));
 
 	result = buffer_load_const(base->gallivm->builder, bufp,
-				   addr, bld_base->base.elem_type);
+				   addr, ctx->f32);
 
 	if (type != TGSI_TYPE_DOUBLE)
 		result = bitcast(bld_base, type, result);
 	else {
 		LLVMValueRef addr2, result2;
-		addr2 = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
+		addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
 		addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
 		addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
 		addr2 = lp_build_add(&bld_base->uint_bld, addr2,
 				     lp_build_const_int32(base->gallivm, idx * 4));
 
-		result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_buffers[buf],
-				   addr2, bld_base->base.elem_type);
+		result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
+				   addr2, ctx->f32);
 
 		result = radeon_llvm_emit_fetch_double(bld_base,
 					               result, result2);
@@ -1269,26 +1328,47 @@ static LLVMValueRef fetch_constant(
 	return result;
 }
 
+/* Returns val[0] | (val[1] << 16). The upper 16 bits of val[0] must be zero; those of val[1] are shifted out. */
+static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
+					   LLVMValueRef val[2])
+{
+	return LLVMBuildOr(gallivm->builder, val[0],
+			   LLVMBuildShl(gallivm->builder, val[1],
+					lp_build_const_int32(gallivm, 16),
+					""), "");
+}
+
+/* Packs the low 16 bits of val[0] and val[1] into one value. Upper 16 bits are ignored and will be dropped. */
+static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
+						    LLVMValueRef val[2])
+{
+	LLVMValueRef v[2] = {
+		LLVMBuildAnd(gallivm->builder, val[0],
+			     lp_build_const_int32(gallivm, 0xffff), ""), /* clear the upper half before it is ORed in */
+		val[1], /* not masked here: si_llvm_pack_two_int16 shifts it left by 16 */
+	};
+	return si_llvm_pack_two_int16(gallivm, v);
+}
+
 /* Initialize arguments for the shader export intrinsic */
 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 				     LLVMValueRef *values,
 				     unsigned target,
 				     LLVMValueRef *args)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct lp_build_context *uint =
-				&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
+				&ctx->radeon_bld.soa.bld_base.uint_bld;
 	struct lp_build_context *base = &bld_base->base;
-	unsigned compressed = 0;
+	struct gallivm_state *gallivm = base->gallivm;
+	LLVMBuilderRef builder = base->gallivm->builder;
+	LLVMValueRef val[4];
+	unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
 	unsigned chan;
+	bool is_int8;
 
-	/* XXX: This controls which components of the output
-	 * registers actually get exported. (e.g bit 0 means export
-	 * X component, bit 1 means export Y component, etc.)  I'm
-	 * hard coding this to 0xf for now.  In the future, we might
-	 * want to do something else.
-	 */
-	args[0] = lp_build_const_int32(base->gallivm, 0xf);
+	/* Default is 0xf. Adjusted below depending on the format. */
+	args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
 
 	/* Specify whether the EXEC mask represents the valid mask */
 	args[1] = uint->zero;
@@ -1299,18 +1379,48 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 	/* Specify the target we are exporting */
 	args[3] = lp_build_const_int32(base->gallivm, target);
 
-	if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+		const union si_shader_key *key = &ctx->shader->key;
+		unsigned col_formats = key->ps.epilog.spi_shader_col_format;
 		int cbuf = target - V_008DFC_SQ_EXP_MRT;
 
-		if (cbuf >= 0 && cbuf < 8)
-			compressed = (si_shader_ctx->shader->key.ps.export_16bpc >> cbuf) & 0x1;
+		assert(cbuf >= 0 && cbuf < 8);
+		spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
+		is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
 	}
 
-	/* Set COMPR flag */
-	args[4] = compressed ? uint->one : uint->zero;
+	args[4] = uint->zero; /* COMPR flag */
+	args[5] = base->undef;
+	args[6] = base->undef;
+	args[7] = base->undef;
+	args[8] = base->undef;
+
+	switch (spi_shader_col_format) {
+	case V_028714_SPI_SHADER_ZERO:
+		args[0] = uint->zero; /* writemask */
+		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
+		break;
+
+	case V_028714_SPI_SHADER_32_R:
+		args[0] = uint->one; /* writemask */
+		args[5] = values[0];
+		break;
+
+	case V_028714_SPI_SHADER_32_GR:
+		args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
+		args[5] = values[0];
+		args[6] = values[1];
+		break;
+
+	case V_028714_SPI_SHADER_32_AR:
+		args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
+		args[5] = values[0];
+		args[8] = values[3];
+		break;
+
+	case V_028714_SPI_SHADER_FP16_ABGR:
+		args[4] = uint->one; /* COMPR flag */
 
-	if (compressed) {
-		/* Pixel shader needs to pack output values before export */
 		for (chan = 0; chan < 2; chan++) {
 			LLVMValueRef pack_args[2] = {
 				values[2 * chan],
@@ -1320,33 +1430,122 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 
 			packed = lp_build_intrinsic(base->gallivm->builder,
 						    "llvm.SI.packf16",
-						    LLVMInt32TypeInContext(base->gallivm->context),
-						    pack_args, 2,
+						    ctx->i32, pack_args, 2,
 						    LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 			args[chan + 5] =
 				LLVMBuildBitCast(base->gallivm->builder,
-						 packed,
-						 LLVMFloatTypeInContext(base->gallivm->context),
-						 "");
-			args[chan + 7] = base->undef;
+						 packed, ctx->f32, "");
 		}
-	} else
+		break;
+
+	case V_028714_SPI_SHADER_UNORM16_ABGR:
+		for (chan = 0; chan < 4; chan++) {
+			val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
+			val[chan] = LLVMBuildFMul(builder, val[chan],
+						  lp_build_const_float(gallivm, 65535), "");
+			val[chan] = LLVMBuildFAdd(builder, val[chan],
+						  lp_build_const_float(gallivm, 0.5), "");
+			val[chan] = LLVMBuildFPToUI(builder, val[chan],
+						    ctx->i32, "");
+		}
+
+		args[4] = uint->one; /* COMPR flag */
+		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int16(gallivm, val));
+		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int16(gallivm, val+2));
+		break;
+
+	case V_028714_SPI_SHADER_SNORM16_ABGR:
+		for (chan = 0; chan < 4; chan++) {
+			/* Clamp between [-1, 1]. */
+			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
+							      values[chan],
+							      lp_build_const_float(gallivm, 1));
+			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
+							      val[chan],
+							      lp_build_const_float(gallivm, -1));
+			/* Convert to a signed integer in [-32767, 32767]. */
+			val[chan] = LLVMBuildFMul(builder, val[chan],
+						  lp_build_const_float(gallivm, 32767), "");
+			/* If positive, add 0.5, else add -0.5. */
+			val[chan] = LLVMBuildFAdd(builder, val[chan],
+					LLVMBuildSelect(builder,
+						LLVMBuildFCmp(builder, LLVMRealOGE,
+							      val[chan], base->zero, ""),
+						lp_build_const_float(gallivm, 0.5),
+						lp_build_const_float(gallivm, -0.5), ""), "");
+			val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
+		}
+
+		args[4] = uint->one; /* COMPR flag */
+		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int32_as_int16(gallivm, val));
+		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
+		break;
+
+	case V_028714_SPI_SHADER_UINT16_ABGR: {
+		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
+							255 : 65535);
+		/* Clamp. */
+		for (chan = 0; chan < 4; chan++) {
+			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
+			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
+							      val[chan], max);
+		}
+
+		args[4] = uint->one; /* COMPR flag */
+		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int16(gallivm, val));
+		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int16(gallivm, val+2));
+		break;
+	}
+
+	case V_028714_SPI_SHADER_SINT16_ABGR: {
+		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
+							127 : 32767);
+		LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
+							-128 : -32768);
+		/* Clamp. */
+		for (chan = 0; chan < 4; chan++) {
+			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
+			val[chan] = lp_build_emit_llvm_binary(bld_base,
+							      TGSI_OPCODE_IMIN,
+							      val[chan], max);
+			val[chan] = lp_build_emit_llvm_binary(bld_base,
+							      TGSI_OPCODE_IMAX,
+							      val[chan], min);
+		}
+
+		args[4] = uint->one; /* COMPR flag */
+		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int32_as_int16(gallivm, val));
+		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
+		break;
+	}
+
+	case V_028714_SPI_SHADER_32_ABGR:
 		memcpy(&args[5], values, sizeof(values[0]) * 4);
+		break;
+	}
 }
 
 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 			  LLVMValueRef alpha)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 
-	if (si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_NEVER) {
-		LLVMValueRef alpha_ref = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+	if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
+		LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
 				SI_PARAM_ALPHA_REF);
 
 		LLVMValueRef alpha_pass =
 			lp_build_cmp(&bld_base->base,
-				     si_shader_ctx->shader->key.ps.alpha_func,
+				     ctx->shader->key.ps.epilog.alpha_func,
 				     alpha, alpha_ref);
 		LLVMValueRef arg =
 			lp_build_select(&bld_base->base,
@@ -1354,36 +1553,33 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 					lp_build_const_float(gallivm, 1.0f),
 					lp_build_const_float(gallivm, -1.0f));
 
-		lp_build_intrinsic(gallivm->builder,
-				"llvm.AMDGPU.kill",
-				LLVMVoidTypeInContext(gallivm->context),
-				&arg, 1, 0);
+		lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
+				   ctx->voidt, &arg, 1, 0);
 	} else {
-		lp_build_intrinsic(gallivm->builder,
-				"llvm.AMDGPU.kilp",
-				LLVMVoidTypeInContext(gallivm->context),
-				NULL, 0, 0);
+		lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
+				   ctx->voidt, NULL, 0, 0);
 	}
 }
 
 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
-						  LLVMValueRef alpha)
+						  LLVMValueRef alpha,
+						  unsigned samplemask_param)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMValueRef coverage;
 
 	/* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
-	coverage = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-				SI_PARAM_SAMPLE_COVERAGE);
+	coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
+				samplemask_param);
 	coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
 
 	coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
-				   bld_base->int_bld.elem_type,
+				   ctx->i32,
 				   &coverage, 1, LLVMReadNoneAttribute);
 
 	coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
-				   bld_base->base.elem_type, "");
+				   ctx->f32, "");
 
 	coverage = LLVMBuildFMul(gallivm->builder, coverage,
 				 lp_build_const_float(gallivm,
@@ -1392,19 +1588,19 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *
 	return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
 }
 
-static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
+static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
 				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct lp_build_context *base = &bld_base->base;
-	struct lp_build_context *uint = &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
+	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
 	unsigned reg_index;
 	unsigned chan;
 	unsigned const_chan;
 	LLVMValueRef base_elt;
-	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
+	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, SI_DRIVER_STATE_CONST_BUF);
-	LLVMValueRef const_resource = build_indexed_load_const(si_shader_ctx, ptr, constbuf_index);
+	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);
 
 	for (reg_index = 0; reg_index < 2; reg_index ++) {
 		LLVMValueRef *args = pos[2 + reg_index];
@@ -1421,7 +1617,7 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
 							       ((reg_index * 4 + chan) * 4 +
 								const_chan) * 4);
 				base_elt = buffer_load_const(base->gallivm->builder, const_resource,
-						      args[1], base->elem_type);
+						      args[1], ctx->f32);
 				args[5 + chan] =
 					lp_build_add(base, args[5 + chan],
 						     lp_build_mul(base, base_elt,
@@ -1462,7 +1658,7 @@ static void si_dump_streamout(struct pipe_stream_output_info *so)
 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
  * or v4i32 (num_channels=3,4). */
-static void build_tbuffer_store(struct si_shader_context *shader,
+static void build_tbuffer_store(struct si_shader_context *ctx,
 				LLVMValueRef rsrc,
 				LLVMValueRef vdata,
 				unsigned num_channels,
@@ -1477,22 +1673,21 @@ static void build_tbuffer_store(struct si_shader_context *shader,
 				unsigned slc,
 				unsigned tfe)
 {
-	struct gallivm_state *gallivm = &shader->radeon_bld.gallivm;
-	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 	LLVMValueRef args[] = {
 		rsrc,
 		vdata,
-		LLVMConstInt(i32, num_channels, 0),
+		LLVMConstInt(ctx->i32, num_channels, 0),
 		vaddr,
 		soffset,
-		LLVMConstInt(i32, inst_offset, 0),
-		LLVMConstInt(i32, dfmt, 0),
-		LLVMConstInt(i32, nfmt, 0),
-		LLVMConstInt(i32, offen, 0),
-		LLVMConstInt(i32, idxen, 0),
-		LLVMConstInt(i32, glc, 0),
-		LLVMConstInt(i32, slc, 0),
-		LLVMConstInt(i32, tfe, 0)
+		LLVMConstInt(ctx->i32, inst_offset, 0),
+		LLVMConstInt(ctx->i32, dfmt, 0),
+		LLVMConstInt(ctx->i32, nfmt, 0),
+		LLVMConstInt(ctx->i32, offen, 0),
+		LLVMConstInt(ctx->i32, idxen, 0),
+		LLVMConstInt(ctx->i32, glc, 0),
+		LLVMConstInt(ctx->i32, slc, 0),
+		LLVMConstInt(ctx->i32, tfe, 0)
 	};
 
 	/* The instruction offset field has 12 bits */
@@ -1504,12 +1699,11 @@ static void build_tbuffer_store(struct si_shader_context *shader,
 	char name[256];
 	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
 
-	lp_build_intrinsic(gallivm->builder, name,
-			   LLVMVoidTypeInContext(gallivm->context),
+	lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
 			   args, Elements(args), 0);
 }
 
-static void build_tbuffer_store_dwords(struct si_shader_context *shader,
+static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
 				     LLVMValueRef rsrc,
 				     LLVMValueRef vdata,
 				     unsigned num_channels,
@@ -1525,30 +1719,28 @@ static void build_tbuffer_store_dwords(struct si_shader_context *shader,
 	};
 	assert(num_channels >= 1 && num_channels <= 4);
 
-	build_tbuffer_store(shader, rsrc, vdata, num_channels, vaddr, soffset,
+	build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
 			    inst_offset, dfmt[num_channels-1],
 			    V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
 }
 
 /* On SI, the vertex shader is responsible for writing streamout data
  * to buffers. */
-static void si_llvm_emit_streamout(struct si_shader_context *shader,
+static void si_llvm_emit_streamout(struct si_shader_context *ctx,
 				   struct si_shader_output_values *outputs,
 				   unsigned noutput)
 {
-	struct pipe_stream_output_info *so = &shader->shader->selector->so;
-	struct gallivm_state *gallivm = &shader->radeon_bld.gallivm;
+	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	int i, j;
 	struct lp_build_if_state if_ctx;
 
-	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
-
 	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
 	LLVMValueRef so_vtx_count =
-		unpack_param(shader, shader->param_streamout_config, 16, 7);
+		unpack_param(ctx, ctx->param_streamout_config, 16, 7);
 
-	LLVMValueRef tid = lp_build_intrinsic(builder, "llvm.SI.tid", i32,
+	LLVMValueRef tid = lp_build_intrinsic(builder, "llvm.SI.tid", ctx->i32,
 					   NULL, 0, LLVMReadNoneAttribute);
 
 	/* can_emit = tid < so_vtx_count; */
@@ -1556,7 +1748,7 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
 
 	LLVMValueRef stream_id =
-		unpack_param(shader, shader->param_streamout_config, 24, 2);
+		unpack_param(ctx, ctx->param_streamout_config, 24, 2);
 
 	/* Emit the streamout code conditionally. This actually avoids
 	 * out-of-bounds buffer access. The hw tells us via the SGPR
@@ -1570,8 +1762,8 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
                  */
 
 		LLVMValueRef so_write_index =
-			LLVMGetParam(shader->radeon_bld.main_fn,
-				     shader->param_streamout_write_index);
+			LLVMGetParam(ctx->radeon_bld.main_fn,
+				     ctx->param_streamout_write_index);
 
 		/* Compute (streamout_write_index + thread_id). */
 		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
@@ -1582,12 +1774,12 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 			if (!so->stride[i])
 				continue;
 
-			LLVMValueRef so_offset = LLVMGetParam(shader->radeon_bld.main_fn,
-							      shader->param_streamout_offset[i]);
-			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(i32, 4, 0), "");
+			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
+							      ctx->param_streamout_offset[i]);
+			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
 
 			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
-							  LLVMConstInt(i32, so->stride[i]*4, 0), "");
+							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
 			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
 		}
 
@@ -1612,7 +1804,7 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 			for (j = 0; j < num_comps; j++) {
 				out[j] = LLVMBuildBitCast(builder,
 							  outputs[reg].values[start+j],
-						i32, "");
+						ctx->i32, "");
 			}
 
 			/* Pack the output. */
@@ -1625,10 +1817,10 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 			case 2: /* as v2i32 */
 			case 3: /* as v4i32 (aligned to 4) */
 			case 4: /* as v4i32 */
-				vdata = LLVMGetUndef(LLVMVectorType(i32, util_next_power_of_two(num_comps)));
+				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
 				for (j = 0; j < num_comps; j++) {
 					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
-								       LLVMConstInt(i32, j, 0), "");
+								       LLVMConstInt(ctx->i32, j, 0), "");
 				}
 				break;
 			}
@@ -1639,10 +1831,10 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 					      lp_build_const_int32(gallivm, stream), "");
 
 			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
-			build_tbuffer_store_dwords(shader, shader->so_buffers[buf_idx],
+			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
 						   vdata, num_comps,
 						   so_write_offset[buf_idx],
-						   LLVMConstInt(i32, 0, 0),
+						   LLVMConstInt(ctx->i32, 0, 0),
 						   so->output[i].dst_offset*4);
 			lp_build_endif(&if_ctx_stream);
 		}
@@ -1656,11 +1848,11 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
 			      struct si_shader_output_values *outputs,
 			      unsigned noutput)
 {
-	struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
-	struct si_shader * shader = si_shader_ctx->shader;
-	struct lp_build_context * base = &bld_base->base;
-	struct lp_build_context * uint =
-				&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct si_shader *shader = ctx->shader;
+	struct lp_build_context *base = &bld_base->base;
+	struct lp_build_context *uint =
+				&ctx->radeon_bld.soa.bld_base.uint_bld;
 	LLVMValueRef args[9];
 	LLVMValueRef pos_args[4][9] = { { 0 } };
 	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
@@ -1670,8 +1862,8 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
 	unsigned pos_idx;
 	int i;
 
-	if (outputs && si_shader_ctx->shader->selector->so.num_outputs) {
-		si_llvm_emit_streamout(si_shader_ctx, outputs, noutput);
+	if (outputs && ctx->shader->selector->so.num_outputs) {
+		si_llvm_emit_streamout(ctx, outputs, noutput);
 	}
 
 	for (i = 0; i < noutput; i++) {
@@ -1733,8 +1925,7 @@ handle_semantic:
 			       args, sizeof(args));
 		} else {
 			lp_build_intrinsic(base->gallivm->builder,
-					   "llvm.SI.export",
-					   LLVMVoidTypeInContext(base->gallivm->context),
+					   "llvm.SI.export", ctx->voidt,
 					   args, 9, 0);
 		}
 
@@ -1786,7 +1977,7 @@ handle_semantic:
 			 * with the first bit containing the edge flag. */
 			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
 							 edgeflag_value,
-							 bld_base->uint_bld.elem_type, "");
+							 ctx->i32, "");
 			edgeflag_value = lp_build_min(&bld_base->int_bld,
 						      edgeflag_value,
 						      bld_base->int_bld.one);
@@ -1794,7 +1985,7 @@ handle_semantic:
 			/* The LLVM intrinsic expects a float. */
 			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
 							  edgeflag_value,
-							  base->elem_type, "");
+							  ctx->f32, "");
 		}
 
 		if (shader->selector->info.writes_layer)
@@ -1820,28 +2011,25 @@ handle_semantic:
 			/* Specify that this is the last export */
 			pos_args[i][2] = uint->one;
 
-		lp_build_intrinsic(base->gallivm->builder,
-				   "llvm.SI.export",
-				   LLVMVoidTypeInContext(base->gallivm->context),
-				   pos_args[i], 9, 0);
+		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+				   ctx->voidt, pos_args[i], 9, 0);
 	}
 }
 
-/* This only writes the tessellation factor levels. */
-static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
+static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
+				  LLVMValueRef rel_patch_id,
+				  LLVMValueRef invocation_id,
+				  LLVMValueRef tcs_out_current_patch_data_offset)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	struct si_shader *shader = si_shader_ctx->shader;
+	struct si_shader *shader = ctx->shader;
 	unsigned tess_inner_index, tess_outer_index;
-	LLVMValueRef lds_base, lds_inner, lds_outer;
-	LLVMValueRef tf_base, rel_patch_id, byteoffset, buffer, rw_buffers;
-	LLVMValueRef out[6], vec0, vec1, invocation_id;
+	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
+	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
 	unsigned stride, outer_comps, inner_comps, i;
 	struct lp_build_if_state if_ctx;
 
-	invocation_id = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5);
-
 	/* Do this only for invocation 0, because the tess levels are per-patch,
 	 * not per-vertex.
 	 *
@@ -1853,7 +2041,7 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 				  invocation_id, bld_base->uint_bld.zero, ""));
 
 	/* Determine the layout of one tess factor element in the buffer. */
-	switch (shader->key.tcs.prim_mode) {
+	switch (shader->key.tcs.epilog.prim_mode) {
 	case PIPE_PRIM_LINES:
 		stride = 2; /* 2 dwords, 1 vec2 store */
 		outer_comps = 2;
@@ -1880,7 +2068,7 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
 	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
 
-	lds_base = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+	lds_base = tcs_out_current_patch_data_offset;
 	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
 				 lp_build_const_int32(gallivm,
 						      tess_inner_index * 4), "");
@@ -1901,45 +2089,95 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
 
 	/* Get the buffer. */
-	rw_buffers = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
 				  SI_PARAM_RW_BUFFERS);
-	buffer = build_indexed_load_const(si_shader_ctx, rw_buffers,
+	buffer = build_indexed_load_const(ctx, rw_buffers,
 			lp_build_const_int32(gallivm, SI_RING_TESS_FACTOR));
 
 	/* Get the offset. */
-	tf_base = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
 			       SI_PARAM_TESS_FACTOR_OFFSET);
-	rel_patch_id = get_rel_patch_id(si_shader_ctx);
 	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
 				  lp_build_const_int32(gallivm, 4 * stride), "");
 
 	/* Store the outputs. */
-	build_tbuffer_store_dwords(si_shader_ctx, buffer, vec0,
+	build_tbuffer_store_dwords(ctx, buffer, vec0,
 				   MIN2(stride, 4), byteoffset, tf_base, 0);
 	if (vec1)
-		build_tbuffer_store_dwords(si_shader_ctx, buffer, vec1,
+		build_tbuffer_store_dwords(ctx, buffer, vec1,
 					   stride - 4, byteoffset, tf_base, 16);
 	lp_build_endif(&if_ctx);
 }
 
-static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base)
+/* This only writes the tessellation factor levels. */
+static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
+
+	rel_patch_id = get_rel_patch_id(ctx);
+	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
+	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
+
+	if (!ctx->is_monolithic) {
+		/* Return epilog parameters from this function. */
+		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+		LLVMValueRef ret = ctx->return_value;
+		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
+		unsigned vgpr;
+
+		/* RW_BUFFERS pointer */
+		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
+					  SI_PARAM_RW_BUFFERS);
+		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
+		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
+		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
+					      bld_base->uint_bld.zero, "");
+		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
+					      bld_base->uint_bld.one, "");
+		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
+		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
+
+		/* Tess factor buffer soffset is after user SGPRs. */
+		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
+					  SI_PARAM_TESS_FACTOR_OFFSET);
+		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
+					   SI_TCS_NUM_USER_SGPR, "");
+
+		/* VGPRs */
+		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
+		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
+		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
+
+		vgpr = SI_TCS_NUM_USER_SGPR + 1;
+		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
+		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
+		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+		ctx->return_value = ret;
+		return;
+	}
+
+	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
+}
+
+static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
-	struct si_shader *shader = si_shader_ctx->shader;
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct si_shader *shader = ctx->shader;
 	struct tgsi_shader_info *info = &shader->selector->info;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	unsigned i, chan;
-	LLVMValueRef vertex_id = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-					      si_shader_ctx->param_rel_auto_id);
+	LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
+					      ctx->param_rel_auto_id);
 	LLVMValueRef vertex_dw_stride =
-		unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
+		unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
 	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
 						 vertex_dw_stride, "");
 
 	/* Write outputs to LDS. The next shader (TCS aka HS) will read
 	 * its inputs from it. */
 	for (i = 0; i < info->num_outputs; i++) {
-		LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i];
+		LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
 		unsigned name = info->output_semantic_name[i];
 		unsigned index = info->output_semantic_index[i];
 		int param = si_shader_io_get_unique_index(name, index);
@@ -1953,21 +2191,20 @@ static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base)
 	}
 }
 
-static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
+static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	struct si_shader *es = si_shader_ctx->shader;
+	struct si_shader *es = ctx->shader;
 	struct tgsi_shader_info *info = &es->selector->info;
-	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
-	LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-					    si_shader_ctx->param_es2gs_offset);
+	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
+					    ctx->param_es2gs_offset);
 	unsigned chan;
 	int i;
 
 	for (i = 0; i < info->num_outputs; i++) {
 		LLVMValueRef *out_ptr =
-			si_shader_ctx->radeon_bld.soa.outputs[i];
+			ctx->radeon_bld.soa.outputs[i];
 		int param_index;
 
 		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
@@ -1979,12 +2216,12 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 
 		for (chan = 0; chan < 4; chan++) {
 			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
-			out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, "");
+			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
 
-			build_tbuffer_store(si_shader_ctx,
-					    si_shader_ctx->esgs_ring,
+			build_tbuffer_store(ctx,
+					    ctx->esgs_ring,
 					    out_val, 1,
-					    LLVMGetUndef(i32), soffset,
+					    LLVMGetUndef(ctx->i32), soffset,
 					    (4 * param_index + chan) * 4,
 					    V_008F0C_BUF_DATA_FORMAT_32,
 					    V_008F0C_BUF_NUM_FORMAT_UINT,
@@ -1995,25 +2232,26 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 
 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMValueRef args[2];
 
 	args[0] = lp_build_const_int32(gallivm,	SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
-	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
+	args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
 	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
-			LLVMVoidTypeInContext(gallivm->context), args, 2,
-			LLVMNoUnwindAttribute);
+			   ctx->voidt, args, 2, LLVMNoUnwindAttribute);
 }
 
-static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
+static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	struct tgsi_shader_info *info = &si_shader_ctx->shader->selector->info;
+	struct tgsi_shader_info *info = &ctx->shader->selector->info;
 	struct si_shader_output_values *outputs = NULL;
 	int i,j;
 
+	assert(!ctx->is_gs_copy_shader);
+
 	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
 
 	/* Vertex color clamping.
@@ -2022,8 +2260,7 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 	 * an IF statement is added that clamps all colors if the constant
 	 * is true.
 	 */
-	if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
-	    !si_shader_ctx->shader->is_gs_copy_shader) {
+	if (ctx->type == TGSI_PROCESSOR_VERTEX) {
 		struct lp_build_if_state if_ctx;
 		LLVMValueRef cond = NULL;
 		LLVMValueRef addr, val;
@@ -2036,15 +2273,15 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 			/* We've found a color. */
 			if (!cond) {
 				/* The state is in the first bit of the user SGPR. */
-				cond = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				cond = LLVMGetParam(ctx->radeon_bld.main_fn,
 						    SI_PARAM_VS_STATE_BITS);
 				cond = LLVMBuildTrunc(gallivm->builder, cond,
-						      LLVMInt1TypeInContext(gallivm->context), "");
+						      ctx->i1, "");
 				lp_build_if(&if_ctx, gallivm, cond);
 			}
 
 			for (j = 0; j < 4; j++) {
-				addr = si_shader_ctx->radeon_bld.soa.outputs[i][j];
+				addr = ctx->radeon_bld.soa.outputs[i][j];
 				val = LLVMBuildLoad(gallivm->builder, addr, "");
 				val = radeon_llvm_saturate(bld_base, val);
 				LLVMBuildStore(gallivm->builder, val, addr);
@@ -2062,20 +2299,30 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 		for (j = 0; j < 4; j++)
 			outputs[i].values[j] =
 				LLVMBuildLoad(gallivm->builder,
-					      si_shader_ctx->radeon_bld.soa.outputs[i][j],
+					      ctx->radeon_bld.soa.outputs[i][j],
 					      "");
 	}
 
-	/* Export PrimitiveID when PS needs it. */
-	if (si_vs_exports_prim_id(si_shader_ctx->shader)) {
-		outputs[i].name = TGSI_SEMANTIC_PRIMID;
-		outputs[i].sid = 0;
-		outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
-					       get_primitive_id(bld_base, 0));
-		outputs[i].values[1] = bld_base->base.undef;
-		outputs[i].values[2] = bld_base->base.undef;
-		outputs[i].values[3] = bld_base->base.undef;
-		i++;
+	if (ctx->is_monolithic) {
+		/* Export PrimitiveID when PS needs it. */
+		if (si_vs_exports_prim_id(ctx->shader)) {
+			outputs[i].name = TGSI_SEMANTIC_PRIMID;
+			outputs[i].sid = 0;
+			outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+						       get_primitive_id(bld_base, 0));
+			outputs[i].values[1] = bld_base->base.undef;
+			outputs[i].values[2] = bld_base->base.undef;
+			outputs[i].values[3] = bld_base->base.undef;
+			i++;
+		}
+	} else {
+		/* Return the primitive ID from the LLVM function. */
+		ctx->return_value =
+			LLVMBuildInsertValue(gallivm->builder,
+					     ctx->return_value,
+					     bitcast(bld_base, TGSI_TYPE_FLOAT,
+						     get_primitive_id(bld_base, 0)),
+					     VS_EPILOG_PRIMID_LOC, "");
 	}
 
 	si_llvm_export_vs(bld_base, outputs, i);
@@ -2086,7 +2333,7 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
 			   LLVMValueRef depth, LLVMValueRef stencil,
 			   LLVMValueRef samplemask)
 {
-	struct si_screen *sscreen = si_shader_context(bld_base)->screen;
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct lp_build_context *base = &bld_base->base;
 	struct lp_build_context *uint = &bld_base->uint_bld;
 	LLVMValueRef args[9];
@@ -2123,71 +2370,89 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
 
 	/* SI (except OLAND) has a bug that it only looks
 	 * at the X writemask component. */
-	if (sscreen->b.chip_class == SI &&
-	    sscreen->b.family != CHIP_OLAND)
+	if (ctx->screen->b.chip_class == SI &&
+	    ctx->screen->b.family != CHIP_OLAND)
 		mask |= 0x1;
 
 	/* Specify which components to enable */
 	args[0] = lp_build_const_int32(base->gallivm, mask);
 
 	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
-			   LLVMVoidTypeInContext(base->gallivm->context),
-			   args, 9, 0);
+			   ctx->voidt, args, 9, 0);
 }
 
 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
 				LLVMValueRef *color, unsigned index,
+				unsigned samplemask_param,
 				bool is_last)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct lp_build_context *base = &bld_base->base;
-	LLVMValueRef args[9];
 	int i;
 
 	/* Clamp color */
-	if (si_shader_ctx->shader->key.ps.clamp_color)
+	if (ctx->shader->key.ps.epilog.clamp_color)
 		for (i = 0; i < 4; i++)
 			color[i] = radeon_llvm_saturate(bld_base, color[i]);
 
 	/* Alpha to one */
-	if (si_shader_ctx->shader->key.ps.alpha_to_one)
+	if (ctx->shader->key.ps.epilog.alpha_to_one)
 		color[3] = base->one;
 
 	/* Alpha test */
 	if (index == 0 &&
-	    si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
+	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
 		si_alpha_test(bld_base, color[3]);
 
 	/* Line & polygon smoothing */
-	if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
-		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]);
+	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
+		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
+							 samplemask_param);
 
 	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-	if (index == 0 &&
-	    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
-		for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
+	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
+		LLVMValueRef args[8][9];
+		int c, last = -1;
+
+		/* Get the export arguments, also find out what the last one is. */
+		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
 			si_llvm_init_export_args(bld_base, color,
-						 V_008DFC_SQ_EXP_MRT + c, args);
-			lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
-					   LLVMVoidTypeInContext(base->gallivm->context),
-					   args, 9, 0);
+						 V_008DFC_SQ_EXP_MRT + c, args[c]);
+			if (args[c][0] != bld_base->uint_bld.zero)
+				last = c;
 		}
-	}
 
-	/* Export */
-	si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
-				 args);
-	if (is_last) {
-		args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
-		args[2] = bld_base->uint_bld.one; /* DONE bit */
+		/* Emit all exports. */
+		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
+			if (is_last && last == c) {
+				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
+				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
+			} else if (args[c][0] == bld_base->uint_bld.zero)
+				continue; /* unnecessary NULL export */
+
+			lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+					   ctx->voidt, args[c], 9, 0);
+		}
+	} else {
+		LLVMValueRef args[9];
+
+		/* Export */
+		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
+					 args);
+		if (is_last) {
+			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
+			args[2] = bld_base->uint_bld.one; /* DONE bit */
+		} else if (args[0] == bld_base->uint_bld.zero)
+			return; /* unnecessary NULL export */
+
+		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+				   ctx->voidt, args, 9, 0);
 	}
-	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
-			   LLVMVoidTypeInContext(base->gallivm->context),
-			   args, 9, 0);
 }
 
 static void si_export_null(struct lp_build_tgsi_context *bld_base)
 {
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct lp_build_context *base = &bld_base->base;
 	struct lp_build_context *uint = &bld_base->uint_bld;
 	LLVMValueRef args[9];
@@ -2203,34 +2468,57 @@ static void si_export_null(struct lp_build_tgsi_context *bld_base)
 	args[8] = uint->undef; /* A */
 
 	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
-			   LLVMVoidTypeInContext(base->gallivm->context),
-			   args, 9, 0);
+			   ctx->voidt, args, 9, 0);
 }
 
-static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
+static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
-	struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
-	struct si_shader * shader = si_shader_ctx->shader;
-	struct lp_build_context * base = &bld_base->base;
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct si_shader *shader = ctx->shader;
+	struct lp_build_context *base = &bld_base->base;
 	struct tgsi_shader_info *info = &shader->selector->info;
 	LLVMBuilderRef builder = base->gallivm->builder;
 	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
 	int last_color_export = -1;
 	int i;
 
-	/* If there are no outputs, add a dummy export. */
-	if (!info->num_outputs) {
-		si_export_null(bld_base);
-		return;
-	}
-
 	/* Determine the last export. If MRTZ is present, it's always last.
 	 * Otherwise, find the last color export.
 	 */
-	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask)
-		for (i = 0; i < info->num_outputs; i++)
-			if (info->output_semantic_name[i] == TGSI_SEMANTIC_COLOR)
+	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
+		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;
+
+		/* Don't take the NULL-export early return if alpha-test is enabled. */
+		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
+		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
+		    (spi_format & 0xf) == 0)
+			spi_format |= V_028714_SPI_SHADER_32_AR;
+
+		for (i = 0; i < info->num_outputs; i++) {
+			unsigned index = info->output_semantic_index[i];
+
+			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
+				continue;
+
+			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+			if (shader->key.ps.epilog.last_cbuf > 0) {
+				/* Just set this if any of the colorbuffers are enabled. */
+				if (spi_format &
+				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
+					last_color_export = i;
+				continue;
+			}
+
+			if ((spi_format >> (index * 4)) & 0xf)
 				last_color_export = i;
+		}
+
+		/* If there are no enabled color exports, export NULL. */
+		if (last_color_export == -1) {
+			si_export_null(bld_base);
+			return;
+		}
+	}
 
 	for (i = 0; i < info->num_outputs; i++) {
 		unsigned semantic_name = info->output_semantic_name[i];
@@ -2242,22 +2530,23 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 		switch (semantic_name) {
 		case TGSI_SEMANTIC_POSITION:
 			depth = LLVMBuildLoad(builder,
-					      si_shader_ctx->radeon_bld.soa.outputs[i][2], "");
+					      ctx->radeon_bld.soa.outputs[i][2], "");
 			break;
 		case TGSI_SEMANTIC_STENCIL:
 			stencil = LLVMBuildLoad(builder,
-						si_shader_ctx->radeon_bld.soa.outputs[i][1], "");
+						ctx->radeon_bld.soa.outputs[i][1], "");
 			break;
 		case TGSI_SEMANTIC_SAMPLEMASK:
 			samplemask = LLVMBuildLoad(builder,
-						   si_shader_ctx->radeon_bld.soa.outputs[i][0], "");
+						   ctx->radeon_bld.soa.outputs[i][0], "");
 			break;
 		case TGSI_SEMANTIC_COLOR:
 			for (j = 0; j < 4; j++)
 				color[j] = LLVMBuildLoad(builder,
-							 si_shader_ctx->radeon_bld.soa.outputs[i][j], "");
+							 ctx->radeon_bld.soa.outputs[i][j], "");
 
 			si_export_mrt_color(bld_base, color, semantic_index,
+					    SI_PARAM_SAMPLE_COVERAGE,
 					    last_color_export == i);
 			break;
 		default:
@@ -2271,9 +2560,103 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 		si_export_mrt_z(bld_base, depth, stencil, samplemask);
 }
 
-static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
-				struct lp_build_tgsi_context * bld_base,
-				struct lp_build_emit_data * emit_data);
+/**
+ * Return PS outputs in this order:
+ *
+ * v[0:3] = color0.xyzw
+ * v[4:7] = color1.xyzw
+ * ...
+ * vN+0 = Depth
+ * vN+1 = Stencil
+ * vN+2 = SampleMask
+ * vN+3 = SampleMaskIn (used for OpenGL smoothing)
+ *
+ * The alpha-ref SGPR is returned via its original location.
+ */
+static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct si_shader *shader = ctx->shader;
+	struct lp_build_context *base = &bld_base->base;
+	struct tgsi_shader_info *info = &shader->selector->info;
+	LLVMBuilderRef builder = base->gallivm->builder;
+	unsigned i, j, first_vgpr, vgpr;
+
+	LLVMValueRef color[8][4] = {};
+	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+	LLVMValueRef ret;
+
+	/* Read the output values. */
+	for (i = 0; i < info->num_outputs; i++) {
+		unsigned semantic_name = info->output_semantic_name[i];
+		unsigned semantic_index = info->output_semantic_index[i];
+
+		switch (semantic_name) {
+		case TGSI_SEMANTIC_COLOR:
+			assert(semantic_index < 8);
+			for (j = 0; j < 4; j++) {
+				LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
+				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+				color[semantic_index][j] = result;
+			}
+			break;
+		case TGSI_SEMANTIC_POSITION:
+			depth = LLVMBuildLoad(builder,
+					      ctx->radeon_bld.soa.outputs[i][2], "");
+			break;
+		case TGSI_SEMANTIC_STENCIL:
+			stencil = LLVMBuildLoad(builder,
+						ctx->radeon_bld.soa.outputs[i][1], "");
+			break;
+		case TGSI_SEMANTIC_SAMPLEMASK:
+			samplemask = LLVMBuildLoad(builder,
+						   ctx->radeon_bld.soa.outputs[i][0], "");
+			break;
+		default:
+			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
+				semantic_name);
+		}
+	}
+
+	/* Fill the return structure. */
+	ret = ctx->return_value;
+
+	/* Set SGPRs. */
+	ret = LLVMBuildInsertValue(builder, ret,
+				   bitcast(bld_base, TGSI_TYPE_SIGNED,
+					   LLVMGetParam(ctx->radeon_bld.main_fn,
+							SI_PARAM_ALPHA_REF)),
+				   SI_SGPR_ALPHA_REF, "");
+
+	/* Set VGPRs */
+	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
+	for (i = 0; i < ARRAY_SIZE(color); i++) {
+		if (!color[i][0])
+			continue;
+
+		for (j = 0; j < 4; j++)
+			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
+	}
+	if (depth)
+		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
+	if (stencil)
+		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
+	if (samplemask)
+		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
+
+	/* Add the input sample mask for smoothing at the end. */
+	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
+		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
+	ret = LLVMBuildInsertValue(builder, ret,
+				   LLVMGetParam(ctx->radeon_bld.main_fn,
+						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
+
+	ctx->return_value = ret;
+}
+
+static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
+				struct lp_build_tgsi_context *bld_base,
+				struct lp_build_emit_data *emit_data);
 
 static bool tgsi_is_array_sampler(unsigned target)
 {
@@ -2286,20 +2669,20 @@ static bool tgsi_is_array_sampler(unsigned target)
 	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
 }
 
-static void set_tex_fetch_args(struct gallivm_state *gallivm,
+static void set_tex_fetch_args(struct si_shader_context *ctx,
 			       struct lp_build_emit_data *emit_data,
 			       unsigned opcode, unsigned target,
 			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
 			       LLVMValueRef *param, unsigned count,
 			       unsigned dmask)
 {
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 	unsigned num_args;
 	unsigned is_rect = target == TGSI_TEXTURE_RECT;
-	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 
 	/* Pad to power of two vector */
 	while (count < util_next_power_of_two(count))
-		param[count++] = LLVMGetUndef(i32);
+		param[count++] = LLVMGetUndef(ctx->i32);
 
 	/* Texture coordinates. */
 	if (count > 1)
@@ -2312,10 +2695,9 @@ static void set_tex_fetch_args(struct gallivm_state *gallivm,
 	num_args = 2;
 
 	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
-		emit_data->dst_type = LLVMVectorType(i32, 4);
+		emit_data->dst_type = ctx->v4i32;
 	else {
-		emit_data->dst_type = LLVMVectorType(
-			LLVMFloatTypeInContext(gallivm->context), 4);
+		emit_data->dst_type = ctx->v4f32;
 
 		emit_data->args[num_args++] = samp_ptr;
 	}
@@ -2335,14 +2717,66 @@ static void set_tex_fetch_args(struct gallivm_state *gallivm,
 
 static const struct lp_build_tgsi_action tex_action;
 
-static void tex_fetch_ptrs(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data,
+enum desc_type {
+	DESC_IMAGE,
+	DESC_FMASK,
+	DESC_SAMPLER
+};
+
+static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
+{
+	return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
+			       CONST_ADDR_SPACE);
+}
+
+/**
+ * Load an image view, fmask view, or sampler state descriptor.
+ */
+static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
+					    LLVMValueRef list, LLVMValueRef index,
+					    enum desc_type type)
+{
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+
+	switch (type) {
+	case DESC_IMAGE:
+		/* The image is at [0:7]. */
+		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
+		break;
+	case DESC_FMASK:
+		/* The FMASK is at [8:15]. */
+		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
+		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
+		break;
+	case DESC_SAMPLER:
+		/* The sampler state is at [12:15]. */
+		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
+		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
+		list = LLVMBuildPointerCast(builder, list,
+					    const_array(ctx->v4i32, 0), "");
+		break;
+	}
+
+	return build_indexed_load_const(ctx, list, index);
+}
+
+static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
+				     LLVMValueRef index, enum desc_type type)
+{
+	LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
+					 SI_PARAM_SAMPLERS);
+
+	return get_sampler_desc_custom(ctx, list, index, type);
+}
+
+static void tex_fetch_ptrs(
+	struct lp_build_tgsi_context *bld_base,
+	struct lp_build_emit_data *emit_data,
 	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	const struct tgsi_full_instruction * inst = emit_data->inst;
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	const struct tgsi_full_instruction *inst = emit_data->inst;
 	unsigned target = inst->Texture.Texture;
 	unsigned sampler_src;
 	unsigned sampler_index;
@@ -2354,37 +2788,33 @@ static void tex_fetch_ptrs(
 		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
 		LLVMValueRef ind_index;
 
-		ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index);
-
-		*res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
-		*res_ptr = build_indexed_load_const(si_shader_ctx, *res_ptr, ind_index);
+		ind_index = get_indirect_index(ctx, &reg->Indirect, reg->Register.Index);
 
-		*samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES);
-		*samp_ptr = build_indexed_load_const(si_shader_ctx, *samp_ptr, ind_index);
+		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);
 
 		if (target == TGSI_TEXTURE_2D_MSAA ||
 		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
-			ind_index = LLVMBuildAdd(gallivm->builder, ind_index,
-						 lp_build_const_int32(gallivm,
-								      SI_FMASK_TEX_OFFSET), "");
-			*fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
-			*fmask_ptr = build_indexed_load_const(si_shader_ctx, *fmask_ptr, ind_index);
+			*samp_ptr = NULL;
+			*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
+		} else {
+			*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
+			*fmask_ptr = NULL;
 		}
 	} else {
-		*res_ptr = si_shader_ctx->sampler_views[sampler_index];
-		*samp_ptr = si_shader_ctx->sampler_states[sampler_index];
-		*fmask_ptr = si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + sampler_index];
+		*res_ptr = ctx->sampler_views[sampler_index];
+		*samp_ptr = ctx->sampler_states[sampler_index];
+		*fmask_ptr = ctx->fmasks[sampler_index];
 	}
 }
 
 static void tex_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
+	struct lp_build_tgsi_context *bld_base,
+	struct lp_build_emit_data *emit_data)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
-	const struct tgsi_full_instruction * inst = emit_data->inst;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
 	unsigned opcode = inst->Instruction.Opcode;
 	unsigned target = inst->Texture.Texture;
 	LLVMValueRef coords[5], derivs[6];
@@ -2396,21 +2826,18 @@ static void tex_fetch_args(
 	unsigned num_deriv_channels = 0;
 	bool has_offset = inst->Texture.NumOffsets > 0;
 	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
-	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 	unsigned dmask = 0xf;
 
 	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
 
 	if (opcode == TGSI_OPCODE_TXQ) {
 		if (target == TGSI_TEXTURE_BUFFER) {
-			LLVMTypeRef v8i32 = LLVMVectorType(i32, 8);
-
 			/* Read the size from the buffer descriptor directly. */
-			LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, v8i32, "");
+			LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
 			LLVMValueRef size = LLVMBuildExtractElement(builder, res,
 							lp_build_const_int32(gallivm, 6), "");
 
-			if (si_shader_ctx->screen->b.chip_class >= VI) {
+			if (ctx->screen->b.chip_class >= VI) {
 				/* On VI, the descriptor contains the size in bytes,
 				 * but TXQ must return the size in elements.
 				 * The stride is always non-zero for resources using TXQ.
@@ -2433,24 +2860,21 @@ static void tex_fetch_args(
 		/* Textures - set the mip level. */
 		address[count++] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
 
-		set_tex_fetch_args(gallivm, emit_data, opcode, target, res_ptr,
+		set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
 				   NULL, address, count, 0xf);
 		return;
 	}
 
 	if (target == TGSI_TEXTURE_BUFFER) {
-		LLVMTypeRef i128 = LLVMIntTypeInContext(gallivm->context, 128);
-		LLVMTypeRef v2i128 = LLVMVectorType(i128, 2);
-		LLVMTypeRef i8 = LLVMInt8TypeInContext(gallivm->context);
-		LLVMTypeRef v16i8 = LLVMVectorType(i8, 16);
+		LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
 
 		/* Bitcast and truncate v8i32 to v16i8. */
 		LLVMValueRef res = res_ptr;
 		res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
 		res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
-		res = LLVMBuildBitCast(gallivm->builder, res, v16i8, "");
+		res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");
 
-		emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+		emit_data->dst_type = ctx->v4f32;
 		emit_data->args[0] = res;
 		emit_data->args[1] = bld_base->uint_bld.zero;
 		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
@@ -2587,7 +3011,7 @@ static void tex_fetch_args(
 
 	for (chan = 0; chan < count; chan++ ) {
 		address[chan] = LLVMBuildBitCast(gallivm->builder,
-						 address[chan], i32, "");
+						 address[chan], ctx->i32, "");
 	}
 
 	/* Adjust the sample index according to FMASK.
@@ -2624,14 +3048,14 @@ static void tex_fetch_args(
 		inst.Texture.Texture = target;
 		txf_emit_data.inst = &inst;
 		txf_emit_data.chan = 0;
-		set_tex_fetch_args(gallivm, &txf_emit_data, TGSI_OPCODE_TXF,
+		set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
 				   target, fmask_ptr, NULL,
 				   txf_address, txf_count, 0xf);
 		build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
 
 		/* Initialize some constants. */
-		LLVMValueRef four = LLVMConstInt(uint_bld->elem_type, 4, 0);
-		LLVMValueRef F = LLVMConstInt(uint_bld->elem_type, 0xF, 0);
+		LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
+		LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
 
 		/* Apply the formula. */
 		LLVMValueRef fmask =
@@ -2655,7 +3079,7 @@ static void tex_fetch_args(
 		 */
 		LLVMValueRef fmask_desc =
 			LLVMBuildBitCast(gallivm->builder, fmask_ptr,
-					 LLVMVectorType(uint_bld->elem_type, 8), "");
+					 ctx->v8i32, "");
 
 		LLVMValueRef fmask_word1 =
 			LLVMBuildExtractElement(gallivm->builder, fmask_desc,
@@ -2676,7 +3100,7 @@ static void tex_fetch_args(
 		if (inst->Texture.NumOffsets) {
 			struct lp_build_context *uint_bld = &bld_base->uint_bld;
 			struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-			const struct tgsi_texture_offset * off = inst->TexOffsets;
+			const struct tgsi_texture_offset *off = inst->TexOffsets;
 
 			assert(inst->Texture.NumOffsets == 1);
 
@@ -2735,15 +3159,15 @@ static void tex_fetch_args(
 		dmask = 1 << gather_comp;
 	}
 
-	set_tex_fetch_args(gallivm, emit_data, opcode, target, res_ptr,
+	set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
 			   samp_ptr, address, count, dmask);
 }
 
-static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
-				struct lp_build_tgsi_context * bld_base,
-				struct lp_build_emit_data * emit_data)
+static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
+				struct lp_build_tgsi_context *bld_base,
+				struct lp_build_emit_data *emit_data)
 {
-	struct lp_build_context * base = &bld_base->base;
+	struct lp_build_context *base = &bld_base->base;
 	unsigned opcode = emit_data->inst->Instruction.Opcode;
 	unsigned target = emit_data->inst->Texture.Texture;
 	char intr_name[127];
@@ -2844,14 +3268,13 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
 }
 
 static void si_llvm_emit_txqs(
-	const struct lp_build_tgsi_action * action,
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
+	const struct lp_build_tgsi_action *action,
+	struct lp_build_tgsi_context *bld_base,
+	struct lp_build_emit_data *emit_data)
 {
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
-	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
-	LLVMTypeRef v8i32 = LLVMVectorType(i32, 8);
 	LLVMValueRef res, samples;
 	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
 
@@ -2859,7 +3282,7 @@ static void si_llvm_emit_txqs(
 
 
 	/* Read the samples from the descriptor directly. */
-	res = LLVMBuildBitCast(builder, res_ptr, v8i32, "");
+	res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
 	samples = LLVMBuildExtractElement(
 		builder, res,
 		lp_build_const_int32(gallivm, 3), "");
@@ -2903,30 +3326,26 @@ static void si_llvm_emit_txqs(
 #define TID_MASK_LEFT     0xfffffffe
 
 static void si_llvm_emit_ddxy(
-	const struct lp_build_tgsi_action * action,
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
+	const struct lp_build_tgsi_action *action,
+	struct lp_build_tgsi_context *bld_base,
+	struct lp_build_emit_data *emit_data)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	struct lp_build_context * base = &bld_base->base;
 	const struct tgsi_full_instruction *inst = emit_data->inst;
 	unsigned opcode = inst->Instruction.Opcode;
 	LLVMValueRef indices[2];
 	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
 	LLVMValueRef tl, trbl, result[4];
-	LLVMTypeRef i32;
 	unsigned swizzle[4];
 	unsigned c;
 	int idx;
 	unsigned mask;
 
-	i32 = LLVMInt32TypeInContext(gallivm->context);
-
 	indices[0] = bld_base->uint_bld.zero;
-	indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
+	indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", ctx->i32,
 				     NULL, 0, LLVMReadNoneAttribute);
-	store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
 				 indices, 2, "");
 
 	if (opcode == TGSI_OPCODE_DDX_FINE)
@@ -2938,14 +3357,14 @@ static void si_llvm_emit_ddxy(
 
 	indices[1] = LLVMBuildAnd(gallivm->builder, indices[1],
 				  lp_build_const_int32(gallivm, mask), "");
-	load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
 				 indices, 2, "");
 
 	/* for DDX we want to next X pixel, DDY next Y pixel. */
 	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
 	indices[1] = LLVMBuildAdd(gallivm->builder, indices[1],
 				  lp_build_const_int32(gallivm, idx), "");
-	load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
 				 indices, 2, "");
 
 	for (c = 0; c < 4; ++c) {
@@ -2964,14 +3383,14 @@ static void si_llvm_emit_ddxy(
 		LLVMBuildStore(gallivm->builder,
 			       LLVMBuildBitCast(gallivm->builder,
 						lp_build_emit_fetch(bld_base, inst, 0, c),
-						i32, ""),
+						ctx->i32, ""),
 			       store_ptr);
 
 		tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
-		tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, "");
+		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
 
 		trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
-		trbl = LLVMBuildBitCast(gallivm->builder, trbl,	base->elem_type, "");
+		trbl = LLVMBuildBitCast(gallivm->builder, trbl,	ctx->f32, "");
 
 		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
 	}
@@ -2988,21 +3407,17 @@ static LLVMValueRef si_llvm_emit_ddxy_interp(
 	struct lp_build_tgsi_context *bld_base,
 	LLVMValueRef interp_ij)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	struct lp_build_context *base = &bld_base->base;
 	LLVMValueRef indices[2];
 	LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
 	LLVMValueRef tl, tr, bl, result[4];
-	LLVMTypeRef i32;
 	unsigned c;
 
-	i32 = LLVMInt32TypeInContext(gallivm->context);
-
 	indices[0] = bld_base->uint_bld.zero;
-	indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
+	indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", ctx->i32,
 					NULL, 0, LLVMReadNoneAttribute);
-	store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
 				 indices, 2, "");
 
 	temp = LLVMBuildAnd(gallivm->builder, indices[1],
@@ -3012,21 +3427,21 @@ static LLVMValueRef si_llvm_emit_ddxy_interp(
 			     lp_build_const_int32(gallivm, TID_MASK_TOP), "");
 
 	indices[1] = temp;
-	load_ptr_x = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+	load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
 				  indices, 2, "");
 
 	indices[1] = temp2;
-	load_ptr_y = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+	load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
 				  indices, 2, "");
 
 	indices[1] = LLVMBuildAdd(gallivm->builder, temp,
 				  lp_build_const_int32(gallivm, 1), "");
-	load_ptr_ddx = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+	load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
 				   indices, 2, "");
 
 	indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
 				  lp_build_const_int32(gallivm, 2), "");
-	load_ptr_ddy = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+	load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
 				   indices, 2, "");
 
 	for (c = 0; c < 2; ++c) {
@@ -3040,18 +3455,18 @@ static LLVMValueRef si_llvm_emit_ddxy_interp(
 			       store_ptr);
 
 		tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
-		tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, "");
+		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
 
 		tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
-		tr = LLVMBuildBitCast(gallivm->builder, tr, base->elem_type, "");
+		tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");
 
 		result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");
 
 		tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
-		tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, "");
+		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
 
 		bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
-		bl = LLVMBuildBitCast(gallivm->builder, bl, base->elem_type, "");
+		bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");
 
 		result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
 	}
@@ -3063,7 +3478,7 @@ static void interp_fetch_args(
 	struct lp_build_tgsi_context *bld_base,
 	struct lp_build_emit_data *emit_data)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	const struct tgsi_full_instruction *inst = emit_data->inst;
 
@@ -3087,9 +3502,8 @@ static void interp_fetch_args(
 		sample_id = lp_build_emit_fetch(bld_base,
 						emit_data->inst, 1, TGSI_CHAN_X);
 		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
-					     LLVMInt32TypeInContext(gallivm->context),
-					     "");
-		sample_position = load_sample_position(&si_shader_ctx->radeon_bld, sample_id);
+					     ctx->i32, "");
+		sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
 
 		emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
 							     sample_position,
@@ -3108,23 +3522,22 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 				struct lp_build_tgsi_context *bld_base,
 				struct lp_build_emit_data *emit_data)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
-	struct si_shader *shader = si_shader_ctx->shader;
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct si_shader *shader = ctx->shader;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMValueRef interp_param;
 	const struct tgsi_full_instruction *inst = emit_data->inst;
 	const char *intr_name;
-	int input_index;
+	int input_index = inst->Src[0].Register.Index;
 	int chan;
 	int i;
 	LLVMValueRef attr_number;
-	LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
-	LLVMValueRef params = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
+	LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
 	int interp_param_idx;
+	unsigned interp = shader->selector->info.input_interpolate[input_index];
 	unsigned location;
 
 	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
-	input_index = inst->Src[0].Register.Index;
 
 	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
 	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
@@ -3132,17 +3545,15 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 	else
 		location = TGSI_INTERPOLATE_LOC_CENTROID;
 
-	interp_param_idx = lookup_interp_param_index(shader->ps_input_interpolate[input_index],
-						     location);
+	interp_param_idx = lookup_interp_param_index(interp, location);
 	if (interp_param_idx == -1)
 		return;
 	else if (interp_param_idx)
-		interp_param = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, interp_param_idx);
+		interp_param = LLVMGetParam(ctx->radeon_bld.main_fn, interp_param_idx);
 	else
 		interp_param = NULL;
 
-	attr_number = lp_build_const_int32(gallivm,
-					   shader->ps_input_param_offset[input_index]);
+	attr_number = lp_build_const_int32(gallivm, input_index);
 
 	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
 	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
@@ -3169,7 +3580,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 			LLVMValueRef temp1, temp2;
 
 			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
-						     LLVMFloatTypeInContext(gallivm->context), "");
+						     ctx->f32, "");
 
 			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
 
@@ -3180,8 +3591,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
 
 			ij_out[i] = LLVMBuildBitCast(gallivm->builder,
-						     temp2,
-						     LLVMIntTypeInContext(gallivm->context, 32), "");
+						     temp2, ctx->i32, "");
 		}
 		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
 	}
@@ -3202,7 +3612,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 
 		emit_data->output[chan] =
 			lp_build_intrinsic(gallivm->builder, intr_name,
-					   input_type, args, args[3] ? 4 : 3,
+					   ctx->f32, args, args[3] ? 4 : 3,
 					   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 	}
 }
@@ -3226,13 +3636,12 @@ static void si_llvm_emit_vertex(
 	struct lp_build_tgsi_context *bld_base,
 	struct lp_build_emit_data *emit_data)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct lp_build_context *uint = &bld_base->uint_bld;
-	struct si_shader *shader = si_shader_ctx->shader;
+	struct si_shader *shader = ctx->shader;
 	struct tgsi_shader_info *info = &shader->selector->info;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
-	LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
 					    SI_PARAM_GS2VS_OFFSET);
 	LLVMValueRef gs_next_vertex;
 	LLVMValueRef can_emit, kill;
@@ -3245,7 +3654,7 @@ static void si_llvm_emit_vertex(
 
 	/* Write vertex attribute values to GSVS ring */
 	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
-				       si_shader_ctx->gs_next_vertex[stream],
+				       ctx->gs_next_vertex[stream],
 				       "");
 
 	/* If this thread has already emitted the declared maximum number of
@@ -3261,11 +3670,11 @@ static void si_llvm_emit_vertex(
 			       lp_build_const_float(gallivm, -1.0f));
 
 	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
-			   LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0);
+			   ctx->voidt, &kill, 1, 0);
 
 	for (i = 0; i < info->num_outputs; i++) {
 		LLVMValueRef *out_ptr =
-			si_shader_ctx->radeon_bld.soa.outputs[i];
+			ctx->radeon_bld.soa.outputs[i];
 
 		for (chan = 0; chan < 4; chan++) {
 			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
@@ -3276,10 +3685,10 @@ static void si_llvm_emit_vertex(
 			voffset = lp_build_add(uint, voffset, gs_next_vertex);
 			voffset = lp_build_mul_imm(uint, voffset, 4);
 
-			out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, "");
+			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
 
-			build_tbuffer_store(si_shader_ctx,
-					    si_shader_ctx->gsvs_ring[stream],
+			build_tbuffer_store(ctx,
+					    ctx->gsvs_ring[stream],
 					    out_val, 1,
 					    voffset, soffset, 0,
 					    V_008F0C_BUF_DATA_FORMAT_32,
@@ -3290,14 +3699,13 @@ static void si_llvm_emit_vertex(
 	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
 				      lp_build_const_int32(gallivm, 1));
 
-	LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex[stream]);
+	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
 
 	/* Signal vertex emission */
 	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
-	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
+	args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
 	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
-			LLVMVoidTypeInContext(gallivm->context), args, 2,
-			LLVMNoUnwindAttribute);
+			   ctx->voidt, args, 2, LLVMNoUnwindAttribute);
 }
 
 /* Cut one primitive from the geometry shader */
@@ -3306,7 +3714,7 @@ static void si_llvm_emit_primitive(
 	struct lp_build_tgsi_context *bld_base,
 	struct lp_build_emit_data *emit_data)
 {
-	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMValueRef args[2];
 	unsigned stream;
@@ -3314,21 +3722,22 @@ static void si_llvm_emit_primitive(
 	/* Signal primitive cut */
 	stream = si_llvm_get_stream(bld_base, emit_data);
 	args[0] = lp_build_const_int32(gallivm,	SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
-	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
+	args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
 	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
-			LLVMVoidTypeInContext(gallivm->context), args, 2,
-			LLVMNoUnwindAttribute);
+			   ctx->voidt, args, 2, LLVMNoUnwindAttribute);
 }
 
 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
 				 struct lp_build_tgsi_context *bld_base,
 				 struct lp_build_emit_data *emit_data)
 {
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 
-	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.barrier.local",
-			LLVMVoidTypeInContext(gallivm->context), NULL, 0,
-			LLVMNoUnwindAttribute);
+	lp_build_intrinsic(gallivm->builder,
+			   HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
+					       : "llvm.AMDGPU.barrier.local",
+			   ctx->voidt, NULL, 0, LLVMNoUnwindAttribute);
 }
 
 static const struct lp_build_tgsi_action tex_action = {
@@ -3341,25 +3750,43 @@ static const struct lp_build_tgsi_action interp_action = {
 	.emit = build_interp_intrinsic,
 };
 
-static void create_meta_data(struct si_shader_context *si_shader_ctx)
+static void si_create_function(struct si_shader_context *ctx,
+			       LLVMTypeRef *returns, unsigned num_returns,
+			       LLVMTypeRef *params, unsigned num_params,
+			       int last_array_pointer, int last_sgpr)
 {
-	struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
+	int i;
+
+	radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
+				params, num_params);
+	radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
+	ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
+
+	for (i = 0; i <= last_sgpr; ++i) {
+		LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
+
+		/* We tell LLVM that array inputs are passed by value to allow the Sinking
+		 * pass to move loads. Inputs are constant, so this is fine. */
+		if (i <= last_array_pointer)
+			LLVMAddAttribute(P, LLVMByValAttribute);
+		else
+			LLVMAddAttribute(P, LLVMInRegAttribute);
+	}
+}
+
+static void create_meta_data(struct si_shader_context *ctx)
+{
+	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
 	LLVMValueRef args[3];
 
 	args[0] = LLVMMDStringInContext(gallivm->context, "const", 5);
 	args[1] = 0;
 	args[2] = lp_build_const_int32(gallivm, 1);
 
-	si_shader_ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3);
-}
-
-static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
-{
-	return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
-			       CONST_ADDR_SPACE);
+	ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3);
 }
 
-static void declare_streamout_params(struct si_shader_context *si_shader_ctx,
+static void declare_streamout_params(struct si_shader_context *ctx,
 				     struct pipe_stream_output_info *so,
 				     LLVMTypeRef *params, LLVMTypeRef i32,
 				     unsigned *num_params)
@@ -3368,149 +3795,243 @@ static void declare_streamout_params(struct si_shader_context *si_shader_ctx,
 
 	/* Streamout SGPRs. */
 	if (so->num_outputs) {
-		params[si_shader_ctx->param_streamout_config = (*num_params)++] = i32;
-		params[si_shader_ctx->param_streamout_write_index = (*num_params)++] = i32;
+		params[ctx->param_streamout_config = (*num_params)++] = i32;
+		params[ctx->param_streamout_write_index = (*num_params)++] = i32;
 	}
 	/* A streamout buffer offset is loaded if the stride is non-zero. */
 	for (i = 0; i < 4; i++) {
 		if (!so->stride[i])
 			continue;
 
-		params[si_shader_ctx->param_streamout_offset[i] = (*num_params)++] = i32;
+		params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
+	}
+}
+
+/* Return the size of an LLVM type in bytes. Only integer, float, pointer
+ * (counted as 8 bytes) and vector types are handled. */
+static unsigned llvm_get_type_size(LLVMTypeRef type)
+{
+	switch (LLVMGetTypeKind(type)) {
+	case LLVMFloatTypeKind:
+		return 4;
+	case LLVMPointerTypeKind:
+		return 8;
+	case LLVMIntegerTypeKind:
+		return LLVMGetIntTypeWidth(type) / 8;
+	case LLVMVectorTypeKind:
+		return llvm_get_type_size(LLVMGetElementType(type)) *
+		       LLVMGetVectorSize(type);
+	default:
+		assert(0);
+		return 0;
 	}
 }
 
-static void create_function(struct si_shader_context *si_shader_ctx)
+/* Declare "tess_lds", the LDS array used by LS, TCS and TES shaders. */
+static void declare_tess_lds(struct si_shader_context *ctx)
+{
+	LLVMTypeRef dword_type = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
+	LLVMModuleRef module = ctx->radeon_bld.gallivm.module;
+
+	/* Upper bound: at most 32 inputs times 32 vertices. */
+	const unsigned max_vertex_dwords = 32 * 32 * 4;
+	const unsigned max_patch_dwords = 32 * 4;
+
+	/* TCS inputs + TCS outputs + TCS patch outputs. */
+	const unsigned max_dwords = max_vertex_dwords * 2 + max_patch_dwords;
+
+	/* Only this upper bound is declared. The actual size is computed
+	 * outside of the shader to reduce the number of shader variants. */
+	ctx->lds = LLVMAddGlobalInAddressSpace(module,
+					       LLVMArrayType(dword_type,
+							     max_dwords),
+					       "tess_lds", LOCAL_ADDR_SPACE);
+}
+
+static void create_function(struct si_shader_context *ctx)
 {
-	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	struct si_shader *shader = si_shader_ctx->shader;
-	LLVMTypeRef params[SI_NUM_PARAMS], f32, i8, i32, v2i32, v3i32, v16i8, v4i32, v8i32;
-	unsigned i, last_array_pointer, last_sgpr, num_params;
-
-	i8 = LLVMInt8TypeInContext(gallivm->context);
-	i32 = LLVMInt32TypeInContext(gallivm->context);
-	f32 = LLVMFloatTypeInContext(gallivm->context);
-	v2i32 = LLVMVectorType(i32, 2);
-	v3i32 = LLVMVectorType(i32, 3);
-	v4i32 = LLVMVectorType(i32, 4);
-	v8i32 = LLVMVectorType(i32, 8);
-	v16i8 = LLVMVectorType(i8, 16);
-
-	params[SI_PARAM_RW_BUFFERS] = const_array(v16i8, SI_NUM_RW_BUFFERS);
-	params[SI_PARAM_CONST_BUFFERS] = const_array(v16i8, SI_NUM_CONST_BUFFERS);
-	params[SI_PARAM_SAMPLER_STATES] = const_array(v4i32, SI_NUM_SAMPLER_STATES);
-	params[SI_PARAM_SAMPLER_VIEWS] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS);
-	last_array_pointer = SI_PARAM_SAMPLER_VIEWS;
-
-	switch (si_shader_ctx->type) {
+	struct si_shader *shader = ctx->shader;
+	LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
+	LLVMTypeRef returns[16+32*4];
+	unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
+	unsigned num_returns = 0;
+
+	v3i32 = LLVMVectorType(ctx->i32, 3);
+
+	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
+	params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
+	params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
+	params[SI_PARAM_UNUSED] = LLVMPointerType(ctx->i32, CONST_ADDR_SPACE);
+	last_array_pointer = SI_PARAM_UNUSED;
+
+	switch (ctx->type) {
 	case TGSI_PROCESSOR_VERTEX:
-		params[SI_PARAM_VERTEX_BUFFERS] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS);
+		params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
 		last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
-		params[SI_PARAM_BASE_VERTEX] = i32;
-		params[SI_PARAM_START_INSTANCE] = i32;
+		params[SI_PARAM_BASE_VERTEX] = ctx->i32;
+		params[SI_PARAM_START_INSTANCE] = ctx->i32;
 		num_params = SI_PARAM_START_INSTANCE+1;
 
 		if (shader->key.vs.as_es) {
-			params[si_shader_ctx->param_es2gs_offset = num_params++] = i32;
+			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
 		} else if (shader->key.vs.as_ls) {
-			params[SI_PARAM_LS_OUT_LAYOUT] = i32;
+			params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
 			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
 		} else {
-			if (shader->is_gs_copy_shader) {
+			if (ctx->is_gs_copy_shader) {
 				last_array_pointer = SI_PARAM_CONST_BUFFERS;
 				num_params = SI_PARAM_CONST_BUFFERS+1;
 			} else {
-				params[SI_PARAM_VS_STATE_BITS] = i32;
+				params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
 				num_params = SI_PARAM_VS_STATE_BITS+1;
 			}
 
 			/* The locations of the other parameters are assigned dynamically. */
-			declare_streamout_params(si_shader_ctx, &shader->selector->so,
-						 params, i32, &num_params);
+			declare_streamout_params(ctx, &shader->selector->so,
+						 params, ctx->i32, &num_params);
 		}
 
 		last_sgpr = num_params-1;
 
 		/* VGPRs */
-		params[si_shader_ctx->param_vertex_id = num_params++] = i32;
-		params[si_shader_ctx->param_rel_auto_id = num_params++] = i32;
-		params[si_shader_ctx->param_vs_prim_id = num_params++] = i32;
-		params[si_shader_ctx->param_instance_id = num_params++] = i32;
+		params[ctx->param_vertex_id = num_params++] = ctx->i32;
+		params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
+		params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
+		params[ctx->param_instance_id = num_params++] = ctx->i32;
+
+		if (!ctx->is_monolithic &&
+		    !ctx->is_gs_copy_shader) {
+			/* Vertex load indices. */
+			ctx->param_vertex_index0 = num_params;
+
+			for (i = 0; i < shader->selector->info.num_inputs; i++)
+				params[num_params++] = ctx->i32;
+
+			/* PrimitiveID output. */
+			if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
+				for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
+					returns[num_returns++] = ctx->f32;
+		}
 		break;
 
 	case TGSI_PROCESSOR_TESS_CTRL:
-		params[SI_PARAM_TCS_OUT_OFFSETS] = i32;
-		params[SI_PARAM_TCS_OUT_LAYOUT] = i32;
-		params[SI_PARAM_TCS_IN_LAYOUT] = i32;
-		params[SI_PARAM_TESS_FACTOR_OFFSET] = i32;
+		params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
+		params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
+		params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
+		params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
 		last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
 
 		/* VGPRs */
-		params[SI_PARAM_PATCH_ID] = i32;
-		params[SI_PARAM_REL_IDS] = i32;
+		params[SI_PARAM_PATCH_ID] = ctx->i32;
+		params[SI_PARAM_REL_IDS] = ctx->i32;
 		num_params = SI_PARAM_REL_IDS+1;
+
+		if (!ctx->is_monolithic) {
+			/* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */
+			for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++)
+				returns[num_returns++] = ctx->i32; /* SGPRs */
+
+			for (i = 0; i < 3; i++)
+				returns[num_returns++] = ctx->f32; /* VGPRs */
+		}
 		break;
 
 	case TGSI_PROCESSOR_TESS_EVAL:
-		params[SI_PARAM_TCS_OUT_OFFSETS] = i32;
-		params[SI_PARAM_TCS_OUT_LAYOUT] = i32;
+		params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
+		params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
 		num_params = SI_PARAM_TCS_OUT_LAYOUT+1;
 
 		if (shader->key.tes.as_es) {
-			params[si_shader_ctx->param_es2gs_offset = num_params++] = i32;
+			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
 		} else {
-			declare_streamout_params(si_shader_ctx, &shader->selector->so,
-						 params, i32, &num_params);
+			declare_streamout_params(ctx, &shader->selector->so,
+						 params, ctx->i32, &num_params);
 		}
 		last_sgpr = num_params - 1;
 
 		/* VGPRs */
-		params[si_shader_ctx->param_tes_u = num_params++] = f32;
-		params[si_shader_ctx->param_tes_v = num_params++] = f32;
-		params[si_shader_ctx->param_tes_rel_patch_id = num_params++] = i32;
-		params[si_shader_ctx->param_tes_patch_id = num_params++] = i32;
+		params[ctx->param_tes_u = num_params++] = ctx->f32;
+		params[ctx->param_tes_v = num_params++] = ctx->f32;
+		params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
+		params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
+
+		/* PrimitiveID output. */
+		if (!ctx->is_monolithic && !shader->key.tes.as_es)
+			for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
+				returns[num_returns++] = ctx->f32;
 		break;
 
 	case TGSI_PROCESSOR_GEOMETRY:
-		params[SI_PARAM_GS2VS_OFFSET] = i32;
-		params[SI_PARAM_GS_WAVE_ID] = i32;
+		params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
+		params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
 		last_sgpr = SI_PARAM_GS_WAVE_ID;
 
 		/* VGPRs */
-		params[SI_PARAM_VTX0_OFFSET] = i32;
-		params[SI_PARAM_VTX1_OFFSET] = i32;
-		params[SI_PARAM_PRIMITIVE_ID] = i32;
-		params[SI_PARAM_VTX2_OFFSET] = i32;
-		params[SI_PARAM_VTX3_OFFSET] = i32;
-		params[SI_PARAM_VTX4_OFFSET] = i32;
-		params[SI_PARAM_VTX5_OFFSET] = i32;
-		params[SI_PARAM_GS_INSTANCE_ID] = i32;
+		params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
+		params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
+		params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
+		params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
+		params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
+		params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
+		params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
+		params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
 		num_params = SI_PARAM_GS_INSTANCE_ID+1;
 		break;
 
 	case TGSI_PROCESSOR_FRAGMENT:
-		params[SI_PARAM_ALPHA_REF] = f32;
-		params[SI_PARAM_PS_STATE_BITS] = i32;
-		params[SI_PARAM_PRIM_MASK] = i32;
+		params[SI_PARAM_ALPHA_REF] = ctx->f32;
+		params[SI_PARAM_PRIM_MASK] = ctx->i32;
 		last_sgpr = SI_PARAM_PRIM_MASK;
-		params[SI_PARAM_PERSP_SAMPLE] = v2i32;
-		params[SI_PARAM_PERSP_CENTER] = v2i32;
-		params[SI_PARAM_PERSP_CENTROID] = v2i32;
+		params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
+		params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
+		params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
 		params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
-		params[SI_PARAM_LINEAR_SAMPLE] = v2i32;
-		params[SI_PARAM_LINEAR_CENTER] = v2i32;
-		params[SI_PARAM_LINEAR_CENTROID] = v2i32;
-		params[SI_PARAM_LINE_STIPPLE_TEX] = f32;
-		params[SI_PARAM_POS_X_FLOAT] = f32;
-		params[SI_PARAM_POS_Y_FLOAT] = f32;
-		params[SI_PARAM_POS_Z_FLOAT] = f32;
-		params[SI_PARAM_POS_W_FLOAT] = f32;
-		params[SI_PARAM_FRONT_FACE] = f32;
-		params[SI_PARAM_ANCILLARY] = i32;
-		params[SI_PARAM_SAMPLE_COVERAGE] = f32;
-		params[SI_PARAM_POS_FIXED_PT] = f32;
+		params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
+		params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
+		params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
+		params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
+		params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
+		params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
+		params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
+		params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
+		params[SI_PARAM_FRONT_FACE] = ctx->i32;
+		params[SI_PARAM_ANCILLARY] = ctx->i32;
+		params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
+		params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
 		num_params = SI_PARAM_POS_FIXED_PT+1;
+
+		if (!ctx->is_monolithic) {
+			/* Color inputs from the prolog. */
+			if (shader->selector->info.colors_read) {
+				unsigned num_color_elements =
+					util_bitcount(shader->selector->info.colors_read);
+
+				assert(num_params + num_color_elements <= ARRAY_SIZE(params));
+				for (i = 0; i < num_color_elements; i++)
+					params[num_params++] = ctx->f32;
+			}
+
+			/* Outputs for the epilog. */
+			num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
+			num_returns =
+				num_return_sgprs +
+				util_bitcount(shader->selector->info.colors_written) * 4 +
+				shader->selector->info.writes_z +
+				shader->selector->info.writes_stencil +
+				shader->selector->info.writes_samplemask +
+				1 /* SampleMaskIn */;
+
+			num_returns = MAX2(num_returns,
+					   num_return_sgprs +
+					   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+			for (i = 0; i < num_return_sgprs; i++)
+				returns[i] = ctx->i32;
+			for (; i < num_returns; i++)
+				returns[i] = ctx->f32;
+		}
 		break;
 
 	default:
@@ -3519,23 +4040,37 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 	}
 
 	assert(num_params <= Elements(params));
-	radeon_llvm_create_func(&si_shader_ctx->radeon_bld, params, num_params);
-	radeon_llvm_shader_type(si_shader_ctx->radeon_bld.main_fn, si_shader_ctx->type);
 
-	if (shader->dx10_clamp_mode)
-		LLVMAddTargetDependentFunctionAttr(si_shader_ctx->radeon_bld.main_fn,
-						   "enable-no-nans-fp-math", "true");
+	si_create_function(ctx, returns, num_returns, params,
+			   num_params, last_array_pointer, last_sgpr);
+
+	/* Reserve register locations for VGPR inputs the PS prolog may need. */
+	if (ctx->type == TGSI_PROCESSOR_FRAGMENT &&
+	    !ctx->is_monolithic) {
+		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
+					  "InitialPSInputAddr",
+					  S_0286D0_PERSP_SAMPLE_ENA(1) |
+					  S_0286D0_PERSP_CENTER_ENA(1) |
+					  S_0286D0_PERSP_CENTROID_ENA(1) |
+					  S_0286D0_LINEAR_SAMPLE_ENA(1) |
+					  S_0286D0_LINEAR_CENTER_ENA(1) |
+					  S_0286D0_LINEAR_CENTROID_ENA(1) |
+					  S_0286D0_FRONT_FACE_ENA(1) |
+					  S_0286D0_POS_FIXED_PT_ENA(1));
+	}
 
-	for (i = 0; i <= last_sgpr; ++i) {
-		LLVMValueRef P = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, i);
+	shader->num_input_sgprs = 0;
+	shader->num_input_vgprs = 0;
 
-		/* We tell llvm that array inputs are passed by value to allow Sinking pass
-		 * to move load. Inputs are constant so this is fine. */
-		if (i <= last_array_pointer)
-			LLVMAddAttribute(P, LLVMByValAttribute);
-		else
-			LLVMAddAttribute(P, LLVMInRegAttribute);
-	}
+	for (i = 0; i <= last_sgpr; ++i)
+		shader->num_input_sgprs += llvm_get_type_size(params[i]) / 4;
+
+	/* Unused fragment shader inputs are eliminated by the compiler,
+	 * so we don't know yet how many there will be.
+	 */
+	if (ctx->type != TGSI_PROCESSOR_FRAGMENT)
+		for (; i < num_params; ++i)
+			shader->num_input_vgprs += llvm_get_type_size(params[i]) / 4;
 
 	if (bld_base->info &&
 	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
@@ -3544,39 +4079,25 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
 	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
 	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
-		si_shader_ctx->lds =
+		ctx->lds =
 			LLVMAddGlobalInAddressSpace(gallivm->module,
-						    LLVMArrayType(i32, 64),
+						    LLVMArrayType(ctx->i32, 64),
 						    "ddxy_lds",
 						    LOCAL_ADDR_SPACE);
 
-	if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) ||
-	    si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
-	    si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL) {
-		/* This is the upper bound, maximum is 32 inputs times 32 vertices */
-		unsigned vertex_data_dw_size = 32*32*4;
-		unsigned patch_data_dw_size = 32*4;
-		/* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
-		unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
-		unsigned lds_dwords = patch_dw_size;
-
-		/* The actual size is computed outside of the shader to reduce
-		 * the number of shader variants. */
-		si_shader_ctx->lds =
-			LLVMAddGlobalInAddressSpace(gallivm->module,
-						    LLVMArrayType(i32, lds_dwords),
-						    "tess_lds",
-						    LOCAL_ADDR_SPACE);
-	}
+	if ((ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) ||
+	    ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
+	    ctx->type == TGSI_PROCESSOR_TESS_EVAL)
+		declare_tess_lds(ctx);
 }
 
-static void preload_constants(struct si_shader_context *si_shader_ctx)
+static void preload_constants(struct si_shader_context *ctx)
 {
-	struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
-	struct gallivm_state * gallivm = bld_base->base.gallivm;
-	const struct tgsi_shader_info * info = bld_base->info;
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_shader_info *info = bld_base->info;
 	unsigned buf;
-	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
+	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 
 	for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
 		unsigned i, num_const = info->const_file_max[buf] + 1;
@@ -3585,84 +4106,76 @@ static void preload_constants(struct si_shader_context *si_shader_ctx)
 			continue;
 
 		/* Allocate space for the constant values */
-		si_shader_ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
+		ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
 
 		/* Load the resource descriptor */
-		si_shader_ctx->const_buffers[buf] =
-			build_indexed_load_const(si_shader_ctx, ptr, lp_build_const_int32(gallivm, buf));
+		ctx->const_buffers[buf] =
+			build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
 
 		/* Load the constants, we rely on the code sinking to do the rest */
 		for (i = 0; i < num_const * 4; ++i) {
-			si_shader_ctx->constants[buf][i] =
+			ctx->constants[buf][i] =
 				buffer_load_const(gallivm->builder,
-					si_shader_ctx->const_buffers[buf],
+					ctx->const_buffers[buf],
 					lp_build_const_int32(gallivm, i * 4),
-					bld_base->base.elem_type);
+					ctx->f32);
 		}
 	}
 }
 
-static void preload_samplers(struct si_shader_context *si_shader_ctx)
+static void preload_samplers(struct si_shader_context *ctx)
 {
-	struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
-	struct gallivm_state * gallivm = bld_base->base.gallivm;
-	const struct tgsi_shader_info * info = bld_base->info;
-
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_shader_info *info = bld_base->info;
 	unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
-
-	LLVMValueRef res_ptr, samp_ptr;
 	LLVMValueRef offset;
 
 	if (num_samplers == 0)
 		return;
 
-	res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
-	samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES);
-
 	/* Load the resources and samplers, we rely on the code sinking to do the rest */
 	for (i = 0; i < num_samplers; ++i) {
 		/* Resource */
 		offset = lp_build_const_int32(gallivm, i);
-		si_shader_ctx->sampler_views[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset);
-
-		/* Sampler */
-		offset = lp_build_const_int32(gallivm, i);
-		si_shader_ctx->sampler_states[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset);
+		ctx->sampler_views[i] =
+			get_sampler_desc(ctx, offset, DESC_IMAGE);
 
 		/* FMASK resource */
-		if (info->is_msaa_sampler[i]) {
-			offset = lp_build_const_int32(gallivm, SI_FMASK_TEX_OFFSET + i);
-			si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + i] =
-				build_indexed_load_const(si_shader_ctx, res_ptr, offset);
-		}
+		if (info->is_msaa_sampler[i])
+			ctx->fmasks[i] =
+				get_sampler_desc(ctx, offset, DESC_FMASK);
+		else
+			ctx->sampler_states[i] =
+				get_sampler_desc(ctx, offset, DESC_SAMPLER);
 	}
 }
 
-static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx)
+static void preload_streamout_buffers(struct si_shader_context *ctx)
 {
-	struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
-	struct gallivm_state * gallivm = bld_base->base.gallivm;
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	unsigned i;
 
 	/* Streamout can only be used if the shader is compiled as VS. */
-	if (!si_shader_ctx->shader->selector->so.num_outputs ||
-	    (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
-	     (si_shader_ctx->shader->key.vs.as_es ||
-	      si_shader_ctx->shader->key.vs.as_ls)) ||
-	    (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
-	     si_shader_ctx->shader->key.tes.as_es))
+	if (!ctx->shader->selector->so.num_outputs ||
+	    (ctx->type == TGSI_PROCESSOR_VERTEX &&
+	     (ctx->shader->key.vs.as_es ||
+	      ctx->shader->key.vs.as_ls)) ||
+	    (ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
+	     ctx->shader->key.tes.as_es))
 		return;
 
-	LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+	LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
 					    SI_PARAM_RW_BUFFERS);
 
 	/* Load the resources, we rely on the code sinking to do the rest */
 	for (i = 0; i < 4; ++i) {
-		if (si_shader_ctx->shader->selector->so.stride[i]) {
+		if (ctx->shader->selector->so.stride[i]) {
 			LLVMValueRef offset = lp_build_const_int32(gallivm,
 								   SI_SO_BUF_OFFSET + i);
 
-			si_shader_ctx->so_buffers[i] = build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
+			ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
 		}
 	}
 }
@@ -3671,42 +4184,85 @@ static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx)
  * Load ESGS and GSVS ring buffer resource descriptors and save the variables
  * for later use.
  */
-static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
+static void preload_ring_buffers(struct si_shader_context *ctx)
 {
 	struct gallivm_state *gallivm =
-		si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
+		ctx->radeon_bld.soa.bld_base.base.gallivm;
 
-	LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+	LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
 					    SI_PARAM_RW_BUFFERS);
 
-	if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
-	     si_shader_ctx->shader->key.vs.as_es) ||
-	    (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
-	     si_shader_ctx->shader->key.tes.as_es) ||
-	    si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) {
+	if ((ctx->type == TGSI_PROCESSOR_VERTEX &&
+	     ctx->shader->key.vs.as_es) ||
+	    (ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
+	     ctx->shader->key.tes.as_es) ||
+	    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
 		LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_ESGS);
 
-		si_shader_ctx->esgs_ring =
-			build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
+		ctx->esgs_ring =
+			build_indexed_load_const(ctx, buf_ptr, offset);
 	}
 
-	if (si_shader_ctx->shader->is_gs_copy_shader) {
+	if (ctx->is_gs_copy_shader) {
 		LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
 
-		si_shader_ctx->gsvs_ring[0] =
-			build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
+		ctx->gsvs_ring[0] =
+			build_indexed_load_const(ctx, buf_ptr, offset);
 	}
-	if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) {
+	if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
 		int i;
 		for (i = 0; i < 4; i++) {
 			LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS + i);
 
-			si_shader_ctx->gsvs_ring[i] =
-				build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
+			ctx->gsvs_ring[i] =
+				build_indexed_load_const(ctx, buf_ptr, offset);
 		}
 	}
 }
 
+/* Emit polygon stippling: load the stipple texel and feed it to AMDGPU.kill. */
+static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
+					 LLVMValueRef param_sampler_views,
+					 unsigned param_pos_fixed_pt)
+{
+	struct lp_build_tgsi_context *bld_base =
+		&ctx->radeon_bld.soa.bld_base;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	struct lp_build_emit_data result = {};
+	struct tgsi_full_instruction inst = {};
+	LLVMValueRef desc, sampler_index, address[2], pix;
+
+	/* Use the fixed-point gl_FragCoord input. The stipple pattern is
+	 * 32x32 and it repeats, so just get 5 bits per coordinate to get
+	 * the repeating effect. */
+	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
+	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
+
+	/* Load the image descriptor of the polygon stipple sampler slot. */
+	sampler_index = lp_build_const_int32(gallivm, SI_POLY_STIPPLE_SAMPLER);
+	desc = get_sampler_desc_custom(ctx, param_sampler_views,
+				       sampler_index, DESC_IMAGE);
+
+	/* Load the texel with a synthesized TXF instruction. */
+	inst.Instruction.Opcode = TGSI_OPCODE_TXF;
+	inst.Texture.Texture = TGSI_TEXTURE_2D_MSAA; /* = use load, not load_mip */
+	result.inst = &inst;
+	set_tex_fetch_args(ctx, &result, TGSI_OPCODE_TXF,
+			   inst.Texture.Texture,
+			   desc, NULL, address, ARRAY_SIZE(address), 0xf);
+	build_tex_intrinsic(&tex_action, bld_base, &result);
+
+	/* Pass the negated 4th texel component to llvm.AMDGPU.kill. */
+	pix = LLVMBuildExtractElement(gallivm->builder, result.output[0],
+				      lp_build_const_int32(gallivm, 3), "");
+	pix = bitcast(bld_base, TGSI_TYPE_FLOAT, pix);
+	pix = LLVMBuildFNeg(gallivm->builder, pix, "");
+
+	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
+			   LLVMVoidTypeInContext(gallivm->context),
+			   &pix, 1, 0);
+}
+
 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 				  struct si_shader_config *conf,
 				  unsigned symbol_offset)
@@ -3742,6 +4298,9 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 		case R_0286CC_SPI_PS_INPUT_ENA:
 			conf->spi_ps_input_ena = value;
 			break;
+		case R_0286D0_SPI_PS_INPUT_ADDR:
+			conf->spi_ps_input_addr = value;
+			break;
 		case R_0286E8_SPI_TMPRING_SIZE:
 		case R_00B860_COMPUTE_TMPRING_SIZE:
 			/* WAVESIZE is in units of 256 dwords. */
@@ -3749,10 +4308,20 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 				G_00B860_WAVESIZE(value) * 256 * 4 * 1;
 			break;
 		default:
-			fprintf(stderr, "Warning: Compiler emitted unknown "
-				"config register: 0x%x\n", reg);
+			{
+				static bool printed;
+
+				if (!printed) {
+					fprintf(stderr, "Warning: LLVM emitted unknown "
+						"config register: 0x%x\n", reg);
+					printed = true;
+				}
+			}
 			break;
 		}
+
+		if (!conf->spi_ps_input_addr)
+			conf->spi_ps_input_addr = conf->spi_ps_input_ena;
 	}
 }
 
@@ -3779,41 +4348,70 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
 	}
 }
 
+/* Total code size in bytes of the main part plus the optional prolog/epilog. */
+static unsigned si_get_shader_binary_size(struct si_shader *shader)
+{
+	unsigned prolog_size = shader->prolog ?
+			       shader->prolog->binary.code_size : 0;
+	unsigned epilog_size = shader->epilog ?
+			       shader->epilog->binary.code_size : 0;
+
+	return shader->binary.code_size + prolog_size + epilog_size;
+}
+
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 {
-	const struct radeon_shader_binary *binary = &shader->binary;
-	unsigned code_size = binary->code_size + binary->rodata_size;
+	const struct radeon_shader_binary *prolog =
+		shader->prolog ? &shader->prolog->binary : NULL;
+	const struct radeon_shader_binary *epilog =
+		shader->epilog ? &shader->epilog->binary : NULL;
+	const struct radeon_shader_binary *mainb = &shader->binary;
+	unsigned bo_size = si_get_shader_binary_size(shader) +
+			   (!epilog ? mainb->rodata_size : 0);
 	unsigned char *ptr;
 
+	assert(!prolog || !prolog->rodata_size);
+	assert((!prolog && !epilog) || !mainb->rodata_size);
+	assert(!epilog || !epilog->rodata_size);
+
 	r600_resource_reference(&shader->bo, NULL);
 	shader->bo = si_resource_create_custom(&sscreen->b.b,
 					       PIPE_USAGE_IMMUTABLE,
-					       code_size);
+					       bo_size);
 	if (!shader->bo)
 		return -ENOMEM;
 
+	/* Upload. */
 	ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
 					PIPE_TRANSFER_READ_WRITE);
-	util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
-	if (binary->rodata_size > 0) {
-		ptr += binary->code_size;
-		util_memcpy_cpu_to_le32(ptr, binary->rodata,
-					binary->rodata_size);
+
+	if (prolog) {
+		util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
+		ptr += prolog->code_size;
 	}
 
+	util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
+	ptr += mainb->code_size;
+
+	if (epilog)
+		util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
+	else if (mainb->rodata_size > 0)
+		util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
+
 	sscreen->b.ws->buffer_unmap(shader->bo->buf);
 	return 0;
 }
 
 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
-				       struct pipe_debug_callback *debug)
+				       struct pipe_debug_callback *debug,
+				       const char *name)
 {
 	char *line, *p;
 	unsigned i, count;
 
 	if (binary->disasm_string) {
-		fprintf(stderr, "\nShader Disassembly:\n\n");
-		fprintf(stderr, "%s\n", binary->disasm_string);
+		fprintf(stderr, "Shader %s disassembly:\n", name);
+		fprintf(stderr, "%s", binary->disasm_string);
 
 		if (debug && debug->debug_message) {
 			/* Very long debug messages are cut off, so send the
@@ -3843,7 +4441,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
 					   "Shader Disassembly End");
 		}
 	} else {
-		fprintf(stderr, "SI CODE:\n");
+		fprintf(stderr, "Shader %s binary:\n", name);
 		for (i = 0; i < binary->code_size; i += 4) {
 			fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i,
 				binary->code[i + 3], binary->code[i + 2],
@@ -3854,33 +4452,128 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
 
 static void si_shader_dump_stats(struct si_screen *sscreen,
 			         struct si_shader_config *conf,
+				 unsigned num_inputs,
 				 unsigned code_size,
 			         struct pipe_debug_callback *debug,
 			         unsigned processor)
 {
+	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
+	unsigned lds_per_wave = 0;
+	unsigned max_simd_waves = 10;
+
+	/* Compute LDS usage for PS. */
+	if (processor == TGSI_PROCESSOR_FRAGMENT) {
+		/* The minimum usage per wave is (num_inputs * 36). The maximum
+		 * usage is (num_inputs * 36 * 16).
+		 * We can get anything in between and it varies between waves.
+		 *
+		 * Other stages don't know the size at compile time or don't
+		 * allocate LDS per wave, but instead they do it per thread group.
+		 */
+		lds_per_wave = conf->lds_size * lds_increment +
+			       align(num_inputs * 36, lds_increment);
+	}
+
+	/* Compute the per-SIMD wave counts. */
+	if (conf->num_sgprs) {
+		if (sscreen->b.chip_class >= VI)
+			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
+		else
+			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
+	}
+
+	if (conf->num_vgprs)
+		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
+
+	/* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
+	 * that PS can use.
+	 */
+	if (lds_per_wave)
+		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
+
 	if (r600_can_dump_shader(&sscreen->b, processor)) {
+		if (processor == TGSI_PROCESSOR_FRAGMENT) {
+			fprintf(stderr, "*** SHADER CONFIG ***\n"
+				"SPI_PS_INPUT_ADDR = 0x%04x\n"
+				"SPI_PS_INPUT_ENA  = 0x%04x\n",
+				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
+		}
+
 		fprintf(stderr, "*** SHADER STATS ***\n"
-			"SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
-			"Scratch: %d bytes per wave\n********************\n",
+			"SGPRS: %d\n"
+			"VGPRS: %d\n"
+			"Code Size: %d bytes\n"
+			"LDS: %d blocks\n"
+			"Scratch: %d bytes per wave\n"
+			"Max Waves: %d\n"
+			"********************\n",
 			conf->num_sgprs, conf->num_vgprs, code_size,
-			conf->lds_size, conf->scratch_bytes_per_wave);
+			conf->lds_size, conf->scratch_bytes_per_wave,
+			max_simd_waves);
 	}
 
 	pipe_debug_message(debug, SHADER_INFO,
-			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
+			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
+			   "LDS: %d Scratch: %d Max Waves: %d",
 			   conf->num_sgprs, conf->num_vgprs, code_size,
-			   conf->lds_size, conf->scratch_bytes_per_wave);
+			   conf->lds_size, conf->scratch_bytes_per_wave,
+			   max_simd_waves);
+}
+
+/* Return a human-readable name for the shader, used as a heading for dumps.
+ * The stage comes from "processor"; the variant comes from the shader key. */
+static const char *si_get_shader_name(struct si_shader *shader,
+				      unsigned processor)
+{
+	switch (processor) {
+	case TGSI_PROCESSOR_VERTEX:
+		if (shader->key.vs.as_es)
+			return "Vertex Shader as ES";
+		if (shader->key.vs.as_ls)
+			return "Vertex Shader as LS";
+		return "Vertex Shader as VS";
+	case TGSI_PROCESSOR_TESS_CTRL:
+		return "Tessellation Control Shader";
+	case TGSI_PROCESSOR_TESS_EVAL:
+		if (shader->key.tes.as_es)
+			return "Tessellation Evaluation Shader as ES";
+		return "Tessellation Evaluation Shader as VS";
+	case TGSI_PROCESSOR_GEOMETRY:
+		/* No copy shader attached: treated as the GS copy shader. */
+		if (!shader->gs_copy_shader)
+			return "GS Copy Shader as VS";
+		return "Geometry Shader";
+	case TGSI_PROCESSOR_FRAGMENT:
+		return "Pixel Shader";
+	case TGSI_PROCESSOR_COMPUTE:
+		return "Compute Shader";
+	default:
+		return "Unknown Shader";
+	}
 }
 
 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
 		    struct pipe_debug_callback *debug, unsigned processor)
 {
-	if (r600_can_dump_shader(&sscreen->b, processor))
-		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
-			si_shader_dump_disassembly(&shader->binary, debug);
+	if (r600_can_dump_shader(&sscreen->b, processor) &&
+	    !(sscreen->b.debug_flags & DBG_NO_ASM)) {
+		fprintf(stderr, "\n%s:\n", si_get_shader_name(shader, processor));
+
+		if (shader->prolog)
+			si_shader_dump_disassembly(&shader->prolog->binary,
+						   debug, "prolog");
+
+		si_shader_dump_disassembly(&shader->binary, debug, "main");
+
+		if (shader->epilog)
+			si_shader_dump_disassembly(&shader->epilog->binary,
+						   debug, "epilog");
+		fprintf(stderr, "\n");
+	}
 
 	si_shader_dump_stats(sscreen, &shader->config,
-			     shader->binary.code_size, debug, processor);
+			     shader->selector ? shader->selector->info.num_inputs : 0,
+			     si_get_shader_binary_size(shader), debug, processor);
 }
 
 int si_compile_llvm(struct si_screen *sscreen,
@@ -3889,7 +4582,8 @@ int si_compile_llvm(struct si_screen *sscreen,
 		    LLVMTargetMachineRef tm,
 		    LLVMModuleRef mod,
 		    struct pipe_debug_callback *debug,
-		    unsigned processor)
+		    unsigned processor,
+		    const char *name) /* label printed above the LLVM IR dump */
 {
 	int r = 0;
 	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
@@ -3897,8 +4591,11 @@ int si_compile_llvm(struct si_screen *sscreen,
 	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
 
-		if (!(sscreen->b.debug_flags & DBG_NO_IR))
+		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
+			fprintf(stderr, "%s LLVM IR:\n\n", name);
 			LLVMDumpModule(mod);
+			fprintf(stderr, "\n");
+		}
 	}
 
 	if (!si_replace_shader(count, binary)) {
@@ -3911,24 +4608,49 @@ int si_compile_llvm(struct si_screen *sscreen,
 
 	si_shader_binary_read_config(binary, conf, 0);
 
+	/* Enable 64-bit and 16-bit denormals, because there is no performance
+	 * cost.
+	 *
+	 * If denormals are enabled, all floating-point output modifiers are
+	 * ignored.
+	 *
+	 * Don't enable denormals for 32-bit floats, because:
+	 * - Floating-point output modifiers would be ignored by the hw.
+	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
+	 *   have to stop using those.
+	 * - SI & CI would be very slow.
+	 */
+	conf->float_mode |= V_00B028_FP_64_DENORMS;
+
 	FREE(binary->config);
 	FREE(binary->global_symbol_offsets);
 	binary->config = NULL;
 	binary->global_symbol_offsets = NULL;
+
+	/* Some shaders can't have rodata because their binaries can be
+	 * concatenated.
+	 */
+	if (binary->rodata_size &&
+	    (processor == TGSI_PROCESSOR_VERTEX ||
+	     processor == TGSI_PROCESSOR_TESS_CTRL ||
+	     processor == TGSI_PROCESSOR_TESS_EVAL ||
+	     processor == TGSI_PROCESSOR_FRAGMENT)) {
+		fprintf(stderr, "radeonsi: The shader can't have rodata.\n");
+		return -EINVAL; /* reported to the caller as a compile failure */
+	}
+
 	return r;
 }
 
 /* Generate code for the hardware VS shader stage to go with a geometry shader */
 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
-				      struct si_shader_context *si_shader_ctx,
-				      struct si_shader *gs, bool dump,
+				      struct si_shader_context *ctx,
+				      struct si_shader *gs,
 				      struct pipe_debug_callback *debug)
 {
-	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
-	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
-	struct lp_build_context *base = &bld_base->base;
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
 	struct lp_build_context *uint = &bld_base->uint_bld;
-	struct si_shader *shader = si_shader_ctx->shader;
 	struct si_shader_output_values *outputs;
 	struct tgsi_shader_info *gsinfo = &gs->selector->info;
 	LLVMValueRef args[9];
@@ -3936,20 +4658,19 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 
 	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
 
-	si_shader_ctx->type = TGSI_PROCESSOR_VERTEX;
-	shader->is_gs_copy_shader = true;
-
-	radeon_llvm_context_init(&si_shader_ctx->radeon_bld);
+	si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
+	ctx->type = TGSI_PROCESSOR_VERTEX;
+	ctx->is_gs_copy_shader = true;
 
-	create_meta_data(si_shader_ctx);
-	create_function(si_shader_ctx);
-	preload_streamout_buffers(si_shader_ctx);
-	preload_ring_buffers(si_shader_ctx);
+	create_meta_data(ctx);
+	create_function(ctx);
+	preload_streamout_buffers(ctx);
+	preload_ring_buffers(ctx);
 
-	args[0] = si_shader_ctx->gsvs_ring[0];
+	args[0] = ctx->gsvs_ring[0];
 	args[1] = lp_build_mul_imm(uint,
-				   LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-						si_shader_ctx->param_vertex_id),
+				   LLVMGetParam(ctx->radeon_bld.main_fn,
+						ctx->param_vertex_id),
 				   4);
 	args[3] = uint->zero;
 	args[4] = uint->one;  /* OFFEN */
@@ -3974,31 +4695,37 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 				LLVMBuildBitCast(gallivm->builder,
 						 lp_build_intrinsic(gallivm->builder,
 								 "llvm.SI.buffer.load.dword.i32.i32",
-								 LLVMInt32TypeInContext(gallivm->context),
-								 args, 9,
+								 ctx->i32, args, 9,
 								 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
-						 base->elem_type, "");
+						 ctx->f32, "");
 		}
 	}
 
 	si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
 
-	radeon_llvm_finalize_module(&si_shader_ctx->radeon_bld);
+	LLVMBuildRet(gallivm->builder, ctx->return_value);
 
-	if (dump)
-		fprintf(stderr, "Copy Vertex Shader for Geometry Shader:\n\n");
+	/* Dump LLVM IR before any optimization passes */
+	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
+	    r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY))
+		LLVMDumpModule(bld_base->base.gallivm->module);
 
-	r = si_compile_llvm(sscreen, &si_shader_ctx->shader->binary,
-			    &si_shader_ctx->shader->config, si_shader_ctx->tm,
+	radeon_llvm_finalize_module(&ctx->radeon_bld);
+
+	r = si_compile_llvm(sscreen, &ctx->shader->binary,
+			    &ctx->shader->config, ctx->tm,
 			    bld_base->base.gallivm->module,
-			    debug, TGSI_PROCESSOR_GEOMETRY);
+			    debug, TGSI_PROCESSOR_GEOMETRY,
+			    "GS Copy Shader");
 	if (!r) {
-		si_shader_dump(sscreen, si_shader_ctx->shader, debug,
+		if (r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY))
+			fprintf(stderr, "GS Copy Shader:\n");
+		si_shader_dump(sscreen, ctx->shader, debug,
 			       TGSI_PROCESSOR_GEOMETRY);
-		r = si_shader_binary_upload(sscreen, si_shader_ctx->shader);
+		r = si_shader_binary_upload(sscreen, ctx->shader);
 	}
 
-	radeon_llvm_dispose(&si_shader_ctx->radeon_bld);
+	radeon_llvm_dispose(&ctx->radeon_bld);
 
 	FREE(outputs);
 	return r;
@@ -4013,35 +4740,38 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 	switch (shader) {
 	case PIPE_SHADER_VERTEX:
 		fprintf(f, "  instance_divisors = {");
-		for (i = 0; i < Elements(key->vs.instance_divisors); i++)
+		for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++)
 			fprintf(f, !i ? "%u" : ", %u",
-				key->vs.instance_divisors[i]);
+				key->vs.prolog.instance_divisors[i]);
 		fprintf(f, "}\n");
 		fprintf(f, "  as_es = %u\n", key->vs.as_es);
 		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
-		fprintf(f, "  export_prim_id = %u\n", key->vs.export_prim_id);
+		fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
 		break;
 
 	case PIPE_SHADER_TESS_CTRL:
-		fprintf(f, "  prim_mode = %u\n", key->tcs.prim_mode);
+		fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
 		break;
 
 	case PIPE_SHADER_TESS_EVAL:
 		fprintf(f, "  as_es = %u\n", key->tes.as_es);
-		fprintf(f, "  export_prim_id = %u\n", key->tes.export_prim_id);
+		fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
 		break;
 
 	case PIPE_SHADER_GEOMETRY:
 		break;
 
 	case PIPE_SHADER_FRAGMENT:
-		fprintf(f, "  export_16bpc = 0x%X\n", key->ps.export_16bpc);
-		fprintf(f, "  last_cbuf = %u\n", key->ps.last_cbuf);
-		fprintf(f, "  color_two_side = %u\n", key->ps.color_two_side);
-		fprintf(f, "  alpha_func = %u\n", key->ps.alpha_func);
-		fprintf(f, "  alpha_to_one = %u\n", key->ps.alpha_to_one);
-		fprintf(f, "  poly_stipple = %u\n", key->ps.poly_stipple);
-		fprintf(f, "  clamp_color = %u\n", key->ps.clamp_color);
+		fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
+		fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
+		fprintf(f, "  prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp);
+		fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
+		fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
+		fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
+		fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
+		fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
+		fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
+		fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
 		break;
 
 	default:
@@ -4049,47 +4779,39 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 	}
 }
 
-int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-		     struct si_shader *shader,
-		     struct pipe_debug_callback *debug)
+static void si_init_shader_ctx(struct si_shader_context *ctx,
+			       struct si_screen *sscreen,
+			       struct si_shader *shader,
+			       LLVMTargetMachineRef tm)
 {
-	struct si_shader_selector *sel = shader->selector;
-	struct tgsi_token *tokens = sel->tokens;
-	struct si_shader_context si_shader_ctx;
-	struct lp_build_tgsi_context * bld_base;
-	struct tgsi_shader_info stipple_shader_info;
-	LLVMModuleRef mod;
-	int r = 0;
-	bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
-			    shader->key.ps.poly_stipple;
-	bool dump = r600_can_dump_shader(&sscreen->b, sel->info.processor);
-
-	if (poly_stipple) {
-		tokens = util_pstipple_create_fragment_shader(tokens, NULL,
-						SI_POLY_STIPPLE_SAMPLER,
-						TGSI_FILE_INPUT);
-		tgsi_scan_shader(tokens, &stipple_shader_info);
-	}
-
-	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
-	 * conversion fails. */
-	if (dump && !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
-		si_dump_shader_key(sel->type, &shader->key, stderr);
-		tgsi_dump(tokens, 0);
-		si_dump_streamout(&sel->so);
-	}
-
-	assert(shader->nparam == 0);
-
-	memset(&si_shader_ctx, 0, sizeof(si_shader_ctx));
-	radeon_llvm_context_init(&si_shader_ctx.radeon_bld);
-	bld_base = &si_shader_ctx.radeon_bld.soa.bld_base;
-
-	if (sel->type != PIPE_SHADER_COMPUTE)
-		shader->dx10_clamp_mode = true;
-
-	shader->uses_instanceid = sel->info.uses_instanceid;
-	bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info;
+	struct lp_build_tgsi_context *bld_base;
+
+	memset(ctx, 0, sizeof(*ctx));
+	radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
+	ctx->tm = tm;
+	ctx->screen = sscreen;
+	if (shader && shader->selector)
+		ctx->type = shader->selector->info.processor;
+	else
+		ctx->type = -1;
+	ctx->shader = shader;
+
+	ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
+	ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
+	ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
+	ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
+	ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
+	ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
+	ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
+	ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
+	ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
+	ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
+	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
+	ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
+
+	bld_base = &ctx->radeon_bld.soa.bld_base;
+	if (shader && shader->selector)
+		bld_base->info = &shader->selector->info;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
 
 	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
@@ -4119,22 +4841,44 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
 	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
 
-	if (HAVE_LLVM >= 0x0306) {
-		bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
-		bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
-		bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
-		bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
+	bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
+	bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
+	bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
+	bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
+}
+
+int si_compile_tgsi_shader(struct si_screen *sscreen,
+			   LLVMTargetMachineRef tm,
+			   struct si_shader *shader,
+			   bool is_monolithic,
+			   struct pipe_debug_callback *debug)
+{
+	struct si_shader_selector *sel = shader->selector;
+	struct si_shader_context ctx;
+	struct lp_build_tgsi_context *bld_base;
+	LLVMModuleRef mod;
+	int r = 0;
+
+	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
+	 * conversion fails. */
+	if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
+	    !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
+		si_dump_shader_key(sel->type, &shader->key, stderr);
+		tgsi_dump(sel->tokens, 0);
+		si_dump_streamout(&sel->so);
 	}
 
-	si_shader_ctx.radeon_bld.load_system_value = declare_system_value;
-	si_shader_ctx.shader = shader;
-	si_shader_ctx.type = tgsi_get_processor_type(tokens);
-	si_shader_ctx.screen = sscreen;
-	si_shader_ctx.tm = tm;
+	si_init_shader_ctx(&ctx, sscreen, shader, tm);
+	ctx.is_monolithic = is_monolithic;
+
+	shader->uses_instanceid = sel->info.uses_instanceid;
+
+	bld_base = &ctx.radeon_bld.soa.bld_base;
+	ctx.radeon_bld.load_system_value = declare_system_value;
 
-	switch (si_shader_ctx.type) {
+	switch (ctx.type) {
 	case TGSI_PROCESSOR_VERTEX:
-		si_shader_ctx.radeon_bld.load_input = declare_input_vs;
+		ctx.radeon_bld.load_input = declare_input_vs;
 		if (shader->key.vs.as_ls)
 			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
 		else if (shader->key.vs.as_es)
@@ -4160,62 +4904,112 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
 		break;
 	case TGSI_PROCESSOR_FRAGMENT:
-		si_shader_ctx.radeon_bld.load_input = declare_input_fs;
-		bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
+		ctx.radeon_bld.load_input = declare_input_fs;
+		if (is_monolithic)
+			bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
+		else
+			bld_base->emit_epilogue = si_llvm_return_fs_outputs;
 		break;
 	default:
 		assert(!"Unsupported shader type");
 		return -1;
 	}
 
-	create_meta_data(&si_shader_ctx);
-	create_function(&si_shader_ctx);
-	preload_constants(&si_shader_ctx);
-	preload_samplers(&si_shader_ctx);
-	preload_streamout_buffers(&si_shader_ctx);
-	preload_ring_buffers(&si_shader_ctx);
+	create_meta_data(&ctx);
+	create_function(&ctx);
+	preload_constants(&ctx);
+	preload_samplers(&ctx);
+	preload_streamout_buffers(&ctx);
+	preload_ring_buffers(&ctx);
+
+	if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
+	    shader->key.ps.prolog.poly_stipple) {
+		LLVMValueRef views = LLVMGetParam(ctx.radeon_bld.main_fn,
+						  SI_PARAM_SAMPLERS);
+		si_llvm_emit_polygon_stipple(&ctx, views,
+					     SI_PARAM_POS_FIXED_PT);
+	}
 
-	if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
+	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
 		int i;
 		for (i = 0; i < 4; i++) {
-			si_shader_ctx.gs_next_vertex[i] =
+			ctx.gs_next_vertex[i] =
 				lp_build_alloca(bld_base->base.gallivm,
-						bld_base->uint_bld.elem_type, "");
+						ctx.i32, "");
 		}
 	}
 
-	if (!lp_build_tgsi_llvm(bld_base, tokens)) {
+	if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
 		fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
 		goto out;
 	}
 
-	radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
-
+	LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value);
 	mod = bld_base->base.gallivm->module;
+
+	/* Dump LLVM IR before any optimization passes */
+	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
+	    r600_can_dump_shader(&sscreen->b, ctx.type))
+		LLVMDumpModule(mod);
+
+	radeon_llvm_finalize_module(&ctx.radeon_bld);
+
 	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
-			    mod, debug, si_shader_ctx.type);
+			    mod, debug, ctx.type, "TGSI shader");
 	if (r) {
 		fprintf(stderr, "LLVM failed to compile shader\n");
 		goto out;
 	}
 
-	si_shader_dump(sscreen, shader, debug, si_shader_ctx.type);
-
-	r = si_shader_binary_upload(sscreen, shader);
-	if (r) {
-		fprintf(stderr, "LLVM failed to upload shader\n");
-		goto out;
+	radeon_llvm_dispose(&ctx.radeon_bld);
+
+	/* Calculate the number of fragment input VGPRs. */
+	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
+		shader->num_input_vgprs = 0;
+		shader->face_vgpr_index = -1;
+
+		if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 2;
+		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 2;
+		if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 2;
+		if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 3;
+		if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 2;
+		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 2;
+		if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 2;
+		if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 1;
+		if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 1;
+		if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 1;
+		if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 1;
+		if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 1;
+		if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
+			shader->face_vgpr_index = shader->num_input_vgprs;
+			shader->num_input_vgprs += 1;
+		}
+		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 1;
+		if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 1;
+		if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
+			shader->num_input_vgprs += 1;
 	}
 
-	radeon_llvm_dispose(&si_shader_ctx.radeon_bld);
-
-	if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
+	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
 		shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
 		shader->gs_copy_shader->selector = shader->selector;
-		shader->gs_copy_shader->key = shader->key;
-		si_shader_ctx.shader = shader->gs_copy_shader;
-		if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx,
-						    shader, dump, debug))) {
+		ctx.shader = shader->gs_copy_shader;
+		if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
+						    shader, debug))) {
 			free(shader->gs_copy_shader);
 			shader->gs_copy_shader = NULL;
 			goto out;
@@ -4224,18 +5018,966 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 
 out:
 	for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
-		FREE(si_shader_ctx.constants[i]);
-	if (poly_stipple)
-		tgsi_free_tokens(tokens);
+		FREE(ctx.constants[i]);
 	return r;
 }
 
-void si_shader_destroy_binary(struct radeon_shader_binary *binary)
+/**
+ * Find a cached shader part (prolog or epilog), or compile and cache a new one.
+ *
+ * \param sscreen	screen
+ * \param list		list of shader parts of the same category
+ * \param key		shader part key
+ * \param tm		LLVM target machine
+ * \param debug		debug callback
+ * \param compile	the callback responsible for compilation
+ * \return		the part, or NULL if allocation or compilation failed
+ */
+static struct si_shader_part *
+si_get_shader_part(struct si_screen *sscreen,
+		   struct si_shader_part **list,
+		   union si_shader_part_key *key,
+		   LLVMTargetMachineRef tm,
+		   struct pipe_debug_callback *debug,
+		   bool (*compile)(struct si_screen *,
+				   LLVMTargetMachineRef,
+				   struct pipe_debug_callback *,
+				   struct si_shader_part *))
+{
+	struct si_shader_part *result;
+
+	pipe_mutex_lock(sscreen->shader_parts_mutex);
+
+	/* Reuse an existing part whose key matches byte for byte. */
+	for (result = *list; result; result = result->next) {
+		if (memcmp(&result->key, key, sizeof(*key)) == 0)
+			goto out;
+	}
+
+	/* Compile a new one; the allocation can fail, so check it first. */
+	result = CALLOC_STRUCT(si_shader_part);
+	if (!result)
+		goto out;
+	result->key = *key;
+	if (!compile(sscreen, tm, debug, result)) {
+		FREE(result);
+		result = NULL;
+		goto out;
+	}
+	result->next = *list;
+	*list = result;
+out:
+	pipe_mutex_unlock(sscreen->shader_parts_mutex);
+	return result;
+}
+
+/**
+ * Create a vertex shader prolog.
+ *
+ * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
+ * All inputs are returned unmodified. The vertex load indices are
+ * stored after them, which will be used by the API VS for fetching inputs.
+ *
+ * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
+ *   input_v0,
+ *   input_v1,
+ *   input_v2,
+ *   input_v3,
+ *   (VertexID + BaseVertex),
+ *   (InstanceID + StartInstance),
+ *   (InstanceID / 2 + StartInstance)
+ *
+ * Returns true if the prolog was compiled successfully.
+ */
+static bool si_compile_vs_prolog(struct si_screen *sscreen,
+				 LLVMTargetMachineRef tm,
+				 struct pipe_debug_callback *debug,
+				 struct si_shader_part *out)
+{
+	union si_shader_part_key *key = &out->key;
+	struct si_shader shader = {};
+	struct si_shader_context ctx;
+	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+	LLVMTypeRef *params, *returns;
+	LLVMValueRef ret, func;
+	int last_sgpr, num_params = 0, num_returns = 0, i;
+	bool status = true;
+
+	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+	ctx.type = TGSI_PROCESSOR_VERTEX;
+	ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
+	ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
+
+	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
+	params = alloca((key->vs_prolog.num_input_sgprs + 4) *
+			sizeof(LLVMTypeRef));
+	returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
+			  key->vs_prolog.last_input + 1) *
+			 sizeof(LLVMTypeRef));
+
+	/* Declare input and output SGPRs. */
+	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+		params[num_params++] = ctx.i32;
+		returns[num_returns++] = ctx.i32;
+	}
+	last_sgpr = num_params - 1;
+
+	/* 4 preloaded VGPRs (outputs must be floats) */
+	for (i = 0; i < 4; i++) {
+		params[num_params++] = ctx.i32;
+		returns[num_returns++] = ctx.f32;
+	}
+
+	/* Vertex load indices. */
+	for (i = 0; i <= key->vs_prolog.last_input; i++)
+		returns[num_returns++] = ctx.f32;
+
+	/* Create the function. */
+	si_create_function(&ctx, returns, num_returns, params,
+			   num_params, -1, last_sgpr);
+	func = ctx.radeon_bld.main_fn;
+
+	/* Copy inputs to outputs. This should be no-op, as the registers match,
+	 * but it will prevent the compiler from overwriting them unintentionally.
+	 */
+	ret = ctx.return_value;
+	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+		LLVMValueRef p = LLVMGetParam(func, i);
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+	}
+	for (i = num_params - 4; i < num_params; i++) {
+		LLVMValueRef p = LLVMGetParam(func, i);
+		p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+	}
+
+	/* Compute vertex load indices from instance divisors. */
+	for (i = 0; i <= key->vs_prolog.last_input; i++) {
+		unsigned divisor = key->vs_prolog.states.instance_divisors[i];
+		LLVMValueRef index;
+
+		if (divisor) {
+			/* InstanceID / Divisor + StartInstance */
+			index = get_instance_index_for_fetch(&ctx.radeon_bld,
+							     SI_SGPR_START_INSTANCE,
+							     divisor);
+		} else {
+			/* VertexID + BaseVertex */
+			index = LLVMBuildAdd(gallivm->builder,
+					     LLVMGetParam(func, ctx.param_vertex_id),
+					     LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
+		}
+
+		index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
+		/* num_params doubles as the index of the next return slot. */
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
+					   num_params++, "");
+	}
+
+	/* Compile. */
+	LLVMBuildRet(gallivm->builder, ret);
+	radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+			    gallivm->module, debug, ctx.type,
+			    "Vertex Shader Prolog"))
+		status = false;
+
+	radeon_llvm_dispose(&ctx.radeon_bld);
+	return status;
+}
+
+/**
+ * Build and compile the vertex shader epilog, which is shared with the
+ * tessellation evaluation shader when that one is compiled as VS.
+ *
+ * The epilog takes VGPR inputs up to PrimitiveID only when the key asks
+ * for PrimitiveID to be exported as a parameter; in that case it emits
+ * one export. Otherwise it has no inputs and emits nothing.
+ * Returns true if compilation succeeded.
+ */
+static bool si_compile_vs_epilog(struct si_screen *sscreen,
+				 LLVMTargetMachineRef tm,
+				 struct pipe_debug_callback *debug,
+				 struct si_shader_part *out)
+{
+	union si_shader_part_key *key = &out->key;
+	struct si_shader_context ctx;
+	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+	LLVMTypeRef params[5];
+	int num_params = 0, idx;
+	int r;
+
+	si_init_shader_ctx(&ctx, sscreen, NULL, tm);
+	ctx.type = TGSI_PROCESSOR_VERTEX;
+
+	/* The only inputs are VGPRs, up to and including PrimitiveID. */
+	if (key->vs_epilog.states.export_prim_id)
+		num_params = VS_EPILOG_PRIMID_LOC + 1;
+	assert(num_params <= ARRAY_SIZE(params));
+
+	for (idx = 0; idx < num_params; idx++)
+		params[idx] = ctx.f32;
+
+	/* Create the function. */
+	si_create_function(&ctx, NULL, 0, params, num_params,
+			   -1, -1);
+
+	/* Export PrimitiveID as a parameter when requested. */
+	if (key->vs_epilog.states.export_prim_id) {
+		struct lp_build_context *bld = &bld_base->base;
+		struct lp_build_context *uint_bld = &bld_base->uint_bld;
+		LLVMValueRef args[9];
+
+		args[0] = lp_build_const_int32(bld->gallivm, 0x0); /* enabled channels */
+		args[1] = uint_bld->zero; /* whether the EXEC mask is valid */
+		args[2] = uint_bld->zero; /* DONE bit */
+		args[3] = lp_build_const_int32(bld->gallivm, V_008DFC_SQ_EXP_PARAM +
+					       key->vs_epilog.prim_id_param_offset);
+		args[4] = uint_bld->zero; /* COMPR flag (0 = 32-bit export) */
+		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
+				       VS_EPILOG_PRIMID_LOC); /* X */
+		args[6] = uint_bld->undef; /* Y */
+		args[7] = uint_bld->undef; /* Z */
+		args[8] = uint_bld->undef; /* W */
+
+		lp_build_intrinsic(bld->gallivm->builder, "llvm.SI.export",
+				   LLVMVoidTypeInContext(bld->gallivm->context),
+				   args, 9, 0);
+	}
+
+	/* Compile. */
+	LLVMBuildRet(gallivm->builder, ctx.return_value);
+	radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+	r = si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+			    gallivm->module, debug, ctx.type,
+			    "Vertex Shader Epilog");
+
+	radeon_llvm_dispose(&ctx.radeon_bld);
+	/* si_compile_llvm returns 0 on success. */
+	return r == 0;
+}
+
+/**
+ * Create & compile a vertex shader epilog. This is a helper used by VS and TES.
+ */
+static bool si_get_vs_epilog(struct si_screen *sscreen,
+			     LLVMTargetMachineRef tm,
+			     struct si_shader *shader,
+			     struct pipe_debug_callback *debug,
+			     struct si_vs_epilog_bits *states)
+{
+	union si_shader_part_key epilog_key;
+
+	memset(&epilog_key, 0, sizeof(epilog_key));
+	epilog_key.vs_epilog.states = *states;
+
+	/* Set up the PrimitiveID output. Test the flag through "states": TES
+	 * callers pass key.tes.epilog, which is not key.vs.epilog. */
+	if (states->export_prim_id) {
+		unsigned index = shader->selector->info.num_outputs;
+		unsigned offset = shader->nr_param_exports++;
+		epilog_key.vs_epilog.prim_id_param_offset = offset;
+		shader->vs_output_param_offset[index] = offset;
+	}
+
+	shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
+					    &epilog_key, tm, debug,
+					    si_compile_vs_epilog);
+	return shader->epilog != NULL;
+}
+
+/**
+ * Attach the prolog and epilog parts (compiled or reused) to a vertex shader.
+ */
+static bool si_shader_select_vs_parts(struct si_screen *sscreen,
+				      LLVMTargetMachineRef tm,
+				      struct si_shader *shader,
+				      struct pipe_debug_callback *debug)
+{
+	struct tgsi_shader_info *vs_info = &shader->selector->info;
+	union si_shader_part_key prolog_key;
+	unsigned input;
+
+	/* Describe the prolog this variant needs. */
+	memset(&prolog_key, 0, sizeof(prolog_key));
+	prolog_key.vs_prolog.states = shader->key.vs.prolog;
+	prolog_key.vs_prolog.num_input_sgprs = shader->num_input_sgprs;
+	prolog_key.vs_prolog.last_input = MAX2(1, vs_info->num_inputs) - 1;
+
+	/* Without inputs the prolog would do nothing, so none is attached. */
+	if (vs_info->num_inputs != 0) {
+		shader->prolog =
+			si_get_shader_part(sscreen, &sscreen->vs_prologs,
+					   &prolog_key, tm, debug,
+					   si_compile_vs_prolog);
+		if (shader->prolog == NULL)
+			return false;
+	}
+
+	/* Only a VS that is neither ES nor LS gets the VS epilog. */
+	if (!(shader->key.vs.as_es || shader->key.vs.as_ls)) {
+		if (!si_get_vs_epilog(sscreen, tm, shader, debug,
+				      &shader->key.vs.epilog))
+			return false;
+	}
+
+	/* A non-zero instance divisor on any input marks InstanceID as used. */
+	for (input = 0; input < vs_info->num_inputs; input++)
+		if (prolog_key.vs_prolog.states.instance_divisors[input])
+			shader->uses_instanceid = true;
+	return true;
+}
+
+/**
+ * Attach the epilog to a TES; a TES compiled as ES has no parts to select.
+ */
+static bool si_shader_select_tes_parts(struct si_screen *sscreen,
+				       LLVMTargetMachineRef tm,
+				       struct si_shader *shader,
+				       struct pipe_debug_callback *debug)
+{
+	/* Only a TES compiled as VS needs the VS epilog. */
+	if (!shader->key.tes.as_es)
+		return si_get_vs_epilog(sscreen, tm, shader, debug,
+					&shader->key.tes.epilog);
+
+	return true;
+}
+
+/**
+ * Build and compile the TCS epilog, which stores the tessellation factors to
+ * memory according to the tessellator's primitive type (chosen by the TES).
+ */
+static bool si_compile_tcs_epilog(struct si_screen *sscreen,
+				  LLVMTargetMachineRef tm,
+				  struct pipe_debug_callback *debug,
+				  struct si_shader_part *out)
+{
+	union si_shader_part_key *key = &out->key;
+	struct si_shader shader = {};
+	struct si_shader_context ctx;
+	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+	LLVMTypeRef params[16];
+	LLVMValueRef main_fn, rel_patch_id, invocation_id, tf_lds_offset;
+	int last_array_pointer, last_sgpr, num_params;
+	int r;
+
+	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+	ctx.type = TGSI_PROCESSOR_TESS_CTRL;
+	shader.key.tcs.epilog = key->tcs_epilog.states;
+
+	/* SGPR inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
+	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
+	last_array_pointer = SI_PARAM_RW_BUFFERS;
+	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
+	params[SI_PARAM_SAMPLERS] = ctx.i64;
+	params[SI_PARAM_UNUSED] = ctx.i64;
+	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
+	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
+	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
+	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
+	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
+	num_params = last_sgpr + 1;
+
+	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
+	params[num_params++] = ctx.i32; /* invocation ID within the patch */
+	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */
+
+	/* Create the function. */
+	si_create_function(&ctx, NULL, 0, params, num_params,
+			   last_array_pointer, last_sgpr);
+	declare_tess_lds(&ctx);
+	main_fn = ctx.radeon_bld.main_fn;
+
+	rel_patch_id = LLVMGetParam(main_fn, last_sgpr + 1);
+	invocation_id = LLVMGetParam(main_fn, last_sgpr + 2);
+	tf_lds_offset = LLVMGetParam(main_fn, last_sgpr + 3);
+	si_write_tess_factors(bld_base, rel_patch_id, invocation_id,
+			      tf_lds_offset);
+
+	/* Compile. */
+	LLVMBuildRet(gallivm->builder, ctx.return_value);
+	radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+	r = si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+			    gallivm->module, debug, ctx.type,
+			    "Tessellation Control Shader Epilog");
+
+	radeon_llvm_dispose(&ctx.radeon_bld);
+	return r == 0;
+}
+
+/**
+ * Attach the epilog part (compiled or reused) to a tessellation control shader.
+ */
+static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
+				       LLVMTargetMachineRef tm,
+				       struct si_shader *shader,
+				       struct pipe_debug_callback *debug)
+{
+	union si_shader_part_key key;
+	struct si_shader_part *part;
+
+	/* The key is zeroed first so that lookups can compare it bytewise. */
+	memset(&key, 0, sizeof(key));
+	key.tcs_epilog.states = shader->key.tcs.epilog;
+	part = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, &key, tm,
+				  debug, si_compile_tcs_epilog);
+	shader->epilog = part;
+	return part != NULL;
+}
+
+/**
+ * Compile the pixel shader prolog. This handles:
+ * - two-side color selection and interpolation
+ * - overriding interpolation parameters for the API PS
+ * - polygon stippling
+ *
+ * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
+ * overridden by other states. (e.g. per-sample interpolation)
+ * Interpolated colors are stored after the preloaded VGPRs.
+ *
+ * The variant to build is described by out->key.ps_prolog. The compiled
+ * code and its config are stored in out->binary and out->config.
+ *
+ * Returns true on success, false if si_compile_llvm returns non-zero.
+ */
+static bool si_compile_ps_prolog(struct si_screen *sscreen,
+				 LLVMTargetMachineRef tm,
+				 struct pipe_debug_callback *debug,
+				 struct si_shader_part *out)
+{
+	union si_shader_part_key *key = &out->key;
+	struct si_shader shader = {};
+	struct si_shader_context ctx;
+	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+	LLVMTypeRef *params;
+	LLVMValueRef ret, func;
+	int last_sgpr, num_params, num_returns, i, num_color_channels;
+	bool status = true;
+
+	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+	ctx.type = TGSI_PROCESSOR_FRAGMENT;
+	shader.key.ps.prolog = key->ps_prolog.states;
+
+	/* Number of inputs + 8 color elements.
+	 * (2 colors * 4 channels is the most the color loop below can append.)
+	 */
+	params = alloca((key->ps_prolog.num_input_sgprs +
+			 key->ps_prolog.num_input_vgprs + 8) *
+			sizeof(LLVMTypeRef));
+
+	/* Declare inputs: all SGPRs first (as i32), then all VGPRs (as f32). */
+	num_params = 0;
+	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
+		params[num_params++] = ctx.i32;
+	last_sgpr = num_params - 1; /* -1 when there are no input SGPRs */
+
+	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
+		params[num_params++] = ctx.f32;
+
+	/* Declare outputs (same as inputs + add colors if needed).
+	 * The same array is reused: the return list is the input list
+	 * followed by one f32 per bit set in colors_read.
+	 */
+	num_returns = num_params;
+	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
+	for (i = 0; i < num_color_channels; i++)
+		params[num_returns++] = ctx.f32;
+
+	/* Create the function. */
+	si_create_function(&ctx, params, num_returns, params,
+			   num_params, -1, last_sgpr);
+	func = ctx.radeon_bld.main_fn;
+
+	/* Copy inputs to outputs. This should be no-op, as the registers match,
+	 * but it will prevent the compiler from overwriting them unintentionally.
+	 */
+	ret = ctx.return_value;
+	for (i = 0; i < num_params; i++) {
+		LLVMValueRef p = LLVMGetParam(func, i);
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+	}
+
+	/* Polygon stippling. */
+	if (key->ps_prolog.states.poly_stipple) {
+		/* POS_FIXED_PT is always last. */
+		unsigned pos = key->ps_prolog.num_input_sgprs +
+			       key->ps_prolog.num_input_vgprs - 1;
+		LLVMValueRef ptr[2], views;
+
+		/* Get the pointer to sampler views: the two parameters at
+		 * SI_SGPR_SAMPLERS are gathered, reinterpreted as one i64 and
+		 * converted to a pointer.
+		 */
+		ptr[0] = LLVMGetParam(func, SI_SGPR_SAMPLERS);
+		ptr[1] = LLVMGetParam(func, SI_SGPR_SAMPLERS+1);
+		views = lp_build_gather_values(gallivm, ptr, 2);
+		views = LLVMBuildBitCast(gallivm->builder, views, ctx.i64, "");
+		views = LLVMBuildIntToPtr(gallivm->builder, views,
+					  const_array(ctx.v8i32, SI_NUM_SAMPLERS), "");
+
+		si_llvm_emit_polygon_stipple(&ctx, views, pos);
+	}
+
+	/* Interpolate colors. Iteration i handles color i, whose channel
+	 * mask is bits [4*i, 4*i+3] of colors_read.
+	 */
+	for (i = 0; i < 2; i++) {
+		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
+		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
+				     key->ps_prolog.face_vgpr_index;
+		LLVMValueRef interp[2], color[4];
+		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
+
+		if (!writemask)
+			continue;
+
+		/* If the interpolation qualifier is not CONSTANT (-1), build
+		 * interp_ij from two consecutive VGPRs. Otherwise interp_ij
+		 * stays NULL.
+		 */
+		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
+			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
+					       key->ps_prolog.color_interp_vgpr_index[i];
+
+			interp[0] = LLVMGetParam(func, interp_vgpr);
+			interp[1] = LLVMGetParam(func, interp_vgpr + 1);
+			interp_ij = lp_build_gather_values(gallivm, interp, 2);
+			interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
+						     ctx.v2i32, "");
+		}
+
+		/* Use the absolute location of the input. */
+		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
+
+		if (key->ps_prolog.states.color_two_side) {
+			face = LLVMGetParam(func, face_vgpr);
+			face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
+		}
+
+		interp_fs_input(&ctx,
+				key->ps_prolog.color_attr_index[i],
+				TGSI_SEMANTIC_COLOR, i,
+				key->ps_prolog.num_interp_inputs,
+				key->ps_prolog.colors_read, interp_ij,
+				prim_mask, face, color);
+
+		while (writemask) { /* num_params is reused as the next return slot */
+			unsigned chan = u_bit_scan(&writemask);
+			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
+						   num_params++, "");
+		}
+	}
+
+	/* Force per-sample interpolation: the returned CENTER and CENTROID
+	 * values are replaced with the SAMPLE ones. Offsets are relative to
+	 * the first VGPR parameter (base).
+	 */
+	if (key->ps_prolog.states.force_persample_interp) {
+		unsigned i, base = key->ps_prolog.num_input_sgprs; /* this i shadows the outer i */
+		LLVMValueRef persp_sample[2], linear_sample[2];
+
+		/* Read PERSP_SAMPLE. */
+		for (i = 0; i < 2; i++)
+			persp_sample[i] = LLVMGetParam(func, base + i);
+		/* Overwrite PERSP_CENTER. */
+		for (i = 0; i < 2; i++)
+			ret = LLVMBuildInsertValue(gallivm->builder, ret,
+						   persp_sample[i], base + 2 + i, "");
+		/* Overwrite PERSP_CENTROID. */
+		for (i = 0; i < 2; i++)
+			ret = LLVMBuildInsertValue(gallivm->builder, ret,
+						   persp_sample[i], base + 4 + i, "");
+		/* Read LINEAR_SAMPLE. */
+		for (i = 0; i < 2; i++)
+			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
+		/* Overwrite LINEAR_CENTER. */
+		for (i = 0; i < 2; i++)
+			ret = LLVMBuildInsertValue(gallivm->builder, ret,
+						   linear_sample[i], base + 8 + i, "");
+		/* Overwrite LINEAR_CENTROID. */
+		for (i = 0; i < 2; i++)
+			ret = LLVMBuildInsertValue(gallivm->builder, ret,
+						   linear_sample[i], base + 10 + i, "");
+	}
+
+	/* Compile: return the assembled value, finalize the module and
+	 * hand it to si_compile_llvm.
+	 */
+	LLVMBuildRet(gallivm->builder, ret);
+	radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+			    gallivm->module, debug, ctx.type,
+			    "Fragment Shader Prolog"))
+		status = false;
+
+	radeon_llvm_dispose(&ctx.radeon_bld);
+	return status;
+}
+
+/**
+ * Compile the pixel shader epilog. This handles everything that must be
+ * emulated for pixel shader exports. (alpha-test, format conversions, etc)
+ *
+ * The variant to build is described by out->key.ps_epilog. The compiled
+ * code and its config are stored in out->binary and out->config.
+ *
+ * Returns true on success, false if si_compile_llvm returns non-zero.
+ */
+static bool si_compile_ps_epilog(struct si_screen *sscreen,
+				 LLVMTargetMachineRef tm,
+				 struct pipe_debug_callback *debug,
+				 struct si_shader_part *out)
+{
+	union si_shader_part_key *key = &out->key;
+	struct si_shader shader = {};
+	struct si_shader_context ctx;
+	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+	LLVMTypeRef params[16+8*4+3]; /* capacity is checked by the assert below */
+	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+	int last_array_pointer, last_sgpr, num_params, i;
+	bool status = true;
+
+	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+	ctx.type = TGSI_PROCESSOR_FRAGMENT;
+	shader.key.ps.epilog = key->ps_epilog.states;
+
+	/* Declare input SGPRs. ALPHA_REF is the last one. */
+	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
+	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
+	params[SI_PARAM_SAMPLERS] = ctx.i64;
+	params[SI_PARAM_UNUSED] = ctx.i64;
+	params[SI_PARAM_ALPHA_REF] = ctx.f32;
+	last_array_pointer = -1;
+	last_sgpr = SI_PARAM_ALPHA_REF;
+
+	/* Declare input VGPRs: 4 per written color, then one each for Z,
+	 * stencil and sample mask if they are written. The count is raised
+	 * if needed so that the parameter at VGPR offset
+	 * PS_EPILOG_SAMPLEMASK_MIN_LOC always exists.
+	 */
+	num_params = (last_sgpr + 1) +
+		     util_bitcount(key->ps_epilog.colors_written) * 4 +
+		     key->ps_epilog.writes_z +
+		     key->ps_epilog.writes_stencil +
+		     key->ps_epilog.writes_samplemask;
+
+	num_params = MAX2(num_params,
+			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+	assert(num_params <= ARRAY_SIZE(params));
+
+	for (i = last_sgpr + 1; i < num_params; i++)
+		params[i] = ctx.f32;
+
+	/* Create the function. */
+	si_create_function(&ctx, NULL, 0, params, num_params,
+			   last_array_pointer, last_sgpr);
+	/* Disable elimination of unused inputs. */
+	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
+				  "InitialPSInputAddr", 0xffffff);
+
+	/* Process colors. vgpr walks the VGPR parameters in order. */
+	unsigned vgpr = last_sgpr + 1;
+	unsigned colors_written = key->ps_epilog.colors_written;
+	int last_color_export = -1;
+
+	/* Find the last color export. This is only done when there is no
+	 * Z, stencil or sample mask output; otherwise last_color_export
+	 * stays -1.
+	 */
+	if (!key->ps_epilog.writes_z &&
+	    !key->ps_epilog.writes_stencil &&
+	    !key->ps_epilog.writes_samplemask) {
+		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
+
+		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
+			/* Just set this if any of the colorbuffers are enabled. */
+			if (spi_format &
+			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
+				last_color_export = 0;
+		} else {
+			for (i = 0; i < 8; i++)
+				if (colors_written & (1 << i) &&
+				    (spi_format >> (i * 4)) & 0xf)
+					last_color_export = i;
+		}
+	}
+
+	while (colors_written) {
+		LLVMValueRef color[4];
+		int mrt = u_bit_scan(&colors_written);
+
+		for (i = 0; i < 4; i++)
+			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+
+		si_export_mrt_color(bld_base, color, mrt,
+				    num_params - 1, /* index of the last parameter */
+				    mrt == last_color_export);
+	}
+
+	/* Process depth, stencil, samplemask. If none of them is written and
+	 * no color export was selected as the last one, emit a null export.
+	 */
+	if (key->ps_epilog.writes_z)
+		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+	if (key->ps_epilog.writes_stencil)
+		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+	if (key->ps_epilog.writes_samplemask)
+		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+
+	if (depth || stencil || samplemask)
+		si_export_mrt_z(bld_base, depth, stencil, samplemask);
+	else if (last_color_export == -1)
+		si_export_null(bld_base);
+
+	/* Compile: the epilog returns nothing, so emit a void return,
+	 * finalize the module and hand it to si_compile_llvm.
+	 */
+	LLVMBuildRetVoid(gallivm->builder);
+	radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+			    gallivm->module, debug, ctx.type,
+			    "Fragment Shader Epilog"))
+		status = false;
+
+	radeon_llvm_dispose(&ctx.radeon_bld);
+	return status;
+}
+
+/**
+ * Select and compile (or reuse) pixel shader parts (prolog & epilog).
+ *
+ * Besides setting shader->prolog (only when one is needed) and
+ * shader->epilog, this also adjusts shader->config.spi_ps_input_ena.
+ *
+ * Returns false if a requested part could not be obtained, true otherwise.
+ */
+static bool si_shader_select_ps_parts(struct si_screen *sscreen,
+				      LLVMTargetMachineRef tm,
+				      struct si_shader *shader,
+				      struct pipe_debug_callback *debug)
+{
+	struct tgsi_shader_info *info = &shader->selector->info;
+	union si_shader_part_key prolog_key;
+	union si_shader_part_key epilog_key;
+	unsigned i;
+
+	/* Get the prolog. The key is zeroed first and then filled in. */
+	memset(&prolog_key, 0, sizeof(prolog_key));
+	prolog_key.ps_prolog.states = shader->key.ps.prolog;
+	prolog_key.ps_prolog.colors_read = info->colors_read;
+	prolog_key.ps_prolog.num_input_sgprs = shader->num_input_sgprs;
+	prolog_key.ps_prolog.num_input_vgprs = shader->num_input_vgprs;
+
+	if (info->colors_read) {
+		unsigned *color = shader->selector->color_attr_index;
+
+		if (shader->key.ps.prolog.color_two_side) {
+			/* BCOLORs are stored after the last input. */
+			prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
+			prolog_key.ps_prolog.face_vgpr_index = shader->face_vgpr_index;
+			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
+		}
+
+		for (i = 0; i < 2; i++) { /* one iteration per color input */
+			unsigned location = info->input_interpolate_loc[color[i]];
+
+			if (!(info->colors_read & (0xf << i*4)))
+				continue;
+
+			prolog_key.ps_prolog.color_attr_index[i] = color[i];
+
+			/* Force per-sample interpolation for the colors here.
+			 *
+			 * The switch further below stores in
+			 * color_interp_vgpr_index the VGPR offset chosen for
+			 * the interpolation mode and location (-1 for
+			 * constant) and enables the corresponding input in
+			 * spi_ps_input_ena.
+			 */
+			if (shader->key.ps.prolog.force_persample_interp)
+				location = TGSI_INTERPOLATE_LOC_SAMPLE;
+
+			switch (info->input_interpolate[color[i]]) {
+			case TGSI_INTERPOLATE_CONSTANT:
+				prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
+				break;
+			case TGSI_INTERPOLATE_PERSPECTIVE:
+			case TGSI_INTERPOLATE_COLOR:
+				switch (location) {
+				case TGSI_INTERPOLATE_LOC_SAMPLE:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_PERSP_SAMPLE_ENA(1);
+					break;
+				case TGSI_INTERPOLATE_LOC_CENTER:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_PERSP_CENTER_ENA(1);
+					break;
+				case TGSI_INTERPOLATE_LOC_CENTROID:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_PERSP_CENTROID_ENA(1);
+					break;
+				default:
+					assert(0);
+				}
+				break;
+			case TGSI_INTERPOLATE_LINEAR:
+				switch (location) {
+				case TGSI_INTERPOLATE_LOC_SAMPLE:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_LINEAR_SAMPLE_ENA(1);
+					break;
+				case TGSI_INTERPOLATE_LOC_CENTER:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_LINEAR_CENTER_ENA(1);
+					break;
+				case TGSI_INTERPOLATE_LOC_CENTROID:
+					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
+					shader->config.spi_ps_input_ena |=
+						S_0286CC_LINEAR_CENTROID_ENA(1);
+					break;
+				default:
+					assert(0);
+				}
+				break;
+			default:
+				assert(0);
+			}
+		}
+	}
+
+	/* The prolog is a no-op if these aren't set, so shader->prolog is
+	 * left untouched in that case.
+	 */
+	if (prolog_key.ps_prolog.colors_read ||
+	    prolog_key.ps_prolog.states.force_persample_interp ||
+	    prolog_key.ps_prolog.states.poly_stipple) {
+		shader->prolog =
+			si_get_shader_part(sscreen, &sscreen->ps_prologs,
+					   &prolog_key, tm, debug,
+					   si_compile_ps_prolog);
+		if (!shader->prolog)
+			return false;
+	}
+
+	/* Get the epilog. Unlike the prolog, it is always requested. */
+	memset(&epilog_key, 0, sizeof(epilog_key));
+	epilog_key.ps_epilog.colors_written = info->colors_written;
+	epilog_key.ps_epilog.writes_z = info->writes_z;
+	epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
+	epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
+	epilog_key.ps_epilog.states = shader->key.ps.epilog;
+
+	shader->epilog =
+		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
+				   &epilog_key, tm, debug,
+				   si_compile_ps_epilog);
+	if (!shader->epilog)
+		return false;
+
+	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
+	if (shader->key.ps.prolog.poly_stipple) {
+		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
+		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
+	}
+
+	/* Set up the enable bits for per-sample shading if needed: enabled
+	 * CENTER/CENTROID inputs are turned off and SAMPLE is turned on.
+	 */
+	if (shader->key.ps.prolog.force_persample_interp) {
+		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+		    G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
+			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
+			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
+			shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
+		}
+		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+		    G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
+			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
+			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
+			shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
+		}
+	}
+
+	/* POS_W_FLOAT requires that one of the perspective weights is enabled. */
+	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
+	    !(shader->config.spi_ps_input_ena & 0xf)) {
+		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
+		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
+	}
+
+	/* At least one pair of interpolation weights must be enabled. */
+	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
+		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
+		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
+	}
+
+	/* The sample mask input is always enabled, because the API shader always
+	 * passes it through to the epilog. Disable it here if it's unused.
+	 */
+	if (!shader->key.ps.epilog.poly_line_smoothing &&
+	    !shader->selector->info.reads_samplemask)
+		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
+
+	return true;
+}
+
+int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
+		     struct si_shader *shader,
+		     struct pipe_debug_callback *debug)
 {
-	FREE(binary->code);
-	FREE(binary->rodata);
-	FREE(binary->relocs);
-	FREE(binary->disasm_string);
+	struct si_shader *mainp = shader->selector->main_shader_part;
+	int r;
+
+	/* Create a shader variant and upload it: either compile the whole
+	 * shader monolithically, or reuse the selector's main shader part
+	 * and attach a prolog and/or epilog to it.
+	 *
+	 * Returns 0 on success, -1 if selecting the parts fails, or the
+	 * non-zero result of si_compile_tgsi_shader or
+	 * si_shader_binary_upload.
+	 *
+	 * LS and ES are always compiled on demand.
+	 */
+	if (!mainp ||
+	    (shader->selector->type == PIPE_SHADER_VERTEX &&
+	     (shader->key.vs.as_es || shader->key.vs.as_ls)) ||
+	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
+	     shader->key.tes.as_es)) {
+		/* Monolithic shader (compiled as a whole, has many variants,
+		 * may take a long time to compile).
+		 */
+		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
+		if (r)
+			return r;
+	} else {
+		/* The shader consists of 2-3 parts:
+		 *
+		 * - the middle part is the user shader, it has 1 variant only
+		 *   and it was compiled during the creation of the shader
+		 *   selector
+		 * - the prolog part is inserted at the beginning
+		 * - the epilog part is inserted at the end
+		 *
+		 * The prolog and epilog have many (but simple) variants.
+		 */
+
+		/* Copy the compiled TGSI shader data over. The binary is
+		 * shared with the main part rather than duplicated, which
+		 * is_binary_shared records.
+		 */
+		shader->is_binary_shared = true;
+		shader->binary = mainp->binary;
+		shader->config = mainp->config;
+		shader->num_input_sgprs = mainp->num_input_sgprs;
+		shader->num_input_vgprs = mainp->num_input_vgprs;
+		shader->face_vgpr_index = mainp->face_vgpr_index;
+		memcpy(shader->vs_output_param_offset,
+		       mainp->vs_output_param_offset,
+		       sizeof(mainp->vs_output_param_offset));
+		shader->uses_instanceid = mainp->uses_instanceid;
+		shader->nr_pos_exports = mainp->nr_pos_exports;
+		shader->nr_param_exports = mainp->nr_param_exports;
+
+		/* Select prologs and/or epilogs. */
+		switch (shader->selector->type) { /* types not listed select no parts */
+		case PIPE_SHADER_VERTEX:
+			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
+				return -1;
+			break;
+		case PIPE_SHADER_TESS_CTRL:
+			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
+				return -1;
+			break;
+		case PIPE_SHADER_TESS_EVAL:
+			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
+				return -1;
+			break;
+		case PIPE_SHADER_FRAGMENT:
+			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
+				return -1;
+
+			/* Make sure we have at least as many VGPRs as there
+			 * are allocated inputs.
+			 */
+			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+							shader->num_input_vgprs);
+			break;
+		}
+
+		/* Update SGPR and VGPR counts: the combined shader uses the
+		 * maximum over the main part and any attached prolog/epilog.
+		 */
+		if (shader->prolog) {
+			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
+							shader->prolog->config.num_sgprs);
+			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+							shader->prolog->config.num_vgprs);
+		}
+		if (shader->epilog) {
+			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
+							shader->epilog->config.num_sgprs);
+			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+							shader->epilog->config.num_vgprs);
+		}
+	}
+
+	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor);
+
+	/* Upload. A failure is reported on stderr and returned to the caller. */
+	r = si_shader_binary_upload(sscreen, shader);
+	if (r) {
+		fprintf(stderr, "LLVM failed to upload shader\n");
+		return r;
+	}
+
+	return 0;
 }
 
 void si_shader_destroy(struct si_shader *shader)
@@ -4249,5 +5991,7 @@ void si_shader_destroy(struct si_shader *shader)
 		r600_resource_reference(&shader->scratch_bo, NULL);
 
 	r600_resource_reference(&shader->bo, NULL);
-	si_shader_destroy_binary(&shader->binary);
+
+	if (!shader->is_binary_shared)
+		radeon_shader_binary_clean(&shader->binary);
 }